diff --git a/.github/workflows/clang-format-check-cn.yml b/.github/workflows/clang-format-check-cn.yml deleted file mode 100644 index 74c03301ff2..00000000000 --- a/.github/workflows/clang-format-check-cn.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: clang-format-cn Check -on: workflow_dispatch -jobs: - formatting-check: - name: Formatting Check - runs-on: ubuntu-latest - strategy: - matrix: - path: - - 'prov/opx' - steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - - name: Run clang-format style check for C/C++/Protobuf programs (Cornelis Networks-specific). - uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 - with: - clang-format-version: '15' - check-path: ${{ matrix.path }} diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 3e3eb43755d..c566599ccbe 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -11,9 +11,9 @@ jobs: path: - 'prov/sm2' steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run clang-format style check for C/C++/Protobuf programs. - uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 + uses: jidicula/clang-format-action@d05cecd4a1a5b7e64c22f5a468456135a43f13f6 # v4.14.0 with: clang-format-version: '15' check-path: ${{ matrix.path }} diff --git a/.github/workflows/cn.yml b/.github/workflows/cn.yml new file mode 100644 index 00000000000..c54d8b82af6 --- /dev/null +++ b/.github/workflows/cn.yml @@ -0,0 +1,19 @@ +name: 'Cornelis' + +on: + workflow_dispatch: + pull_request: + types: + - opened + - reopened + - synchronize + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + opx-ci: + name: CI + if: ${{ github.repository == 'cornelisnetworks/libfabric-internal' }} + uses: cornelisnetworks/libfabric-devel/.github/workflows/cn.yml@master diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 21260a33bc0..5979537be95 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -48,11 +48,11 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/init@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/autobuild@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/analyze@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 73f34bb228a..a448a906d11 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -52,7 +52,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Download Coverity tools run: | wget https://scan.coverity.com/download/linux64 --post-data "token=${{ secrets.COVERITY_SCAN_TOKEN }}&project=ofiwg%2Flibfabric" -O coverity_tool.tgz @@ -94,7 +94,7 @@ jobs: --form description="`$PWD/install/bin/fi_info -l`" \ https://scan.coverity.com/builds?project=ofiwg%2Flibfabric - name: Upload build logs - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: coverity-build-log.txt path: cov-int/build-log.txt diff --git a/.github/workflows/gh-man.yaml b/.github/workflows/gh-man.yaml index 5400ec94dd2..44ad72bb2ca 100644 --- a/.github/workflows/gh-man.yaml +++ b/.github/workflows/gh-man.yaml @@ -25,7 +25,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Update the man pages in branch gh-pages run: .github/workflows/gh-man.sh diff --git a/.github/workflows/nroff-elves.yaml b/.github/workflows/nroff-elves.yaml index 669a06bb4dd..d8dea720789 100644 --- a/.github/workflows/nroff-elves.yaml +++ b/.github/workflows/nroff-elves.yaml @@ -23,7 +23,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Get the required packages run: sudo apt install -y pandoc diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 5d3e3b5f886..bf325eee07c 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -56,7 +56,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Build Check run: | set -x @@ -70,7 +70,7 @@ jobs: $PWD/install/bin/fi_info -l - name: Upload build logs if: failure() - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: ${{ matrix.os }}-${{ matrix.cc }}-config.log path: config.log @@ -96,7 +96,7 @@ jobs: sudo apt-add-repository 'deb [arch=amd64] https://repositories.intel.com/graphics/ubuntu focal main' sudo apt-get update sudo apt-get install -y level-zero level-zero-dev - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: HMEM Checks run: | set -x @@ -115,18 +115,18 @@ jobs: $PWD/install/bin/fi_info -c FI_HMEM - name: Upload build logs if: failure() - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: hmem-config.log path: config.log macos: - runs-on: macos-12 + runs-on: macos-13 steps: - name: Install dependencies (Mac OS) run: | brew install automake - brew install libtool - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + brew install --quiet libtool + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Build Check run: | ./autogen.sh @@ -139,7 +139,7 @@ jobs: make -j2 - name: Upload build logs if: failure() - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: - name: macos-12-config.log + name: macos-config.log path: config.log diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index aee05e7af95..8fa7a945e07 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -33,7 +33,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: persist-credentials: false @@ -60,7 +60,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: SARIF file path: results.sarif @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/upload-sarif@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 with: sarif_file: results.sarif diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 9f8db3dcdc7..4c169cb8f64 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,7 @@ jobs: pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} days-before-stale: 360 diff --git a/AUTHORS b/AUTHORS index 6efa2e1831c..c8068cc5361 100644 --- a/AUTHORS +++ b/AUTHORS @@ -58,6 +58,7 @@ Chenwei Zhang Chien Tin Tung Chris Dolan Chris Taylor +Chuck Fossen Chuck Fossen Chuck Fossen Cody Mann @@ -76,6 +77,7 @@ Dmitry Durnov Dmitry Gladkov Doug Oucharek Edgar Gabriel +Elias Kozah Elias Kozah Elias Kozah Eric Raut @@ -108,6 +110,7 @@ Ignacio Hernandez Ira Weiny Itai Masuari iziemba <57813515+iziemba@users.noreply.github.com> +Jack Morrison Jaime Arteaga James Dinan James Shimek @@ -138,6 +141,7 @@ Joe Nemeth Johannes Ziegenbalg John Biddiscombe John Byrne +John Heemstra Jonathan Behrens Jorge Cabrera jose @@ -188,6 +192,7 @@ Neil Spruit Nicholas Sielicki Nicolas Morey-Chaisemartin Nikhil Nanal +nikhil nanal nikhilnanal nikhilnanal nikhilnanal @@ -197,6 +202,7 @@ Noam Beer Oblomov, Sergey Oblomov, Sergey OFIWG Bot +Olga Weiss Olivier Serres orbea Paolo Inaudi @@ -227,6 +233,7 @@ Robert Wespetal Rohit Zambre Ryan Hankins Ryan Hankins +Ryan Hankins Rémi Dehenne Sai Sunku Sannikov, Alexander @@ -283,6 +290,7 @@ Wenduo Wang wenduwan wenduwan Wesley Bland +wey William Zhang Xuezhao Liu Xuyang Wang @@ -298,4 +306,5 @@ Zach Tiffany zdworkin Zhaojuan Guo zhngaj +Zhuo Zhi ztaylor diff --git a/Makefile.am b/Makefile.am index 00242c7d65e..f91f3d1f265 100644 --- a/Makefile.am +++ b/Makefile.am @@ -91,6 +91,9 @@ common_srcs = \ prov/util/src/rocr_ipc_monitor.c \ prov/util/src/ze_ipc_monitor.c \ prov/util/src/xpmem_monitor.c \ + prov/util/src/kdreg2_mem_monitor.c \ + prov/util/src/uffd_mem_monitor.c \ + prov/util/src/import_mem_monitor.c \ prov/util/src/util_profile.c \ prov/coll/src/coll_attr.c \ prov/coll/src/coll_av.c \ @@ -222,7 +225,7 @@ src_libfabric_la_LIBADD = src_libfabric_la_DEPENDENCIES = libfabric.map if !EMBEDDED -src_libfabric_la_LDFLAGS += -version-info 2:0:0 +src_libfabric_la_LDFLAGS += -version-info 27:0:26 endif src_libfabric_la_LDFLAGS += -export-dynamic \ $(libfabric_version_script) @@ -450,9 +453,6 @@ dist-hook: libfabric.spec cp libfabric.spec $(distdir) perl $(top_srcdir)/config/distscript.pl "$(distdir)" "$(PACKAGE_VERSION)" -install-exec-hook: - ln -sf libfabric.so.2 $(DESTDIR)$(libdir)/libfabric.so.1 - TESTS = \ util/fi_info @@ -484,6 +484,7 @@ include prov/sm2/Makefile.include include prov/tcp/Makefile.include include prov/ucx/Makefile.include include prov/lpp/Makefile.include +include prov/lnx/Makefile.include include prov/hook/Makefile.include include prov/hook/perf/Makefile.include include prov/hook/trace/Makefile.include diff --git a/NEWS.md b/NEWS.md index c257d7b5dbc..0d766534d3d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,285 @@ bug fixes (and other actions) for each version of Libfabric since version 1.0. New major releases include all fixes from minor releases with earlier release dates. 
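The Makefile.am hunk above moves the libtool flag from `-version-info 2:0:0` to `-version-info 27:0:26` and drops the install-exec-hook that created a `libfabric.so.1` symlink by hand. A minimal sketch of why the hook is no longer needed, assuming only libtool's standard Linux naming rule (soname major = current - age); the rule is general libtool behavior, not code from this patch:

```c
/* Sketch: libtool's Linux naming for -version-info current:revision:age.
 * Assumes the standard libtool rule; illustration only. */
#include <stdio.h>

int main(void)
{
	int current = 27, revision = 0, age = 26;

	/* soname major is current - age: 27 - 26 = 1 */
	printf("soname:    libfabric.so.%d\n", current - age);
	/* installed real file is lib<name>.so.<c-a>.<a>.<r> */
	printf("real file: libfabric.so.%d.%d.%d\n",
	       current - age, age, revision);
	return 0;
}
```

With the old `2:0:0` the soname was `libfabric.so.2`, so the removed hook had to link `libfabric.so.1` to it for ABI compatibility; `27:0:26` makes libtool emit the `.so.1` soname itself.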
+v2.0.0, Fri Dec 13, 2024 +======================== + +## Core + +- xpmem: Cleanup xpmem before monitors +- Remove redundant windows.h +- hmem/cuda: Add env variable to enable/disable CUDA DMABUF +- Update ofi_vrb_speed + +## CXI + +- Add FI_OPT_CUDA_API_PERMITTED tests +- Define FI_CXI_FORCE_DEV_REG_COPY +- Support FI_OPT_CUDA_API_PERMITTED +- Testing FI_RM_ENABLED +- Correct checking of MR test rc +- Update unit test for collectives +- Add test for invalid client RKEY +- Fix broken client key check +- Ignore FLT_OVERFLOW and FLT_INVALID errors +- Update CXI man page. +- Enable dmabuf for ROCR by default. +- Remove disable_dmabuf_cuda and disable_dmabuf_rocr +- Disable use of dmabuf by default for cuda +- Remove use of deprecated FI_ORDER_NONE +- Report RMA order used in debug output +- Remove srx unittests +- Add FI_PEER capability bit +- Support shared receive queues +- Implement shared Completion Queues + +## EFA + +- Add tracepoints for rma operations +- Adjust the location of tracepoint +- Implement the rma interface +- Fix efa_msg flags +- Remove efa_send_wr, send_wr_pool and recv_wr_pool from dgram_ep +- Fix the read_bad_recv_status unit test +- Implement efa_msg interface +- Implement FI_MORE for fi_recv in zero copy recv mode +- Fix the error path of zero copy recv +- Move inject sizes from rdm ep to base ep +- Fix the ep list scan in cq/cntr read +- Fix the error handling for unsolicited recv +- Fall back to zero sl when non-zero sl qp creation failed +- Disable zero copy receive if p2p is not available +- Initialize efa fork support in EFA_INI +- Update efa_hmem and efa_fork_support log to FI_LOG_CORE +- Make efa_hmem_info a global variable +- Set max rma order size correctly + +## Hook + +- Fix the preprocessor + +## LNX + +- Fix av strncpy +- Fix various issues with initial commit + +## SHM + +- Cleanup op flags + +## Sockets + +- Fixed coverity issue for unchecked return value.
+ +## Util + +- Set srx completion flags and msg_len properly +- fi_pingpong: Fix coverity issue about integer overflow + +## Verbs + +- Fix coverity issue about overflowed return value +- Enable implicit dmabuf mr reg for more HMEM ifaces + +## Fabtests + +- Add FI_MORE pytest for fi_recv in zcpy recv mode +- Allow tests with FI_MORE flag by using fi_recvmsg +- New fabtest fi_flood to test over subscription of resources +- test_configs/ofi_rxm/tcp.test: remove cntr RMA testing +- Fix compiler warning about uninitialized variable + + +v2.0.0 beta, Fri Oct 25, 2024 +============================== + +## Core + +- xpmem: Fix compilation warning +- Change the xpmem log level to info +- Clarify FI_HMEM support of inject calls +- Introduce Sub-MR +- Define capability for directed receive without wildcard src_addr +- Define capability for tagged message only directed recv +- Define capability bit for tagged multi receive +- Define flag for single use MR +- Move flags only used for memory registration calls to fi_domain.h +- windows/osd.h: fix and refactor logical operations on complex numbers +- man/fi_peer: update peer fid initialization language +- Remove CURRENT_SYMVER() macro +- 1.8 ABI compat + +## CXI + +- Update provider man page +- Update version to 2.0 +- Remove setting total_buffered_recv +- Update CXI provider + +## EFA + +- Remove unused fields from various data structures +- Update efa shm implementation to allocate fi_peer_srx_context +- Avoid gdr_pin/gdr_map for dmabuf mrs +- Only do dmabuf reg when FI_MR_DMABUF is set +- Report correct inject_msg_size for zcpy rx +- Add setopt/getopt support for remaining EP sizes +- Split RDM EP inject size field into MSG,RMA variants +- Use tclass to prioritize the messages from an ep +- Remove tx_size and rx_size from efa_rdm_ep +- Remove tx_iov_limit and rx_iov_limit from efa_rdm_ep +- Remove DC NACK packet from rxe map after recv completed +- Correctly handle fallback longcts-rtw send completion +- Differentiate unresponsive receiver errors following rdma-core +- Make NACK protocol fall back to DC longCTS when DC is requested +- Update help message for inter_min_read_write_size +- Adjust log level for setopt/getopt +- Add dependency header file in fi_ext_efa.h +- Test: Disable shm via fi_setopt +- Rename p2p_available to mr_p2p_available +- Always use p2p for system memory +- Test: Use correct qp num in the mock +- Shrink the size of extra_info array +- Improve the zero-copy recv error message. +- Update read nack protocol docs +- Receiver sends NACK if p2p is unavailable +- Sender switches to emulated long CTS write if p2p unavailable +- Adjust log level for shm disabling. +- Check p2p support to use rdma read +- Add device to host copy for inject rdma write +- Copy user buffer for fi_sendmsg with FI_INJECT +- Respect FI_MR_LOCAL in transport path + +## HOOK + +- Trace: Add trace log for domain_attr.
+ +## LNX + +- Initial addition + +## OPX + +- Use page_sizes[OFI_PAGE_SIZE] instead of PAGE_SIZE +- Set immediate ACK requested bit when sending last packet of RMA PUT +- Add debug check for zero-byte length data packets +- Conditionally set FI_REMOTE_CQ_DATA on receive +- Include less immediate data in RTS packet to improve rendezvous performance +- Investigate and address indeterminate behavior or segfault resulting from ignored context creation error +- fi_info -e fix for FI_OPX_UUID env var +- Fix last_bytes field for replay over sdma +- Fix eager and mp eager +- Fix payload copy +- Add FI_OPX_TID_MIN_PAYLOAD_BYTES param +- Fix incorrect calculation of immediate block offset in send rendezvous +- Initialize nic info in fi_info +- Simplify fi_opx_check_rma() function. +- Added OPX Tracer points to RMA code paths +- Fix credit return +- Remove polling call from internal rma write +- Support 16B SDMA CTS work +- Fix uepkt 16B headers +- 16B SDMA header support +- Man: Document OPX max ping envvars +- Link bounce support for OPX WFR +- Scb/hdr changes +- Updated configure.m4 for ROCR +- Capitalized env var used for production override, also added opx to the front. +- Remove FI_CONTEXT2 requirement +- Only posting one completion for rzv truncation receives. +- Fixing bug for credit check in inject code path. +- Resolve coverity scan defects uncovered after upstream +- Replace fi_opx_context_slist with slist +- Remove assert from find pkt by tag +- Add OPX Tracer EP lock and Recv entries +- CN5000/JKR: Changes needed to get RMA working in 16B +- Added GDRCopy logging and failure path +- Initial 16B header support +- Fix wrong function used when copying from HMEM/rocr. +- Create GPU-specific SDMA/RZV thresholds +- Don't try to get HMEM iface for NULL pointers +- Limit the number of reliability pings on credit-constrained flows +- Remove function table entries for reliability types other than ONLOAD + +## PSM3 + +- Fix logical atomic function calls +- Check atomic op error code +- Disable complex comparison combinations + +## SHM + +- Add unmap_region function +- Use owner-allocated srx +- Fix incorrect capability set +- Make progress errors ints instead of uint64 +- Remove unused err path from progress_iov +- Refactor initialization process +- Put smr_map memory into av + +## TCP + +- Fix incorrect usage of av insert apis when multiplexing +- Initialize addr_size when duplicating an av + +## Util + +- Change uffd stop routine to use pipe +- Integrate kdreg2 into libfabric +- mr_cache: Support compile default monitor +- Handle page faults in uffd monitor +- Allow providers to update cache MR IOV +- Log AV insert with AV's specified address format +- Add uffd user mode flag for kernels + +## Fabtests + +- Fix compilation error about CMPLX with C99 +- Added -E/env option to multinode test script +- Change xfer-method variable to xfer_method in runmultinode.sh +- Fix complex fill cast +- efa: Remove rnr cq error message check +- efa: Loosen assertion for read request counters +- runfabtests.cmd: add atomic tests to windows testing +- runfabtests.sh: add rdm_atomic validation tests +- rdm_atomic: add data validation +- Change ZE memset to use uint8 +- Change sync message to be 0 bytes instead of 1 byte +- Fix atomic buffer +- Add hmem support to common atomic validation +- Move ubertest atomic validation code to common +- Use new synapse api +- Update fi_multinode test +- Update runmultinode.py with args +- Added inband sync to ft_init_fabric_cm +- lpp: remove deprecated FI_MR_BASIC +- Add option
for conditionally building lpp +- Make building efa conditional +- Call provider specific configure +- efa: Skip inter_min_write_write_size test when rdma write is on +- efa: Add efa_rdma_checker +- lpp: remove invalid condition in fi_tsenddata +- Support no prepost RX pingpong test +- Split out ft_sync logic +- Define common run pingpong function +- Move pingpong logic into pre-posted func +- lpp: update version and protocol in fi_getinfo +- lpp: fix compile warnings +- Remove multi_ep from tcp exclude +- runfabtests.sh: add more multi_ep tests +- Add common threading option +- multi_ep: use common long ops, switch shared-av and cq opts +- multi_ep: add closing and reopening of MRs +- multi_ep: add RMA validation +- Create common raw key functions +- multi_ep: separate MR resources per EP +- efa: Skip memory registration that hit device limit +- efa: Avoid testing duplicate mixed memory type workload +- lpp: Fix compiler warning about unused variables +- Remove deprecated MR modes +- Remove fi_poll and fi_dgram_waitset tests (deprecated feature) + + v2.0.0 alpha, Fri Aug 30, 2024 ============================== diff --git a/configure.ac b/configure.ac index e56e370ee7a..2c5f194e6f9 100644 --- a/configure.ac +++ b/configure.ac @@ -2,14 +2,14 @@ dnl dnl Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2019-2021 Intel, Inc. All rights reserved. dnl Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. -dnl (C) Copyright 2020 Hewlett Packard Enterprise Development LP +dnl (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP dnl Copyright (c) 2022 DataDirect Networks, Inc. All rights reserved. dnl Copyright (c) 2023 Tactical Computing Labs, LLC. All rights reserved. dnl dnl Process this file with autoconf to produce a configure script. 
AC_PREREQ([2.60]) -AC_INIT([libfabric], [2.0.0b1], [ofiwg@lists.openfabrics.org]) +AC_INIT([libfabric], [2.1.0a1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_SRCDIR([src/fabric.c]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) @@ -557,6 +557,37 @@ AS_IF([test $have_uffd -eq 1], AC_DEFINE_UNQUOTED([HAVE_UFFD_UNMAP], [$have_uffd], [Define to 1 if platform supports userfault fd unmap]) +dnl Check uffd thread id support +have_uffd_thread_id=0 +AS_IF([test $have_uffd -eq 1], + [AC_MSG_CHECKING([for userfaultfd thread id support]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include <sys/types.h> + #include <linux/userfaultfd.h> + #include <unistd.h> + #include <sys/syscall.h> + #include <fcntl.h> + #include <sys/ioctl.h> + ]], + [[ + int fd; + struct uffdio_api api_obj; + api_obj.api = UFFD_API; + api_obj.features = UFFD_FEATURE_THREAD_ID | + UFFD_FEATURE_EVENT_UNMAP | + UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_EVENT_REMAP; + fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + return ioctl(fd, UFFDIO_API, &api_obj); + ]]) + ], + [AC_MSG_RESULT([yes]) + have_uffd_thread_id=1], + [AC_MSG_RESULT([no])])]) + +AC_DEFINE_UNQUOTED([HAVE_UFFD_THREAD_ID], [$have_uffd_thread_id], + [Define to 1 if platform supports userfault fd thread id])
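The configure check above only needs to compile its test program; below is a standalone, hedged sketch of the same UFFDIO_API handshake, using the same headers and feature mask as the hunk. Run on a live kernel it reports whether UFFD_FEATURE_THREAD_ID is accepted, which is the property HAVE_UFFD_THREAD_ID gates at build time (the perror/printf reporting is added for illustration and is not part of the configure test):

```c
/* Standalone sketch of the userfaultfd feature probe compiled by the
 * configure check above; same headers and feature mask as the hunk. */
#include <stdio.h>
#include <sys/types.h>
#include <linux/userfaultfd.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/ioctl.h>

int main(void)
{
	struct uffdio_api api_obj;
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0) {
		/* can fail with EPERM when vm.unprivileged_userfaultfd=0 */
		perror("userfaultfd");
		return 1;
	}

	/* UFFDIO_API negotiates features; it fails if any are unsupported */
	api_obj.api = UFFD_API;
	api_obj.features = UFFD_FEATURE_THREAD_ID |
			   UFFD_FEATURE_EVENT_UNMAP |
			   UFFD_FEATURE_EVENT_REMOVE |
			   UFFD_FEATURE_EVENT_REMAP;
	if (ioctl(fd, UFFDIO_API, &api_obj) != 0)
		perror("UFFDIO_API");
	else
		printf("kernel accepts UFFD_FEATURE_THREAD_ID\n");

	close(fd);
	return 0;
}
```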
+ dnl restricted DL open restricted_dl=0 AC_ARG_ENABLE([restricted_dl], @@ -567,6 +598,53 @@ AC_ARG_ENABLE([restricted_dl], AC_DEFINE_UNQUOTED([HAVE_RESTRICTED_DL], [$restricted_dl], [Define to 1 to only look for dl providers under default location if FI_PROVIDER_PATH is not set]) +dnl Check kdreg2 support +kdreg2_enabled=1 +have_kdreg2=0 +have_kdreg2_include_path=0 + +AC_ARG_ENABLE([kdreg2], + [AC_HELP_STRING([--disable-kdreg2], + [Determine whether kdreg2 memory monitor is disabled.])], + [AS_IF([test "$enable_kdreg2" = "no"], [kdreg2_enabled=0])], + []) + +AS_IF([test $kdreg2_enabled -ne 0 ], + [AC_CHECK_HEADER([linux/kdreg2.h], [have_kdreg2=1], [], []) + + AC_ARG_WITH([kdreg2], + [AS_HELP_STRING([--with-kdreg2=DIR], + [Enable KDREG2 memory monitor. + Optional=<path to kdreg2.h>.])], + [AS_CASE(["$with_kdreg2"], + ["no"], [kdreg2_enabled=0], + ["yes"], [], + [""], [], + [CPPFLAGS="$CPPFLAGS -I$with_kdreg2" + AC_CHECK_HEADER([kdreg2.h], + [have_kdreg2=1 + have_kdreg2_include_path=1], + [have_kdreg2=0], + [])]) + AS_IF([test $have_kdreg2 -eq 0 ], + [AC_MSG_ERROR([KDREG2 header not found in $with_kdreg2. Cannot enable KDREG2 memory monitor.])]) + ]) + ]) + +AS_IF([test $kdreg2_enabled -eq 0], + [AC_MSG_NOTICE([kdreg2 monitor disabled])], + [AS_IF([test $have_kdreg2 -ne 0], + [AC_MSG_NOTICE([kdreg2 present and enabled])])]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2, [$have_kdreg2], + [Define to 1 if kdreg2.h is available.]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_INCLUDE_PATH, [$have_kdreg2_include_path], + [Define to 1 if kdreg2.h path is not <linux/kdreg2.h>.]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_MONITOR, [$have_kdreg2], + [Define to 1 to enable kdreg2 memory monitor]) + dnl Check support to intercept syscalls AC_CHECK_HEADERS_ONCE(elf.h sys/auxv.h) @@ -854,6 +932,27 @@ AC_ARG_ENABLE([uffd-monitor], AC_DEFINE_UNQUOTED(ENABLE_UFFD_MONITOR, [$enable_uffd], [Define to 1 to enable uffd memory monitor]) +default_monitor="" +bad_default="0" +AC_ARG_WITH([default-monitor], + [AS_HELP_STRING([--with-default-monitor=<monitor>], + [Select the default memory monitor.])], + [AS_CASE([$with_default_monitor], + [memhooks],[default_monitor=memhooks], + [uffd],[default_monitor=uffd], + [kdreg2],[default_monitor=kdreg2], + [disabled], [default_monitor=disabled], + [AC_MSG_ERROR([Unknown monitor specified: $with_default_monitor. Choices are memhooks, uffd, kdreg2, or disabled.])]) + AS_CASE([$default_monitor], + [memhooks], [AS_IF([test "$enable_memhooks" != "1"], [bad_default=1])], + [uffd], [AS_IF([test "$enable_uffd" != "1"], [bad_default=1])], + [kdreg2], [AS_IF([test "$kdreg2_enabled" != "1"], [bad_default=1])], + []) + AS_IF([test "$bad_default" != "0"], + [AC_MSG_ERROR(["Default memory monitor is not available: $default_monitor."])]) + AC_DEFINE_UNQUOTED([HAVE_MR_CACHE_MONITOR_DEFAULT], ["$default_monitor"], [Default memory monitor]) + ], + []) AH_BOTTOM([ #if defined(__linux__) && (defined(__x86_64__) || defined(__amd64__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)) && ENABLE_MEMHOOKS_MONITOR @@ -1026,6 +1125,7 @@ FI_PROVIDER_SETUP([hook_debug]) FI_PROVIDER_SETUP([hook_hmem]) FI_PROVIDER_SETUP([dmabuf_peer_mem]) FI_PROVIDER_SETUP([opx]) +FI_PROVIDER_SETUP([lnx]) FI_PROVIDER_FINI dnl Configure the .pc file FI_PROVIDER_SETUP_PC diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index e8d11d4edfa..252b60fadc3 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -23,16 +23,11 @@ def download_and_extract_portafiducia(outputDir) { /* Download PortaFiducia tarball from S3 and extract to outputDir */ def tempPath = "/tmp/portafiducia.tar.gz" def downloadPath = this.get_portafiducia_download_path() - - def ret = sh ( - script: "mkdir -p ${outputDir} && aws s3 cp ${downloadPath} ${tempPath} && " + - "tar xf ${tempPath} -C ${outputDir}", - returnStatus: true, - ) - - if (ret != 0) { - unstable('Failed to download and extract PortaFiducia') - } + sh """ + mkdir -p ${outputDir} + aws s3 cp ${downloadPath} ${tempPath} + tar xf ${tempPath} -C ${outputDir} + """ } def install_porta_fiducia() { @@ -48,24 +43,14 @@ def install_porta_fiducia() { ''' } -def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, test_config_file, addl_args) { +def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, addl_args) { /* * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments * param@ args: str, the command line arguments */ def cluster_name = get_cluster_name(build_tag, os, instance_type) - def args = "--config configs/${test_config_file} --os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" - def ret = sh ( - script: ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}", - returnStatus: true - ) - if (ret == 65) - unstable('Scripts exited with status 65') - else if (ret != 0) - build_ok = false - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh "exit ${ret}" - } + def args = "--os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" + sh ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}" } def get_random_string(len) { @@ -76,6 +61,14 @@ def get_random_string(len) { return s } +def get_cluster_name_prefix(build_tag) { + prefix = sh( + script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\" | tr -d '.\\n'", + returnStdout: true + ) + return prefix.take(28) +} + def get_cluster_name(build_tag, os, instance_type) { /* * Compose the cluster name. Pcluster requires a cluster name under 60 characters.
@@ -83,47 +76,36 @@ def get_cluster_name(build_tag, os, instance_type) { * Jenkins does not allow groovy to use the replace() method * of string. Therefore we used shell command sed to replace "." with "" */ - build_tag = sh( - script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\"", - returnStdout: true - ) + build_tag = get_cluster_name_prefix(build_tag) def cluster_name = sh( - script: "echo '${build_tag.take(28)}-${os.take(10)}-${instance_type.take(10)}-'${get_random_string(8)} | tr -d '.\\n'", + script: "echo '${build_tag}-${os.take(10)}-${instance_type.take(10)}-'${get_random_string(8)} | tr -d '.\\n'", returnStdout: true ) return cluster_name } -def get_single_node_windows_test_stage(stage_name) { +def get_single_node_windows_test_stage_with_lock(stage_name, lock_label) { /* * Get Windows Stage */ return { stage("${stage_name}") { - def ret = sh ( - script: """ - . venv/bin/activate; - cd PortaFiducia/scripts; - export PULL_REQUEST_ID=${env.CHANGE_ID}; - env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID}; - """, - returnStatus: true - ) - if (ret == 65) - unstable('Scripts exited with status 65') - else if (ret != 0) - build_ok = false - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh "exit ${ret}" + lock(label: lock_label, quantity: 1) { + sh """ + . venv/bin/activate; + cd PortaFiducia/scripts; + export PULL_REQUEST_ID=${env.CHANGE_ID}; + env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID}; + """ } } } } -def get_test_stage(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) { +def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, instance_count, region, lock_label, addl_args) { /* * Generate a single test stage that run test_orchestrator.py with the given parameters. * param@ stage_name: the name of the stage @@ -132,13 +114,14 @@ def get_test_stage(stage_name, build_tag, os, instance_type, instance_count, reg * param@ instance_type: the instance type for the test stage. * param@ instance_count: number of intances to use * param@ region: the (default) aws region where the tests are run. - * param@ test_config: the name of test config file in PortaFiducia/tests/configs/ * param@ addl_args: additional arguments passed to test_orchestrator.py * return@: the test stage. */ return { stage("${stage_name}") { - this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) + lock(label: lock_label, quantity: instance_count) { + this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, addl_args) + } } } } @@ -151,11 +134,8 @@ pipeline { } options { buildDiscarder(logRotator(daysToKeepStr: "90")) - timeout(time: 8, unit: 'HOURS') - } - environment { - // AWS region where the cluster is created - REGION="us-west-2" + timeout(time: 10, unit: 'HOURS') + skipDefaultCheckout() } stages { // Cleanup workspace before job start. 
@@ -172,7 +152,6 @@ pipeline { stage("Download and extract PortaFiducia") { steps { script { - sh 'printenv' download_and_extract_portafiducia('PortaFiducia') } } @@ -189,46 +168,89 @@ pipeline { steps { script { def stages = [:] - // This needs the extra space at the end - def addl_args_pr = "--test-libfabric-pr $env.CHANGE_ID " + def timeout = "--timeout 210" + def generic_pf = "--cluster-type manual_cluster --test-target libfabric --test-type pr --test-libfabric-pr $env.CHANGE_ID" + // onesided tests are covered by imb, collective tests are covered by omb + def mpi_collective_tests = "'test_omb and not onesided'" + def libfabric_tests = "test_efa_ut test_fabtests_functional test_fork_support test_backward_compatibility" + def one_sided_tests = "'test_imb and not collective'" + def libfabric_and_onesided_tests = "${libfabric_tests} ${one_sided_tests}" + + def efa_provider = "--test-libfabric-provider efa" + def addl_args_efa_libfabric_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" + def addl_args_efa_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${mpi_collective_tests}" + def addl_args_efa_libfabric_and_onesided_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${libfabric_and_onesided_tests}" + + def shm_provider = "--test-libfabric-provider shm" + def addl_args_shm = "${timeout} ${generic_pf} ${shm_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" + + def tcp_provider = "--test-libfabric-provider tcp --enable-efa false" + def addl_args_tcp = "${timeout} ${generic_pf} ${tcp_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" + + def sockets_provider = "--test-libfabric-provider sockets --enable-efa false" + def addl_args_sockets = "${timeout} ${generic_pf} ${sockets_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" + + // Use lockable resources to limit the number of jobs that can get executed in parallel + def g4dn8x_lock_label = "g4dn8x" + def g4dn12x_lock_label = "g4dn12x" + def c52x_lock_label = "c52x" + def hpc6a48x_lock_label = "hpc6a48x" + def c6gn16x_lock_label = "c6gn16x" + def c5n18x_lock_label = "c5n18x" + def c6g2x_lock_label = "c6g2x" // Single Node Tests - EFA - stages["1_g4dn_alinux2-efa"] = get_test_stage("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["1_g4dn_alinux2023-efa"] = get_test_stage("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["1_g4dn_ubuntu2004-efa"] = get_test_stage("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["1_g4dn_rhel8-efa"] = get_test_stage("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) + stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) + stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) + 
stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) // Single Node Tests - SHM - stages["1_g4dn_alinux2_shm"] = get_test_stage("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") - stages["1_g4dn_alinux2023_shm"] = get_test_stage("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") - stages["1_g4dn_ubuntu2004_shm"] = get_test_stage("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") - stages["1_c5_rhel8_shm"] = get_test_stage("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm --enable-efa false") - stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm --enable-cma false --enable-efa false") + stages["1_g4dn_alinux2_shm"] = get_test_stage_with_lock("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm) + stages["1_g4dn_alinux2023_shm"] = get_test_stage_with_lock("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm) + stages["1_g4dn_ubuntu2004_shm"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm) + stages["1_c5_rhel8_shm"] = get_test_stage_with_lock("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", c52x_lock_label, addl_args_shm + " --enable-efa false") + stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage_with_lock("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", c52x_lock_label, addl_args_shm + " --enable-cma false --enable-efa false") // Single Node Windows Test - stages["EFA_Windows_Test"] = get_single_node_windows_test_stage("EFA_Windows_Test") + stages["EFA_Windows_Test"] = get_single_node_windows_test_stage_with_lock("EFA_Windows_Test", c5n18x_lock_label) // Multi Node Tests - EFA - stages["2_hpc6a_alinux2_efa"] = get_test_stage("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_hpc6a_alinux2023_efa"] = get_test_stage("2_hpc6a_alinux2023_efa", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c6gn_alinux2_efa"] = get_test_stage("2_c6gn_alinux2_efa", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c6gn_alinux2023_efa"] = get_test_stage("2_c6gn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c5n_alinux2_efa"] = get_test_stage("2_c5n_alinux2_efa", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c5n_alinux2023_efa"] = get_test_stage("2_c5n_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", 
addl_args_pr) - stages["2_hpc6a_ubuntu2004_efa"] = get_test_stage("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_hpc6a_rhel8_efa"] = get_test_stage("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["2_hpc6a_alinux2_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_alinux2_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_hpc6a_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_c6gn_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_mpi) + stages["2_c6gn_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_c5n_alinux2_efa_mpi"] = get_test_stage_with_lock("2_c5n_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_mpi) + stages["2_c5n_alinux2_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c5n_alinux2_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_c5n_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_c5n_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_mpi) + stages["2_c5n_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c5n_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_hpc6a_ubuntu2004_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa_mpi", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_ubuntu2004_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa_libfabric_and_one_sided", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_hpc6a_rhel8_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa_mpi", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_rhel8_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa_libfabric_and_one_sided", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + + // cg6n AL2 builds are the slowest b/c they have asan 
turned on with debug, and have slower memcpy speeds + // split "libfabric tests" into "fabtests", and imb + def addl_args_efa_one_sided_only = "${timeout} ${generic_pf} ${efa_provider} --test-list ${one_sided_tests}" + def addl_args_efa_libfabric_only = "${timeout} ${generic_pf} ${efa_provider} --test-list ${libfabric_tests}" + stages["2_c6gn_alinux2_efa_mpi"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_mpi) + stages["2_c6gn_alinux2_efa_one_sided"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_one_sided", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_one_sided_only) + stages["2_c6gn_alinux2_efa_libfabric"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_libfabric", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_libfabric_only) // Multi Node Tests - TCP - stages["2_c6g_alinux2_tcp"] = get_test_stage("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_alinux2023_tcp"] = get_test_stage("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_ubuntu2004_tcp"] = get_test_stage("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_rhel8_tcp"] = get_test_stage("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") + stages["2_c6g_alinux2_tcp"] = get_test_stage_with_lock("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["2_c6g_alinux2023_tcp"] = get_test_stage_with_lock("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["2_c6g_ubuntu2004_tcp"] = get_test_stage_with_lock("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["2_c6g_rhel8_tcp"] = get_test_stage_with_lock("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", g4dn12x_lock_label, addl_args_tcp + " --test-list test_nccl_tests --test-iterations fastest") // Multi Node Tests - SOCKETS - stages["2_c6g_alinux2_sockets"] = get_test_stage("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_alinux2023_sockets"] = get_test_stage("2_c6g_alinux2023_sockets", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_ubuntu2004_sockets"] = get_test_stage("2_c6g_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_rhel8_sockets"] = 
get_test_stage("2_c6g_rhel8_sockets", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") + stages["2_c6g_alinux2_sockets"] = get_test_stage_with_lock("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) + stages["2_c6g_alinux2023_sockets"] = get_test_stage_with_lock("2_c6g_alinux2023_sockets", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) + stages["2_c6g_ubuntu2004_sockets"] = get_test_stage_with_lock("2_c6g_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) + stages["2_c6g_rhel8_sockets"] = get_test_stage_with_lock("2_c6g_rhel8_sockets", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) parallel stages } @@ -252,17 +274,20 @@ pipeline { sh 'find PortaFiducia/tests/outputs -name "*.xml" | xargs du -shc' junit testResults: 'PortaFiducia/tests/outputs/**/*.xml', keepLongStdio: false archiveArtifacts artifacts: 'PortaFiducia/tests/outputs/**/*.*' + script { + // Try To Cleanup Resources + def regions = ["us-east-1", "eu-north-1", "us-west-2"] + cluster_name_prefix = get_cluster_name_prefix(env.BUILD_TAG) + regions.each { region -> + sh ". venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name '${cluster_name_prefix}*' --region ${region}" + } + // Windows Cluster, has a different name + sh """ + . venv/bin/activate + ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name WindowsLibfabricCi_${env.CHANGE_ID}_* + """ + } } - failure { - sh ''' - . venv/bin/activate - ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name WindowsLibfabricCi_${env.CHANGE_ID}_* - ''' - } - aborted { - sh '. venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name "$BUILD_TAG"\'*\' --region $REGION' - } - // Cleanup workspace after job completes. cleanup { deleteDir() } diff --git a/contrib/cray/build.sh b/contrib/cray/build.sh new file mode 100644 index 00000000000..71faaeb289c --- /dev/null +++ b/contrib/cray/build.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# +# Copyright 2024 Hewlett Packard Enterprise Development LP. All rights reserved. +# + +set -Exeuo pipefail + +CE_BUILD_SCRIPT_REPO=hpc-shs-ce-devops +CE_CONFIG_BRANCH=${CE_CONFIG_BRANCH:-main} +if [ -d ${CE_BUILD_SCRIPT_REPO} ]; then + git -C ${CE_BUILD_SCRIPT_REPO} checkout ${CE_CONFIG_BRANCH} + git -C ${CE_BUILD_SCRIPT_REPO} pull +else + git clone --branch "${CE_CONFIG_BRANCH}" https://$HPE_GITHUB_TOKEN@github.hpe.com/hpe/${CE_BUILD_SCRIPT_REPO}.git +fi + +. 
${CE_BUILD_SCRIPT_REPO}/build/sh/rpmbuild/load.sh + +setup_dst_env +dst_build_rpm -c ${CE_BUILD_SCRIPT_REPO}/build/configs/${CE_CONFIG_FILE} $@ + diff --git a/contrib/cray/run.cxi.jenkins b/contrib/cray/run.cxi.jenkins new file mode 100755 index 00000000000..5f4a60fbe41 --- /dev/null +++ b/contrib/cray/run.cxi.jenkins @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +PRODUCT=slingshot-host-software-internal \ +CE_CONFIG_FILE=libfabric-cxi.yaml \ + bash $DIR/build.sh -n $@ diff --git a/contrib/cray/run.cxi.local b/contrib/cray/run.cxi.local new file mode 100755 index 00000000000..12dd4bbbccc --- /dev/null +++ b/contrib/cray/run.cxi.local @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +SHS_LOCAL_BUILD=y \ +BRANCH_NAME=${BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD)} \ +PRODUCT=slingshot-host-software-internal \ +CE_CONFIG_FILE=libfabric-cxi.yaml \ + bash $DIR/build.sh $@ diff --git a/contrib/cray/run.verbs.jenkins b/contrib/cray/run.verbs.jenkins new file mode 100755 index 00000000000..9c18551d27d --- /dev/null +++ b/contrib/cray/run.verbs.jenkins @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +CE_CONFIG_FILE=libfabric-verbs.yaml \ + bash $DIR/build.sh -n $@ diff --git a/contrib/cray/run.verbs.local b/contrib/cray/run.verbs.local new file mode 100755 index 00000000000..44aaf8a12df --- /dev/null +++ b/contrib/cray/run.verbs.local @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +SHS_LOCAL_BUILD=y \ +BRANCH_NAME=${BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD)} \ +CE_CONFIG_FILE=libfabric-verbs.yaml \ + bash $DIR/build.sh $@ diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index b9a2aa1ad93..7385ad5ae91 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -7,6 +7,7 @@ properties([disableConcurrentBuilds(abortPrevious: true)]) @Field def BUILD_MODES=["reg", "dbg", "dl"] @Field def PYTHON_VERSION="3.9" @Field def TIMEOUT="7200" +@Field def weekly=false def run_python(version, command, output=null) { if (output != null) @@ -16,10 +17,10 @@ def run_python(version, command, output=null) { } def slurm_batch(partition, node_num, output, command) { - try { sh """sbatch --partition=${partition} -N ${node_num} \ --wait -o ${output} --open-mode=append \ + -J ${env.SLURM_JOB_NAME} \ --wrap=\'env; timeout $TIMEOUT ${command}\' """ } catch (Exception e) { @@ -101,9 +102,16 @@ def run_ci(stage_name, config_name) { """ } +def build_ci(config_name) { + sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python run.py \ + --output=${env.CUSTOM_WORKSPACE}/pre-build \ + --job=${config_name} + """ +} + def gather_logs(cluster, key, dest, source) { def address = "${env.USER}@${cluster}" - try { sh "scp -r -i ${key} ${address}:${source}/* ${dest}/" } catch (Exception e) { @@ -112,18 +120,19 @@ def gather_logs(cluster, key, dest, source) { } def CI_summarize(verbose=false) { + cmd = """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python ${CI_LOCATION}/summarize.py \ + --log_directory=${env.LOG_DIR} + """ if (verbose) { - sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ - python ${CI_LOCATION}/summarize.py \ - --log_directory=${env.LOG_DIR} \ - -v - """ - } else { - sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ - python ${CI_LOCATION}/summarize.py \ - --log_directory=${env.LOG_DIR} - """ + cmd = "${cmd} -v" + } + + if (weekly || RELEASE) { + cmd = "${cmd} --send-mail" } + + sh 
"${cmd}" } def summarize(item, verbose=false, release=false, send_mail=false) { @@ -200,27 +209,17 @@ def checkout_external_resources() { checkout_ci() } -def generate_diff(def branch_name, def output_loc) { - sh """ - git remote add mainRepo ${env.UPSTREAM} - git fetch mainRepo - git diff --name-only HEAD..mainRepo/${branch_name} > ${output_loc}/commit_id - git remote remove mainRepo - """ -} +def git_diffs() { + dir ("${CUSTOM_WORKSPACE}/source/libfabric") { + sh """ + git diff --name-only HEAD..upstream/${TARGET} > ./commit_id + git diff upstream/${TARGET}:Makefile.am Makefile.am > ./Makefile.am.diff + git diff upstream/${TARGET}:configure.ac configure.ac > ./configure.ac.diff -def generate_release_num(def branch_name, def output_loc) { - sh """ - git remote add mainRepo ${env.UPSTREAM} - git fetch mainRepo - git diff mainRepo/${branch_name}:Makefile.am Makefile.am > \ - ${output_loc}/Makefile.am.diff - git diff mainRepo/${branch_name}:configure.ac configure.ac > \ - ${output_loc}/configure.ac.diff - cat configure.ac | grep AC_INIT | cut -d ' ' -f 2 | \ - cut -d '[' -f 2 | cut -d ']' -f 1 > ${output_loc}/release_num.txt - git remote remove mainRepo - """ + cat configure.ac | grep AC_INIT | cut -d ' ' -f 2 | \ + cut -d '[' -f 2 | cut -d ']' -f 1 > ./release_num.txt + """ + } } def slurm_build(modes, partition, location, tag, hw=null, additional_args=null) { @@ -276,10 +275,32 @@ def build(item, mode=null, hw=null, additional_args=null) { run_python(PYTHON_VERSION, cmd) } -def build_ci() { +def bootstrap_ci() { sh "${CI_LOCATION}/${env.CI_MODULE}/bootstrap.sh" } +def checkout_tar(name) { + if (env.WEEKLY == null) { + weekly = false + } else { + weekly = env.WEEKLY.toBoolean() + } + dir ("${env.CUSTOM_WORKSPACE}/${name}/libfabric") { + checkout scm + TARGET=check_target() + if (weekly) { + TARGET=env.WEEKLY_TARGET + } + sh """ + git remote add upstream ${env.UPSTREAM} + git pull --rebase upstream ${TARGET} + """ + } + dir ("${env.CUSTOM_WORKSPACE}/${name}/") { + sh "tar -cvf libfabric.tar.gz libfabric/*" + } +} + def check_target() { echo "CHANGE_TARGET = ${env.CHANGE_TARGET}" if (changeRequest()) { @@ -331,7 +352,7 @@ def skip() { } echo "Changeset is: ${changeStrings.toArray()}" - if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx|contrib\/aws).*$/ }) { + if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx|prov\/cxi|prov\/lpp|contrib\/aws).*$/ }) { echo "DONT RUN!" 
return true } @@ -347,7 +368,7 @@ def skip() { pipeline { agent { node { - label 'main' + label 'cbj-main' customWorkspace "workspace/${JOB_NAME}/${env.BUILD_NUMBER}" } } @@ -361,6 +382,7 @@ pipeline { WITH_ENV="'PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH'" CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" DELETE_LOCATION="${env.CUSTOM_WORKSPACE}/middlewares" + SLURM_JOB_NAME="${env.JOB_NAME}_${env.BUILD_NUMBER}" RUN_LOCATION="${env.CUSTOM_WORKSPACE}/ci_resources/legacy_pipeline_scripts/" CI_LOCATION="${env.CUSTOM_WORKSPACE}/ci" LOG_DIR = "${env.CUSTOM_WORKSPACE}/log_dir" @@ -369,47 +391,34 @@ pipeline { stage ('checkout') { steps { script { - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/grass/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/water/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/electric/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/ucx/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/cuda/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/iouring/libfabric") { - checkout scm - } + checkout_tar("source") dir (CUSTOM_WORKSPACE) { checkout_external_resources() } } } } + stage ('bootstrap-ci') { + steps { + script { + bootstrap_ci() + } + } + } + stage('check-authorization') { + steps { + script { + sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python ${CI_LOCATION}/authorize.py \ + --author=${env.CHANGE_AUTHOR} + """ + } + } + } stage ('opt-out') { steps { script { - TARGET=check_target() - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { - generate_diff("${TARGET}", "${env.WORKSPACE}/source/libfabric") - generate_release_num("${TARGET}", "${env.WORKSPACE}/source/libfabric") - } - - if (env.WEEKLY == null) { - weekly = false - } else { - weekly = env.WEEKLY.toBoolean() - } + git_diffs() if (weekly) { TIMEOUT="21600" } @@ -421,6 +430,16 @@ pipeline { } } } + stage ('health check') { + when { equals expected: true, actual: DO_RUN } + steps { + script { + dir (CI_LOCATION) { + sh "./temperature.sh" + } + } + } + } stage ('prepare build') { when { equals expected: true, actual: DO_RUN } steps { @@ -432,84 +451,55 @@ pipeline { } } } - stage ('parallel-builds') { + stage ('build-libfabric') { when { equals expected: true, actual: DO_RUN } parallel { - stage ('build-ci') { - steps { - script { - build_ci() - } - } - } - stage ('build-water') { - steps { - script { - slurm_build(BUILD_MODES, "totodile", "water", "water", "water") - slurm_batch("totodile", "1", - "${env.LOG_DIR}/build_mpich_water_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=mpich --build_hw=water""" - ) - slurm_batch("totodile", "1", - "${env.LOG_DIR}/build_shmem_water_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=shmem --build_hw=water""" - ) - } - } - } - stage ('build-grass') { + stage ('water') { steps { script { - slurm_build(BUILD_MODES, "grass", "grass", "grass", "grass") - slurm_batch("grass", "1", - "${env.LOG_DIR}/build_mpich_grass_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=mpich --build_hw=grass""" - ) - slurm_batch("grass", "1", - "${env.LOG_DIR}/build_shmem_grass_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=shmem --build_hw=grass""" - ) + dir (CI_LOCATION) { + build_ci("pr_build_water.json") + } } } } - stage ('build-electric') { + stage ('grass') { steps { script { - slurm_build(BUILD_MODES, "electric", 
"electric", "electric", - "electric") + dir (CI_LOCATION) { + build_ci("pr_build_grass.json") + } } } } - stage ('build-ucx') { + stage ('electric') { steps { script { - slurm_build(BUILD_MODES, "totodile", "ucx", "ucx", "ucx") + dir (CI_LOCATION) { + build_ci("pr_build_electric.json") + } } } } - stage ('build-cuda') { + stage ('cyndaquil') { steps { script { - slurm_build(["reg"], "cyndaquil", "cuda", "cyndaquil", - "cyndaquil", "--cuda") - slurm_build(["reg"], "quilava", "cuda", "quilava", - "quilava", "--cuda") + dir (CI_LOCATION) { + build_ci("pr_build_cyndaquil.json") + } } } } - stage ('build-iouring') { + stage ('quilava') { steps { script { - slurm_build(BUILD_MODES, "ivysaur", "iouring", "ivysaur", - "ivysaur") + dir (CI_LOCATION) { + build_ci("pr_build_quilava.json") + } } } } - stage ('build-daos') { + stage ('daos') { agent { node { label 'daos_head' @@ -519,17 +509,19 @@ pipeline { options { skipDefaultCheckout() } steps { script { - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { checkout scm } + checkout_tar("source") checkout_external_resources() dir (CUSTOM_WORKSPACE) { build("logdir") - build("libfabric", "reg", "daos") - build("fabtests", "reg", "daos") + } + bootstrap_ci() + dir (CI_LOCATION) { + build_ci("pr_build_daos.json") } } } } - stage ('build-gpu') { + stage ('fire') { agent { node { label 'ze' @@ -539,14 +531,82 @@ pipeline { options { skipDefaultCheckout() } steps { script { - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { checkout scm } + checkout_tar("source") checkout_external_resources() dir (CUSTOM_WORKSPACE) { build("logdir") build("builddir") - build_ci() - slurm_build(BUILD_MODES, "fabrics-ci", "source", "ze", "gpu", - "--gpu") + } + bootstrap_ci() + dir (CI_LOCATION) { + build_ci("pr_build_fire.json") + } + } + } + } + } + } + stage('build-middlewares') { + when { equals expected: true, actual: DO_RUN } + parallel { + stage ('shmem-water') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_shmem_water.json") + } + } + } + } + stage ('shmem-grass') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_shmem_grass.json") + } + } + } + } + stage ('ompi-water') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_ompi_water.json") + } + } + } + } + stage ('ompi-grass') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_ompi_grass.json") + } + } + } + } + stage ('mpich-water') { + steps { + script { + dir (CI_LOCATION) { + slurm_batch("water", "1", + "${env.LOG_DIR}/build_mpich_water_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=mpich --build_hw=water""" + ) + } + } + } + } + stage ('mpich-grass') { + steps { + script { + dir (CI_LOCATION) { + slurm_batch("grass", "1", + "${env.LOG_DIR}/build_mpich_grass_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=mpich --build_hw=grass""" + ) } } } @@ -556,6 +616,41 @@ pipeline { stage('parallel-tests') { when { equals expected: true, actual: DO_RUN } parallel { + stage('mpichtestsuite-tcp') { + steps { + script { + dir (RUN_LOCATION) { + def providers = [['tcp', null]] + def MPIS = ["mpich"] + if (env.WEEKLY.toBoolean()) { + MPIS = ["impi", "mpich"] + } + for (def mpi in MPIS) { + run_middleware(providers, "mpichtestsuite", "mpichtestsuite", + "grass", "bulbasaur,ivysaur", "2", "${mpi}") + } + } + } + } + } + stage('mpichtestsuite-verbs') { + steps { + script { + dir (RUN_LOCATION) { + def providers = [["verbs","rxm"]] + def MPIS = ["mpich"] + if (env.WEEKLY.toBoolean()) { + MPIS = ["impi", "mpich"] + } + 
for (def mpi in MPIS) { + run_middleware(providers, "mpichtestsuite", "mpichtestsuite", + "water", "squirtle,wartortle,articuno", "2", + "${mpi}") + } + } + } + } + } stage ('CI_mpi_verbs-rxm_imb') { steps { script { @@ -603,6 +698,19 @@ pipeline { } } } + stage('CI_fabtests_tcp-rxm') { + steps { + script { + dir (CI_LOCATION) { + run_ci("CI_fabtests_tcp-rxm_reg", + "pr_fabtests_tcp-rxm_reg.json") + run_ci("CI_fabtests_tcp-rxm_dbg", + "pr_fabtests_tcp-rxm_dbg.json") + run_ci("CI_fabtests_tcp-rxm_dl", "pr_fabtests_tcp-rxm_dl.json") + } + } + } + } stage('CI_fabtests_sockets') { steps { script { @@ -636,13 +744,16 @@ pipeline { } } } - stage('CI_fabtests_ivysaur') { + stage('CI_fabtests_tcp_io_uring') { steps { script { dir (CI_LOCATION) { - run_ci("CI_fabtests_ivysaur_reg", "pr_fabtests_ivysaur_reg.json") - run_ci("CI_fabtests_ivysaur_dbg", "pr_fabtests_ivysaur_dbg.json") - run_ci("CI_fabtests_ivysaur_dl", "pr_fabtests_ivysaur_dl.json") + run_ci("CI_fabtests_tcp_io_uring_reg", + "pr_fabtests_tcp_io_uring_reg.json") + run_ci("CI_fabtests_tcp_io_uring_dbg", + "pr_fabtests_tcp_io_uring_dbg.json") + run_ci("CI_fabtests_tcp_io_uring_dl", + "pr_fabtests_tcp_io_uring_dl.json") } } } @@ -691,29 +802,12 @@ pipeline { } } } - stage('mpichtestsuite') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [['tcp', null], ["verbs","rxm"]] - def MPIS = ["mpich"] - if (env.WEEKLY.toBoolean()) { - MPIS = ["impi", "mpich"] - } - for (def mpi in MPIS) { - run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "water", "totodile", "2", "${mpi}") - } - } - } - } - } stage('SHMEM_grass') { steps { script { - dir (RUN_LOCATION) { - run_middleware([["tcp", null]], "SHMEM", "shmem", - "grass", "bulbasaur,chikorita", "2") + dir (CI_LOCATION) { + run_ci("CI_shmem_grass", "pr_shmem_1n2ppn_grass.json") + run_ci("CI_shmem_grass", "pr_shmem_2n1ppn_grass.json") } } } @@ -721,9 +815,9 @@ pipeline { stage('SHMEM_water') { steps { script { - dir (RUN_LOCATION) { - run_middleware([["verbs", "rxm"], ["sockets", null]], "SHMEM", - "shmem", "water", "totodile", "2") + dir (CI_LOCATION) { + run_ci("CI_shmem_water", "pr_shmem_1n2ppn_water.json") + run_ci("CI_shmem_water", "pr_shmem_2n1ppn_water.json") } } } @@ -731,12 +825,11 @@ pipeline { stage ('multinode_performance') { steps { script { - dir (RUN_LOCATION) { - run_middleware([["tcp", null],["sockets", null]], - "multinode_performance", "multinode", "grass", - "bulbasaur,chikorita", "2") - run_middleware([["verbs", "rxm"]], "multinode_performance", - "multinode", "water", "totodile", "2") + dir (CI_LOCATION) { + run_ci("CI_multinode_performance_grass", + "pr_multinode_performance_grass.json") + run_ci("CI_multinode_performance_water", + "pr_multinode_performance_water.json") } } } @@ -746,13 +839,15 @@ pipeline { script { dir (RUN_LOCATION) { run_middleware([["verbs", null]], "oneCCL", - "oneccl", "water", "totodile", "2") + "oneccl", "water", + "squirtle,wartortle,articuno", "2") run_middleware([["shm", null]], "oneCCL", - "oneccl", "grass", "bulbasaur,chikorita", "1") + "oneccl", "grass", "bulbasaur,ivysaur", "1") run_middleware([["psm3", null]], "oneCCL", - "oneccl", "water", "totodile", "2") + "oneccl", "water", + "squirtle,wartortle,articuno", "2") run_middleware([["tcp", null]], "oneCCL", - "oneccl", "grass", "bulbasaur,chikorita", "2") + "oneccl", "grass", "bulbasaur,ivysaur", "2") run_middleware([["shm", null]], "oneCCL_DSA", "oneccl", "electric", "pikachu", "1", null, null, """CCL_ATL_SHM=1 FI_SHM_DISABLE_CMA=1 \ @@ -767,18 +862,18 @@ pipeline { 
steps { script { dir (RUN_LOCATION) { - run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, + run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", + "fire", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") - run_middleware([["verbs", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, + run_middleware([["verbs", null]], "oneCCL-GPU-v3", "onecclgpu", + "fire", "torchic", "1", null, null, + "FI_HMEM_DISABLE_P2P=1") + run_middleware([["tcp", null]], "oneCCL-GPU-v3", "onecclgpu", + "fire", "torchic", "1", null, null, + "FI_HMEM_DISABLE_P2P=1") + run_middleware([["shm", null]], "oneCCL-GPU-v3", "onecclgpu", + "fire", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") - run_middleware([["tcp", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") - run_middleware([["shm", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") } } } @@ -819,7 +914,7 @@ pipeline { dir (RUN_LOCATION) { dmabuf_output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf" cmd = """ python3.9 runtests.py --test=dmabuf \ - --prov=verbs --util=rxm --build_hw=gpu""" + --prov=verbs --util=rxm --build_hw=fire""" slurm_batch("torchic", "1", "${dmabuf_output}_reg", "${cmd}") } @@ -909,24 +1004,6 @@ pipeline { summarize("all") } } - } - aborted { - node ('daos_head') { - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - node ('ze') { - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - success { - script { - if (DO_RUN) { - CI_summarize(verbose=true) - summarize("all", verbose=true, release=false, - send_mail=env.WEEKLY.toBoolean()) - } - } node ('daos_head') { dir("${env.WORKSPACE}") { deleteDir() } dir("${env.WORKSPACE}@tmp") { deleteDir() } diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index ac193830365..a60ec46cb41 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -22,10 +22,8 @@ bin_PROGRAMS = \ functional/fi_rdm_deferred_wq \ functional/fi_dgram \ functional/fi_mcast \ - functional/fi_dgram_waitset \ functional/fi_rdm_tagged_peek \ functional/fi_cq_data \ - functional/fi_poll \ functional/fi_scalable_ep \ functional/fi_shared_ctx \ functional/fi_msg_epoll \ @@ -42,7 +40,7 @@ bin_PROGRAMS = \ functional/fi_rdm_atomic \ functional/fi_rdm_stress \ functional/fi_multi_recv \ - functional/fi_bw \ + functional/fi_flood \ functional/fi_rdm_multi_client \ functional/fi_loopback \ benchmarks/fi_msg_pingpong \ @@ -161,7 +159,6 @@ nobase_dist_config_DATA = \ pytest/default/test_msg.py \ pytest/default/test_multinode.py \ pytest/default/test_multi_recv.py \ - pytest/default/test_poll.py \ pytest/default/test_rdm.py \ pytest/default/test_recv_cancel.py \ pytest/default/test_rma_bw.py \ @@ -232,6 +229,8 @@ libfabtests_la_SOURCES = \ common/hmem_ze.c \ common/hmem_neuron.c \ common/hmem_synapseai.c \ + common/ofi_atomic.c \ + include/ofi_atomic.h \ include/shared.h \ include/ft_list.h \ include/hmem.h \ @@ -297,10 +296,6 @@ functional_fi_mcast_SOURCES = \ functional/mcast.c functional_fi_mcast_LDADD = libfabtests.la -functional_fi_dgram_waitset_SOURCES = \ - functional/dgram_waitset.c -functional_fi_dgram_waitset_LDADD = libfabtests.la - functional_fi_rdm_tagged_peek_SOURCES = \ functional/rdm_tagged_peek.c functional_fi_rdm_tagged_peek_LDADD = libfabtests.la @@ -321,10 +316,6 @@ functional_fi_shared_ctx_SOURCES = \ functional/shared_ctx.c 
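Note: the common/ofi_atomic.c and include/ofi_atomic.h entries added to libfabtests_la_SOURCES above (and removed from the ubertest sources below) make the atomic-emulation helpers available to every test rather than only fi_ubertest. A minimal sketch of the pattern this enables; only ofi_atomic_write_handler()/ofi_atomic_write_op(), used the same way by ft_check_atomic() later in this patch, are taken as given, and verify_atomic_sum() itself is hypothetical:

	#include <stdint.h>
	#include <string.h>
	#include "ofi_atomic.h"

	/* Emulate FI_SUM on a host-side copy of the target buffer, then
	 * compare it against the buffer the provider actually updated. */
	static int verify_atomic_sum(uint64_t *dst_copy, uint64_t *src,
				     const uint64_t *dst_after, size_t count)
	{
		if (!ofi_atomic_write_handler(FI_SUM, FI_UINT64))
			return 1;	/* > 0: no local handler, cannot verify */

		/* Applies dst_copy[i] += src[i], mirroring the wire operation. */
		ofi_atomic_write_op(FI_SUM, FI_UINT64, dst_copy, src, count);

		return memcmp(dst_copy, dst_after,
			      count * sizeof(*dst_after)) ? -1 : 0;
	}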
functional_fi_shared_ctx_LDADD = libfabtests.la -functional_fi_poll_SOURCES = \ - functional/poll.c -functional_fi_poll_LDADD = libfabtests.la - functional_fi_multi_ep_SOURCES = \ functional/multi_ep.c functional_fi_multi_ep_LDADD = libfabtests.la @@ -369,9 +360,9 @@ functional_fi_multi_recv_SOURCES = \ functional/multi_recv.c functional_fi_multi_recv_LDADD = libfabtests.la -functional_fi_bw_SOURCES = \ - functional/bw.c -functional_fi_bw_LDADD = libfabtests.la +functional_fi_flood_SOURCES = \ + functional/flood.c +functional_fi_flood_LDADD = libfabtests.la functional_fi_rdm_multi_client_SOURCES = \ functional/rdm_multi_client.c @@ -479,8 +470,6 @@ unit_fi_setopt_test_LDADD = libfabtests.la ubertest_fi_ubertest_SOURCES = \ ubertest/fabtest.h \ - ubertest/ofi_atomic.h \ - ubertest/ofi_atomic.c \ ubertest/uber.c \ ubertest/connect.c \ ubertest/cq.c \ @@ -624,7 +613,6 @@ dummy_man_pages = \ man/man1/fi_cm_data.1 \ man/man1/fi_cq_data.1 \ man/man1/fi_dgram.1 \ - man/man1/fi_dgram_waitset.1 \ man/man1/fi_inj_complete.1 \ man/man1/fi_mcast.1 \ man/man1/fi_msg.1 \ @@ -632,7 +620,6 @@ dummy_man_pages = \ man/man1/fi_msg_sockets.1 \ man/man1/fi_multi_ep.1 \ man/man1/fi_multi_mr.1 \ - man/man1/fi_poll.1 \ man/man1/fi_rdm.1 \ man/man1/fi_rdm_atomic.1 \ man/man1/fi_rdm_deferred_wq.1 \ @@ -664,7 +651,7 @@ dummy_man_pages = \ man/man1/fi_eq_test.1 \ man/man1/fi_getinfo_test.1 \ man/man1/fi_mr_test.1 \ - man/man1/fi_bw.1 \ + man/man1/fi_flood.1 \ man/man1/fi_rdm_multi_client.1 \ man/man1/fi_ubertest.1 \ man/man1/fi_efa_ep_rnr_retry.1 diff --git a/fabtests/Makefile.win b/fabtests/Makefile.win index e6b4f8a76b4..fb7924227c8 100644 --- a/fabtests/Makefile.win +++ b/fabtests/Makefile.win @@ -46,7 +46,7 @@ CFLAGS = $(CFLAGS) /O2 /MT basedeps = common\hmem.c common\shared.c \ common\windows\getopt.c common\windows\osd.c \ common\hmem_cuda.c common\hmem_rocr.c common\hmem_ze.c \ - common\hmem_neuron.c common\hmem_synapseai.c + common\hmem_neuron.c common\hmem_synapseai.c common\ofi_atomic.c includes = /Iinclude /Iinclude\windows /I..\include /FIft_osd.h \ /Iinclude\windows\getopt /Imultinode\include @@ -77,10 +77,10 @@ benchmarks: $(outdir)\dgram_pingpong.exe $(outdir)\msg_bw.exe \ $(outdir)\rdm_pingpong.exe $(outdir)\rma_pingpong.exe $(outdir)\rdm_tagged_bw.exe \ $(outdir)\rdm_bw.exe $(outdir)\rdm_tagged_pingpong.exe $(outdir)\rma_bw.exe -functional: $(outdir)\av_xfer.exe $(outdir)\bw.exe $(outdir)\cm_data.exe $(outdir)\cq_data.exe \ - $(outdir)\dgram.exe $(outdir)\dgram_waitset.exe $(outdir)\msg.exe $(outdir)\msg_epoll.exe \ +functional: $(outdir)\av_xfer.exe $(outdir)\flood.exe $(outdir)\cm_data.exe $(outdir)\cq_data.exe \ + $(outdir)\dgram.exe $(outdir)\msg.exe $(outdir)\msg_epoll.exe \ $(outdir)\inject_test.exe $(outdir)\msg_sockets.exe $(outdir)\multi_mr.exe \ - $(outdir)\multi_ep.exe $(outdir)\multi_recv.exe $(outdir)\poll.exe $(outdir)\rdm.exe \ + $(outdir)\multi_ep.exe $(outdir)\multi_recv.exe $(outdir)\rdm.exe \ $(outdir)\rdm_atomic.exe $(outdir)\rdm_multi_client.exe $(outdir)\rdm_rma_event.exe \ $(outdir)\rdm_rma_trigger.exe $(outdir)\rdm_shared_av.exe $(outdir)\rdm_tagged_peek.exe \ $(outdir)\recv_cancel.exe $(outdir)\scalable_ep.exe $(outdir)\shared_ctx.exe \ @@ -120,7 +120,7 @@ $(outdir)\rma_bw.exe: {benchmarks}rma_bw.c $(basedeps) {benchmarks}benchmark_sha $(outdir)\av_xfer.exe: {functional}av_xfer.c $(basedeps) -$(outdir)\bw.exe: {functional}bw.c $(basedeps) +$(outdir)\flood.exe: {functional}flood.c $(basedeps) $(outdir)\cm_data.exe: {functional}cm_data.c $(basedeps) @@ -128,8 +128,6 @@ 
$(outdir)\cq_data.exe: {functional}cq_data.c $(basedeps) $(outdir)\dgram.exe: {functional}dgram.c $(basedeps) -$(outdir)\dgram_waitset.exe: {functional}dgram_waitset.c $(basedeps) - $(outdir)\msg.exe: {functional}msg.c $(basedeps) $(outdir)\msg_epoll.exe: {functional}msg_epoll.c $(basedeps) @@ -144,8 +142,6 @@ $(outdir)\multi_ep.exe: {functional}multi_ep.c $(basedeps) $(outdir)\multi_recv.exe: {functional}multi_recv.c $(basedeps) -$(outdir)\poll.exe: {functional}poll.c $(basedeps) - $(outdir)\rdm.exe: {functional}rdm.c $(basedeps) $(outdir)\rdm_atomic.exe: {functional}rdm_atomic.c $(basedeps) diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index 935b7961cb8..7a4ee95ae89 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -70,6 +70,9 @@ void ft_parse_benchmark_opts(int op, char *optarg) case 'W': opts.window_size = atoi(optarg); break; + case 'r': + opts.options |= FT_OPT_NO_PRE_POSTED_RX; + break; default: break; } @@ -84,29 +87,63 @@ void ft_benchmark_usage(void) "* The following condition is required to have at least " "one window\nsize # of messsages to be sent: " "# of iterations > window size"); + FT_PRINT_OPTS_USAGE("-r", "Do not pre post RX buffers"); + FT_PRINT_OPTS_USAGE("", "Only the following tests support this option for now:"); + FT_PRINT_OPTS_USAGE("", "\tfi_rdm_tagged_pingpong"); + FT_PRINT_OPTS_USAGE("", "\tfi_rdm_pingpong"); } -int pingpong(void) +/* Pingpong latency test with pre-posted receive buffers. */ +static int pingpong_pre_posted_rx(size_t inject_size) { int ret, i; - size_t inject_size = fi->tx_attr->inject_size; - ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, - &inject_size, &(size_t){sizeof inject_size}); - if (ret && ret != -FI_ENOPROTOOPT) { - FT_PRINTERR("fi_getopt(FI_OPT_INJECT_MSG_SIZE)", ret); - return ret; - } + if (opts.dst_addr) { + for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { + if (i == opts.warmup_iterations) + ft_start(); - if (inject_size_set) - inject_size = opts.inject_size; + if (opts.transfer_size <= inject_size) + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); + else + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); + if (ret) + return ret; - if (opts.options & FT_OPT_ENABLE_HMEM) - inject_size = 0; + ret = ft_rx(ep, opts.transfer_size); + if (ret) + return ret; + } + } else { + for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { + if (i == opts.warmup_iterations) + ft_start(); - ret = ft_sync(); - if (ret) - return ret; + ret = ft_rx(ep, opts.transfer_size); + if (ret) + return ret; + + if (opts.transfer_size <= inject_size) + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); + else + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); + if (ret) + return ret; + } + } + ft_stop(); + + return FI_SUCCESS; +} + +/* Pingpong latency test without pre-posted receive buffers. 
*/ +static int pingpong_no_pre_posted_rx(size_t inject_size) +{ + int ret, i; if (opts.dst_addr) { for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { @@ -114,13 +151,19 @@ int pingpong(void) ft_start(); if (opts.transfer_size <= inject_size) - ret = ft_inject(ep, remote_fi_addr, opts.transfer_size); + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); else - ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx); + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); if (ret) return ret; - ret = ft_rx(ep, opts.transfer_size); + ret = ft_post_rx(ep, opts.transfer_size, &rx_ctx); + if (ret) + return ret; + + ret = ft_get_rx_comp(rx_seq); if (ret) return ret; } @@ -129,20 +172,83 @@ int pingpong(void) if (i == opts.warmup_iterations) ft_start(); - ret = ft_rx(ep, opts.transfer_size); + ret = ft_post_rx(ep, opts.transfer_size, &rx_ctx); if (ret) return ret; + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; + + if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) { + ret = ft_check_buf((char *) rx_buf + ft_rx_prefix_size(), + opts.transfer_size); + if (ret) + return ret; + } + if (opts.transfer_size <= inject_size) - ret = ft_inject(ep, remote_fi_addr, opts.transfer_size); + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); else - ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx); + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); if (ret) return ret; } } ft_stop(); + return FI_SUCCESS; +} + +int pingpong(void) +{ + int ret; + size_t inject_size = fi->tx_attr->inject_size; + + ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, + &inject_size, &(size_t){sizeof inject_size}); + if (ret && ret != -FI_ENOPROTOOPT) { + FT_PRINTERR("fi_getopt(FI_OPT_INJECT_MSG_SIZE)", ret); + return ret; + } + + if (inject_size_set) + inject_size = opts.inject_size; + + if (opts.options & FT_OPT_ENABLE_HMEM) + inject_size = 0; + + if (ft_check_opts(FT_OPT_NO_PRE_POSTED_RX)) { + if (ft_check_opts(FT_OPT_OOB_SYNC)) { + ret = ft_sync_oob(); + if (ret) + return ret; + } else { + /* Repost RX buffers to support inband sync. 
*/ + ret = ft_post_rx(ep, rx_size, &rx_ctx); + if (ret) + return ret; + + ret = ft_sync_inband(false); + if (ret) + return ret; + } + + ret = pingpong_no_pre_posted_rx(inject_size); + if (ret) + return ret; + } else { + ret = ft_sync(); + if (ret) + return ret; + + ret = pingpong_pre_posted_rx(inject_size); + if (ret) + return ret; + } + if (opts.machr) show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2, opts.argc, opts.argv); @@ -152,6 +258,34 @@ int pingpong(void) return 0; } +int run_pingpong(void) +{ + int i, ret = 0; + + ret = ft_init_fabric(); + if (ret) + return ret; + + if (!(opts.options & FT_OPT_SIZE)) { + for (i = 0; i < TEST_CNT; i++) { + if (!ft_use_size(i, opts.sizes_enabled)) + continue; + opts.transfer_size = test_size[i].size; + init_test(&opts, test_name, sizeof(test_name)); + ret = pingpong(); + if (ret) + return ret; + } + } else { + init_test(&opts, test_name, sizeof(test_name)); + ret = pingpong(); + if (ret) + return ret; + } + + return ft_finalize(); +} + int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) { int ret, i; @@ -291,7 +425,7 @@ static int rma_bw_rx_comp() return ft_tx(ep, remote_fi_addr, FT_RMA_SYNC_MSG_BYTES, &tx_ctx); } -static int set_fi_more_flag(int i, int j, int flags) +static uint64_t set_fi_more_flag(int i, int j, uint64_t flags) { if (j < opts.window_size - 1 && i >= opts.warmup_iterations && i < opts.iterations + opts.warmup_iterations - 1) { @@ -304,7 +438,8 @@ static int set_fi_more_flag(int i, int j, int flags) int bandwidth(void) { - int ret, i, j, flags = 0; + int ret, i, j; + uint64_t flags = 0; size_t inject_size = fi->tx_attr->inject_size; ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, @@ -377,10 +512,20 @@ int bandwidth(void) if (i == opts.warmup_iterations) ft_start(); - ret = ft_post_rx_buf(ep, opts.transfer_size, - &rx_ctx_arr[j].context, - rx_ctx_arr[j].buf, mr_desc, - ft_tag); + if (opts.use_fi_more) { + flags = set_fi_more_flag(i, j, flags); + ret = ft_recvmsg(ep, remote_fi_addr, + rx_ctx_arr[j].buf, + MAX(opts.transfer_size, + FT_MAX_CTRL_MSG) + + ft_rx_prefix_size(), + &rx_ctx_arr[j].context, flags); + } else { + ret = ft_post_rx_buf(ep, opts.transfer_size, + &rx_ctx_arr[j].context, + rx_ctx_arr[j].buf, mr_desc, + ft_tag); + } if (ret) return ret; @@ -435,7 +580,8 @@ static int bw_rma_comp(enum ft_rma_opcodes rma_op, int num_completions) int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) { - int ret, i, j, flags = 0; + int ret, i, j; + uint64_t flags = 0; size_t offset, inject_size = fi->tx_attr->inject_size; ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, diff --git a/fabtests/benchmarks/benchmark_shared.h b/fabtests/benchmarks/benchmark_shared.h index 1dcc7352fea..fbaf3eb3075 100644 --- a/fabtests/benchmarks/benchmark_shared.h +++ b/fabtests/benchmarks/benchmark_shared.h @@ -40,12 +40,13 @@ extern "C" { #include -#define BENCHMARK_OPTS "vkj:W:" +#define BENCHMARK_OPTS "rvkj:W:" #define FT_BENCHMARK_MAX_MSG_SIZE (test_size[TEST_CNT - 1].size) void ft_parse_benchmark_opts(int op, char *optarg); void ft_benchmark_usage(void); int pingpong(void); +int run_pingpong(void); int bandwidth(void); int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote); int bandwidth_rma(enum ft_rma_opcodes op, struct fi_rma_iov *remote); diff --git a/fabtests/benchmarks/dgram_pingpong.c b/fabtests/benchmarks/dgram_pingpong.c index cab09f206bf..63774e31aa7 100644 --- a/fabtests/benchmarks/dgram_pingpong.c +++ 
b/fabtests/benchmarks/dgram_pingpong.c @@ -127,9 +127,8 @@ int main(int argc, char **argv) if (opts.options & FT_OPT_SIZE) hints->ep_attr->max_msg_size = opts.transfer_size; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/msg_bw.c b/fabtests/benchmarks/msg_bw.c index 751fd324bc6..9d613a0aa11 100644 --- a/fabtests/benchmarks/msg_bw.c +++ b/fabtests/benchmarks/msg_bw.c @@ -116,7 +116,6 @@ int main(int argc, char **argv) } hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->addr_format = opts.address_format; hints->tx_attr->tclass = FI_TC_BULK_DATA; diff --git a/fabtests/benchmarks/rdm_bw.c b/fabtests/benchmarks/rdm_bw.c index b355f21f5f1..6b445db6fc0 100644 --- a/fabtests/benchmarks/rdm_bw.c +++ b/fabtests/benchmarks/rdm_bw.c @@ -80,9 +80,8 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_BULK_DATA; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_cntr_pingpong.c b/fabtests/benchmarks/rdm_cntr_pingpong.c index 76eebaf515d..6793f10e806 100644 --- a/fabtests/benchmarks/rdm_cntr_pingpong.c +++ b/fabtests/benchmarks/rdm_cntr_pingpong.c @@ -106,7 +106,7 @@ int main(int argc, char **argv) hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; ret = run(); diff --git a/fabtests/benchmarks/rdm_pingpong.c b/fabtests/benchmarks/rdm_pingpong.c index d521fc2eee0..f4d4169672d 100644 --- a/fabtests/benchmarks/rdm_pingpong.c +++ b/fabtests/benchmarks/rdm_pingpong.c @@ -36,34 +36,6 @@ #include "shared.h" #include "benchmark_shared.h" -static int run(void) -{ - int i, ret = 0; - - ret = ft_init_fabric(); - if (ret) - return ret; - - if (!(opts.options & FT_OPT_SIZE)) { - for (i = 0; i < TEST_CNT; i++) { - if (!ft_use_size(i, opts.sizes_enabled)) - continue; - opts.transfer_size = test_size[i].size; - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - return ret; - } - } else { - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - return ret; - } - - return ft_finalize(); -} - int main(int argc, char **argv) { int op, ret; @@ -101,13 +73,12 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; - ret = run(); + ret = run_pingpong(); ft_free_res(); return ft_exit_code(ret); diff --git a/fabtests/benchmarks/rdm_tagged_bw.c b/fabtests/benchmarks/rdm_tagged_bw.c index 239f0fef866..e40a9be025b 100644 --- a/fabtests/benchmarks/rdm_tagged_bw.c +++ b/fabtests/benchmarks/rdm_tagged_bw.c @@ -105,9 +105,8 @@ int main(int argc, char **argv) 
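Note: the hunks above and the ones that follow all make the same change to the benchmark hints: FI_CONTEXT alone becomes FI_CONTEXT | FI_CONTEXT2, and common/shared.c later in this patch switches its tx_ctx/rx_ctx globals to struct fi_context2 to match. A hedged sketch of why the two go together (struct sizes per the definitions in rdma/fabric.h; the helper name is illustrative):

	#include <rdma/fabric.h>

	/* fi_context reserves 4 pointers of provider scratch space per
	 * operation; fi_context2 reserves 8. Storage sized for fi_context2
	 * satisfies a provider that requires either mode bit, so a test can
	 * advertise both bits and keep a single context type throughout. */
	static struct fi_context2 op_ctx;

	static void request_context_modes(struct fi_info *hints)
	{
		hints->mode |= FI_CONTEXT | FI_CONTEXT2;
	}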
hints->ep_attr->type = FI_EP_RDM; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->caps = FI_TAGGED; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_BULK_DATA; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_tagged_pingpong.c b/fabtests/benchmarks/rdm_tagged_pingpong.c index a54725e5186..70c4ac6dec2 100644 --- a/fabtests/benchmarks/rdm_tagged_pingpong.c +++ b/fabtests/benchmarks/rdm_tagged_pingpong.c @@ -36,36 +36,6 @@ #include #include "benchmark_shared.h" -static int run(void) -{ - int i, ret = 0; - - ret = ft_init_fabric(); - if (ret) - return ret; - - if (!(opts.options & FT_OPT_SIZE)) { - for (i = 0; i < TEST_CNT; i++) { - if (!ft_use_size(i, opts.sizes_enabled)) - continue; - opts.transfer_size = test_size[i].size; - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - goto out; - } - } else { - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - goto out; - } - - ft_finalize(); -out: - return ret; -} - int main(int argc, char **argv) { int op, ret; @@ -103,13 +73,12 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_TAGGED; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; - ret = run(); + ret = run_pingpong(); ft_free_res(); return ft_exit_code(ret); diff --git a/fabtests/benchmarks/rma_bw.c b/fabtests/benchmarks/rma_bw.c index 8247dd79a0c..7d0764f5267 100644 --- a/fabtests/benchmarks/rma_bw.c +++ b/fabtests/benchmarks/rma_bw.c @@ -93,7 +93,7 @@ int main(int argc, char **argv) hints->caps = FI_MSG | FI_RMA; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rma_pingpong.c b/fabtests/benchmarks/rma_pingpong.c index be4cd71302c..07e9564e9b8 100644 --- a/fabtests/benchmarks/rma_pingpong.c +++ b/fabtests/benchmarks/rma_pingpong.c @@ -93,8 +93,7 @@ int main(int argc, char **argv) hints->caps = FI_MSG | FI_RMA | FI_WRITE | FI_REMOTE_WRITE; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; - hints->mode = FI_CONTEXT; - hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->addr_format = opts.address_format; while ((op = getopt_long(argc, argv, "Uh" CS_OPTS INFO_OPTS API_OPTS diff --git a/fabtests/common/check_hmem.c b/fabtests/common/check_hmem.c index b7a97bc8e79..5e319473c4e 100644 --- a/fabtests/common/check_hmem.c +++ b/fabtests/common/check_hmem.c @@ -44,7 +44,7 @@ int main(int argc, char** argv) return EXIT_FAILURE; hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; while ((op = getopt(argc, argv, "p:h")) != -1) { switch (op) { case 'p': diff --git a/fabtests/common/hmem.c b/fabtests/common/hmem.c index 1c724510c4e..0cd736441b4 100644 --- a/fabtests/common/hmem.c +++ b/fabtests/common/hmem.c @@ -185,6 +185,10 @@ int ft_hmem_free_host(enum fi_hmem_iface iface, void *buf) return hmem_ops[iface].free_host(buf); } +/* + * Matches the behavior of memset where 
value is an int but + * used as an unsigned char */ int ft_hmem_memset(enum fi_hmem_iface iface, uint64_t device, void *buf, int value, size_t size) { diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c index 2f02b6f474c..e4aef962fb6 100644 --- a/fabtests/common/hmem_cuda.c +++ b/fabtests/common/hmem_cuda.c @@ -157,9 +157,9 @@ int ft_cuda_init(void) goto err; } - cuda_handle = dlopen("libcuda.so", RTLD_NOW); + cuda_handle = dlopen("libcuda.so.1", RTLD_NOW); if (!cuda_handle) { - FT_ERR("Failed to dlopen libcuda.so\n"); + FT_ERR("Failed to dlopen libcuda.so.1\n"); goto err_dlclose_cudart; } diff --git a/fabtests/common/hmem_synapseai.c b/fabtests/common/hmem_synapseai.c index 62a34b7358b..c6858638784 100644 --- a/fabtests/common/hmem_synapseai.c +++ b/fabtests/common/hmem_synapseai.c @@ -41,31 +41,6 @@ #include "habanalabs/synapse_api.h" #include "habanalabs/hlthunk.h" -#define SCAL_SUCCESS 0 - -#define DECLARE_HANDLE(name) struct name##__ { int unused; }; \ - typedef struct name##__ *name - -DECLARE_HANDLE(scal_handle_t); -DECLARE_HANDLE(scal_pool_handle_t); - -typedef struct _scal_memory_pool_infoV2 -{ - scal_handle_t scal; - const char * name; - unsigned idx; - uint64_t device_base_address; - void *host_base_address; - uint32_t core_base_address; // 0 when the pool is not mapped to the cores - uint64_t totalSize; - uint64_t freeSize; - uint64_t device_base_allocated_address; -} scal_memory_pool_infoV2; - -int scal_get_handle_from_fd(int fd, scal_handle_t* scal); -int scal_get_pool_handle_by_name(const scal_handle_t scal, const char *pool_name, scal_pool_handle_t *pool); -int scal_pool_get_infoV2(const scal_pool_handle_t pool, scal_memory_pool_infoV2 *info); - #define ACCEL_PAGE_SIZE 4096 struct synapseai_ops { synStatus (*synInitialize)(void); @@ -88,20 +63,14 @@ struct synapseai_ops { synStatus (*synDeviceGetInfoV2)(const synDeviceId deviceId, synDeviceInfoV2 *pDeviceInfo); int (*hlthunk_device_mapped_memory_export_dmabuf_fd)(int fd, uint64_t addr, uint64_t size, uint64_t offset, uint32_t flags); - int (*scal_pool_get_infoV2)(const scal_pool_handle_t pool, scal_memory_pool_infoV2 *info); - int (*scal_get_pool_handle_by_name)(const scal_handle_t scal, const char *pool_name, - scal_pool_handle_t *pool); - int (*scal_get_handle_from_fd)(int fd, scal_handle_t *scal); }; static void *synapseai_handle; static void *hlthunk_handle; -static void *scal_handle; static struct synapseai_ops synapseai_ops; static synDeviceId synapseai_fd = -1; static synStreamHandle synapseai_stream_handle; static synDeviceInfoV2 deviceInfo; -static uint64_t device_fd; static void cleanup_synapseai_ops(void) { @@ -114,11 +83,6 @@ static void cleanup_synapseai_ops(void) dlclose(hlthunk_handle); hlthunk_handle = NULL; } - - if (scal_handle) { - dlclose(scal_handle); - scal_handle = NULL; - } } int init_synapseai_ops(void) @@ -215,31 +179,6 @@ int init_synapseai_ops(void) goto err_dlclose; } - scal_handle = dlopen("libscal.so", RTLD_NOW); - if (!scal_handle) { - FT_ERR("Falid to dlopen libscal.so\n"); - goto err_dlclose; - } - - synapseai_ops.scal_pool_get_infoV2 = dlsym(scal_handle, "scal_pool_get_infoV2"); - if (!synapseai_ops.scal_pool_get_infoV2) { - FT_ERR("Failed to find scal_pool_get_infoV2\n"); - goto err_dlclose; - } - - synapseai_ops.scal_get_pool_handle_by_name = - dlsym(scal_handle, "scal_get_pool_handle_by_name"); - if (!synapseai_ops.scal_get_pool_handle_by_name) { - FT_ERR("Failed to find scal_get_pool_handle_by_name\n"); - goto err_dlclose; - } - - 
synapseai_ops.scal_get_handle_from_fd = dlsym(scal_handle, "scal_get_handle_from_fd"); - if (!synapseai_ops.scal_get_handle_from_fd) { - FT_ERR("Failed to find scal_get_handle_from_fd\n"); - goto err_dlclose; - } - return FI_SUCCESS; err_dlclose: @@ -284,7 +223,6 @@ int ft_synapseai_init(void) FT_ERR("Failed to synDeviceGetInfoV2()\n"); goto err; } - device_fd = deviceInfo.fd; if (synapseai_ops.synStreamCreateGeneric(&synapseai_stream_handle, synapseai_fd, 0) != synSuccess) { @@ -383,29 +321,12 @@ int ft_synapseai_copy_from_hmem(uint64_t device, void *dst, const void *src, siz int ft_synapseai_get_dmabuf_fd(void *buf, size_t len, int *dmabuf_fd, uint64_t *dmabuf_offset) { - scal_pool_handle_t mpHandle; - scal_memory_pool_infoV2 mpInfo; - scal_handle_t a = 0; - - if (synapseai_ops.scal_get_handle_from_fd(device_fd, &a) != SCAL_SUCCESS) { - return -FI_ENOBUFS; - } - - if (synapseai_ops.scal_get_pool_handle_by_name(a, "global_hbm", &mpHandle) != SCAL_SUCCESS) { - return -FI_ENOBUFS; - } - - if (synapseai_ops.scal_pool_get_infoV2(mpHandle, &mpInfo) != SCAL_SUCCESS) { - return -FI_ENOBUFS; - } - uint64_t baseAddress = mpInfo.device_base_allocated_address; - size_t buf_size = (len + ACCEL_PAGE_SIZE - 1) & ~(ACCEL_PAGE_SIZE - 1); *dmabuf_fd = - synapseai_ops.hlthunk_device_mapped_memory_export_dmabuf_fd(device_fd, - baseAddress, + synapseai_ops.hlthunk_device_mapped_memory_export_dmabuf_fd(deviceInfo.fd, + deviceInfo.globalHbmBaseAddress, buf_size, - (uint64_t)buf - baseAddress, + (uint64_t)buf - deviceInfo.globalHbmBaseAddress, (O_RDWR | O_CLOEXEC)); if (*dmabuf_fd < 0) { diff --git a/fabtests/common/hmem_ze.c b/fabtests/common/hmem_ze.c index 507470f06d1..305c58ff57d 100644 --- a/fabtests/common/hmem_ze.c +++ b/fabtests/common/hmem_ze.c @@ -382,6 +382,7 @@ int ft_ze_free(void *buf) int ft_ze_memset(uint64_t device, void *buf, int value, size_t size) { + unsigned char set_value = (unsigned char) value; ze_result_t ze_ret; ze_ret = (*libze_ops.zeCommandListReset)(cmd_list); @@ -389,7 +390,8 @@ int ft_ze_memset(uint64_t device, void *buf, int value, size_t size) return -FI_EINVAL; ze_ret = (*libze_ops.zeCommandListAppendMemoryFill)( - cmd_list, buf, &value, sizeof(value), + cmd_list, buf, &set_value, + sizeof(set_value), size, NULL, 0, NULL); if (ze_ret) return -FI_EINVAL; diff --git a/fabtests/ubertest/ofi_atomic.c b/fabtests/common/ofi_atomic.c similarity index 98% rename from fabtests/ubertest/ofi_atomic.c rename to fabtests/common/ofi_atomic.c index 311747175d5..8483284f8ff 100644 --- a/fabtests/ubertest/ofi_atomic.c +++ b/fabtests/common/ofi_atomic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Intel Corporation. All rights reserved. + * Copyright (c) Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -141,7 +141,7 @@ ofi_complex_##type *r = (res); \ OFI_UNUSED(src); \ for (i = 0; i < cnt; i++) \ - r[i] = d[i]; \ + ofi_complex_set_##type (&r[i], d[i]); \ } /* @@ -173,7 +173,7 @@ const ofi_complex_##type *s = (src); \ ofi_complex_##type *r = (res); \ for (i = 0; i < cnt; i++) { \ - r[i] = d[i]; \ + ofi_complex_set_##type (&r[i], d[i]); \ op(type, d[i], s[i]); \ } \ } @@ -211,7 +211,7 @@ const ofi_complex_##type *c = (cmp); \ ofi_complex_##type *r = (res); \ for (i = 0; i < cnt; i++) { \ - r[i] = d[i]; \ + ofi_complex_set_##type (&r[i], d[i]); \ op(type, d[i], s[i], c[i]); \ } \ } diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index e4d45c46e42..dc99eb574a0 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -73,7 +73,7 @@ struct fid_eq *eq; struct fid_mc *mc; struct fid_mr no_mr; -struct fi_context tx_ctx, rx_ctx; +struct fi_context2 tx_ctx, rx_ctx; struct ft_context *tx_ctx_arr = NULL, *rx_ctx_arr = NULL; uint64_t remote_cq_data = 0; @@ -90,7 +90,7 @@ char *buf = NULL, *tx_buf, *rx_buf; * dev_host_buf are used by ft_fill_buf() to stage data sent over wire, * when tx_buf is on device memory. */ -void *dev_host_buf = NULL; +void *dev_host_buf = NULL, *dev_host_comp = NULL, *dev_host_res = NULL; char **tx_mr_bufs = NULL, **rx_mr_bufs = NULL; size_t buf_size, tx_buf_size, rx_buf_size; @@ -161,10 +161,6 @@ struct test_size_param *test_size = def_test_sizes; /* range of messages is dynamically allocated */ struct test_size_param *user_test_sizes; -static const char integ_alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; -static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_alphabet)) - 1; - - int ft_poll_fd(int fd, int timeout) { struct pollfd fds; @@ -276,16 +272,10 @@ static inline int ft_rma_write_target_allowed(uint64_t caps) return 0; } -static inline int ft_check_mr_local_flag(struct fi_info *info) -{ - return ((info->mode & FI_LOCAL_MR) || - (info->domain_attr->mr_mode & FI_MR_LOCAL)); -} - uint64_t ft_info_to_mr_access(struct fi_info *info) { uint64_t mr_access = 0; - if (ft_check_mr_local_flag(info)) { + if (info->domain_attr->mr_mode & FI_MR_LOCAL) { if (info->caps & (FI_MSG | FI_TAGGED)) { if (info->caps & FT_MSG_MR_ACCESS) { mr_access |= info->caps & FT_MSG_MR_ACCESS; @@ -556,14 +546,30 @@ static void ft_set_tx_rx_sizes(size_t *set_tx, size_t *set_rx) *set_tx += ft_tx_prefix_size(); } -void ft_free_host_tx_buf(void) +void ft_free_host_bufs(void) { int ret; - ret = ft_hmem_free_host(opts.iface, dev_host_buf); - if (ret) - FT_PRINTERR("ft_hmem_free_host", ret); - dev_host_buf = NULL; + if (dev_host_buf) { + ret = ft_hmem_free_host(opts.iface, dev_host_buf); + if (ret) + FT_PRINTERR("ft_hmem_free_host", ret); + dev_host_buf = NULL; + } + + if (dev_host_res) { + ret = ft_hmem_free_host(opts.iface, dev_host_res); + if (ret) + FT_PRINTERR("ft_hmem_free_host", ret); + dev_host_res = NULL; + } + + if (dev_host_comp) { + ret = ft_hmem_free_host(opts.iface, dev_host_comp); + if (ret) + FT_PRINTERR("ft_hmem_free_host", ret); + dev_host_comp = NULL; + } } /* @@ -651,6 +657,18 @@ int ft_alloc_msgs(void) max_msg_size * opts.window_size); if (ret) return ret; + + if (fi->caps & FI_ATOMIC) { + ret = ft_hmem_alloc_host(opts.iface, &dev_host_comp, + buf_size); + if (ret) + return ret; + + ret = ft_hmem_alloc_host(opts.iface, &dev_host_res, + buf_size); + if (ret) + return ret; + } } ret = ft_hmem_memset(opts.iface, opts.device, (void *) buf, 0, 
buf_size); @@ -1046,6 +1064,8 @@ int ft_getinfo(struct fi_info *hints, struct fi_info **info) hints->domain_attr->mr_mode |= FI_MR_HMEM; } + hints->domain_attr->threading = opts.threading; + ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info); if (ret) { FT_PRINTERR("fi_getinfo", ret); @@ -1071,6 +1091,13 @@ int ft_init_fabric_cm(void) ret = opts.dst_addr ? ft_client_connect() : ft_server_connect(); + if (ft_check_opts(FT_OPT_NO_PRE_POSTED_RX) && + !ft_check_opts(FT_OPT_SKIP_MSG_ALLOC) && + (fi->caps & (FI_MSG | FI_TAGGED))) { + ret = ft_sync_inband(false); + if (ret) + return ret; + } return ret; } @@ -1344,6 +1371,14 @@ int ft_init_fabric(void) if (ft_check_opts(FT_OPT_FORK_CHILD)) ft_fork_child(); + if (ft_check_opts(FT_OPT_NO_PRE_POSTED_RX) && + !ft_check_opts(FT_OPT_SKIP_MSG_ALLOC) && + (fi->caps & (FI_MSG | FI_TAGGED))) { + ret = ft_sync_inband(false); + if (ret) + return ret; + } + return 0; } @@ -1428,7 +1463,8 @@ int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av * } if (opts.max_msg_size) { - ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, &opts.max_msg_size, sizeof opts.max_msg_size); + ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, + &opts.max_msg_size, sizeof opts.max_msg_size); if (ret && ret != -FI_EOPNOTSUPP) { FT_PRINTERR("fi_setopt(FI_OPT_MAX_MSG_SIZE)", ret); return ret; @@ -1450,6 +1486,15 @@ int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av * } } + if (opts.min_multi_recv_size) { + ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, + &opts.min_multi_recv_size, sizeof opts.min_multi_recv_size); + if (ret && ret != -FI_EOPNOTSUPP) { + FT_PRINTERR("fi_setopt(FI_OPT_MIN_MULTI_RECV_SIZE)", ret); + return ret; + } + } + ret = fi_enable(bind_ep); if (ret) { FT_PRINTERR("fi_enable", ret); @@ -1712,43 +1757,66 @@ int ft_init_av_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr, return 0; } -int ft_exchange_keys(struct fi_rma_iov *peer_iov) +int ft_fill_rma_info(struct fid_mr *mr, void *mr_buf, + struct fi_rma_iov *rma_iov, size_t *key_size, + size_t *rma_iov_len) { - char temp[FT_MAX_CTRL_MSG]; - struct fi_rma_iov *rma_iov = (struct fi_rma_iov *) temp; - size_t key_size = 0, len; uint64_t addr; + size_t buf_len = *rma_iov_len; int ret; if (fi->domain_attr->mr_mode & FI_MR_RAW) { - ret = fi_mr_raw_attr(mr, &addr, NULL, &key_size, 0); + *key_size = 0; + ret = fi_mr_raw_attr(mr, &addr, NULL, key_size, 0); if (ret != -FI_ETOOSMALL) return ret; - len = sizeof(*rma_iov) + key_size - sizeof(rma_iov->key); - if (len > FT_MAX_CTRL_MSG) { + *rma_iov_len = sizeof(*rma_iov) + *key_size - sizeof(rma_iov->key); + if (*rma_iov_len > buf_len) { FT_PRINTERR("Raw key too large for ctrl message", -FI_ETOOSMALL); return -FI_ETOOSMALL; } - } else { - len = sizeof(*rma_iov); - } - - if ((fi->domain_attr->mr_mode == FI_MR_BASIC) || - (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) { - rma_iov->addr = (uintptr_t) rx_buf + ft_rx_prefix_size(); - } else { - rma_iov->addr = 0; - } - if (fi->domain_attr->mr_mode & FI_MR_RAW) { ret = fi_mr_raw_attr(mr, &addr, (uint8_t *) &rma_iov->key, - &key_size, 0); + key_size, 0); if (ret) return ret; } else { rma_iov->key = fi_mr_key(mr); + *key_size = sizeof(rma_iov->key); + *rma_iov_len = sizeof(*rma_iov); + } + + rma_iov->addr = fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR ? 
+ (uintptr_t) mr_buf : 0; + + return FI_SUCCESS; +} + +int ft_get_rma_info(struct fi_rma_iov *rma_iov, + struct fi_rma_iov *peer_iov, size_t key_size) +{ + if (fi->domain_attr->mr_mode & FI_MR_RAW) { + peer_iov->addr = rma_iov->addr; + peer_iov->len = rma_iov->len; + return fi_mr_map_raw(domain, rma_iov->addr, + (uint8_t *) &rma_iov->key, key_size, + &peer_iov->key, 0); } + *peer_iov = *rma_iov; + return FI_SUCCESS; +} + +int ft_exchange_keys(struct fi_rma_iov *peer_iov) +{ + char temp[FT_MAX_CTRL_MSG]; + struct fi_rma_iov *rma_iov = (struct fi_rma_iov *) temp; + size_t key_size, len = FT_MAX_CTRL_MSG; + int ret; + + ret = ft_fill_rma_info(mr, rx_buf, rma_iov, &key_size, &len); + if (ret) + return ret; ret = ft_hmem_copy_to(opts.iface, opts.device, tx_buf + ft_tx_prefix_size(), temp, len); @@ -1768,17 +1836,9 @@ int ft_exchange_keys(struct fi_rma_iov *peer_iov) if (ret) return ret; - if (fi->domain_attr->mr_mode & FI_MR_RAW) { - peer_iov->addr = rma_iov->addr; - peer_iov->len = rma_iov->len; - ret = fi_mr_map_raw(domain, rma_iov->addr, - (uint8_t *) &rma_iov->key, key_size, - &peer_iov->key, 0); - if (ret) - return ret; - } else { - *peer_iov = *rma_iov; - } + ret = ft_get_rma_info(rma_iov, peer_iov, key_size); + if (ret) + return ret; ret = ft_post_rx(ep, rx_size, &rx_ctx); if (ret) @@ -1891,8 +1951,7 @@ void ft_free_res(void) buf = rx_buf = tx_buf = NULL; buf_size = rx_size = tx_size = tx_mr_size = rx_mr_size = 0; } - if (dev_host_buf) - ft_free_host_tx_buf(); + ft_free_host_bufs(); if (fi_pep) { fi_freeinfo(fi_pep); @@ -2422,18 +2481,18 @@ ssize_t ft_post_atomic(enum ft_atomic_opcodes opcode, struct fid_ep *ep, switch (opcode) { case FT_ATOMIC_BASE: FT_POST(fi_atomic, ft_progress, txcq, tx_seq, &tx_cq_cntr, - "fi_atomic", ep, buf, count, mr_desc, remote_fi_addr, + "fi_atomic", ep, tx_buf, count, mr_desc, remote_fi_addr, remote->addr, remote->key, datatype, atomic_op, context); break; case FT_ATOMIC_FETCH: FT_POST(fi_fetch_atomic, ft_progress, txcq, tx_seq, &tx_cq_cntr, - "fi_fetch_atomic", ep, buf, count, mr_desc, result, + "fi_fetch_atomic", ep, tx_buf, count, mr_desc, result, result_desc, remote_fi_addr, remote->addr, remote->key, datatype, atomic_op, context); break; case FT_ATOMIC_COMPARE: FT_POST(fi_compare_atomic, ft_progress, txcq, tx_seq, - &tx_cq_cntr, "fi_compare_atomic", ep, buf, count, + &tx_cq_cntr, "fi_compare_atomic", ep, tx_buf, count, mr_desc, compare, compare_desc, result, result_desc, remote_fi_addr, remote->addr, remote->key, datatype, atomic_op, context); @@ -2864,7 +2923,7 @@ int ft_tx_msg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, void } int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, - void *buf, size_t size, void *ctx, int flags) + void *buf, size_t size, void *ctx, uint64_t flags) { struct fi_msg msg; struct fi_msg_tagged tagged_msg; @@ -2903,14 +2962,14 @@ int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, } -int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, - size_t size, void *ctx, int flags) +int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, + size_t size, void *ctx, uint64_t flags) { struct fi_msg msg; struct fi_msg_tagged tagged_msg; struct iovec msg_iov; - msg_iov.iov_base = rx_buf; + msg_iov.iov_base = (char *) buf; msg_iov.iov_len = size; if (hints->caps & FI_TAGGED) { @@ -2920,7 +2979,7 @@ int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, tagged_msg.addr = fi_addr; tagged_msg.data = NO_CQ_DATA; tagged_msg.context = ctx; - tagged_msg.tag = ft_tag ? ft_tag : tx_seq; + tagged_msg.tag = ft_tag ? 
ft_tag : rx_seq; tagged_msg.ignore = 0; FT_POST(fi_trecvmsg, ft_progress, rxcq, rx_seq, @@ -3001,49 +3060,74 @@ void eq_readerr(struct fid_eq *eq, const char *eq_str) } } -int ft_sync() +int ft_sync_oob(void) { char buf = 'a'; int ret; if (opts.dst_addr) { - if (!(opts.options & FT_OPT_OOB_SYNC)) { - ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, - FI_DELIVERY_COMPLETE); - if (ret) - return ret; + ret = ft_sock_send(oob_sock, &buf, 1); + if (ret) + return ret; - ret = ft_rx(ep, 1); - } else { - ret = ft_sock_send(oob_sock, &buf, 1); - if (ret) - return ret; + ret = ft_sock_recv(oob_sock, &buf, 1); + if (ret) + return ret; + } else { + ret = ft_sock_recv(oob_sock, &buf, 1); + if (ret) + return ret; - ret = ft_sock_recv(oob_sock, &buf, 1); - if (ret) - return ret; - } + ret = ft_sock_send(oob_sock, &buf, 1); + if (ret) + return ret; + } + + return FI_SUCCESS; +} + +int ft_sync_inband(bool repost_rx) +{ + int ret; + + if (opts.dst_addr) { + ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 0, &tx_ctx, + FI_DELIVERY_COMPLETE); + if (ret) + return ret; + + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; } else { - if (!(opts.options & FT_OPT_OOB_SYNC)) { - ret = ft_rx(ep, 1); - if (ret) - return ret; + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; - ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, - FI_DELIVERY_COMPLETE); - if (ret) - return ret; - } else { - ret = ft_sock_recv(oob_sock, &buf, 1); - if (ret) - return ret; + ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 0, &tx_ctx, + FI_DELIVERY_COMPLETE); + if (ret) + return ret; + } - ret = ft_sock_send(oob_sock, &buf, 1); - if (ret) - return ret; - } + if (repost_rx) { + ret = ft_post_rx(ep, rx_size, &rx_ctx); + if (ret) + return ret; } + return FI_SUCCESS; +} + +int ft_sync() +{ + int ret; + + if (ft_check_opts(FT_OPT_OOB_SYNC)) + ret = ft_sync_oob(); + else + ret = ft_sync_inband(true); + return ret; } @@ -3147,7 +3231,7 @@ int ft_wait_child(void) int ft_finalize_ep(struct fid_ep *ep) { int ret; - struct fi_context ctx; + struct fi_context2 ctx; ret = ft_sendmsg(ep, remote_fi_addr, tx_buf, 4, &ctx, FI_TRANSMIT_COMPLETE); if (ret) @@ -3687,6 +3771,189 @@ int ft_fill_buf(void *buf, size_t size) return ret; } +int ft_fill_atomic(void *buf, size_t count, enum fi_datatype datatype) +{ + void *fill_buf; + int ret = 0; + + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_buf); + fill_buf = dev_host_buf; + } else { + fill_buf = buf; + } + + switch (datatype) { + case FI_INT8: + case FI_UINT8: + case FI_INT16: + case FI_UINT16: + case FI_INT32: + case FI_UINT32: + case FI_INT64: + case FI_UINT64: + case FI_INT128: + case FI_UINT128: + case FI_FLOAT: + case FI_DOUBLE: + case FI_LONG_DOUBLE: + SWITCH_REAL_TYPES(datatype, FT_FILL, fill_buf, count); + break; + case FI_FLOAT_COMPLEX: + case FI_DOUBLE_COMPLEX: + case FI_LONG_DOUBLE_COMPLEX: + SWITCH_COMPLEX_TYPES(datatype, FT_FILL_COMPLEX, fill_buf, count); + break; + default: + return -FI_EOPNOTSUPP; + } + + if (opts.iface != FI_HMEM_SYSTEM) { + ret = ft_hmem_copy_to(opts.iface, opts.device, buf, fill_buf, + count * datatype_to_size(datatype)); + if (ret) + FT_ERR("Failed to fill atomic buffer\n"); + } + return ret; +} + +static int ft_check_atomic_compare(void *buf, void *cmp, + enum fi_datatype datatype, size_t count) +{ + switch (datatype) { + case FI_INT8: + case FI_UINT8: + case FI_INT16: + case FI_UINT16: + case FI_INT32: + case FI_UINT32: + case FI_INT64: + case FI_UINT64: + case FI_INT128: + case FI_UINT128: + case FI_FLOAT: + case FI_DOUBLE: + case FI_LONG_DOUBLE: + 
SWITCH_REAL_TYPES(datatype, FT_CHECK, buf, cmp, count); + break; + case FI_FLOAT_COMPLEX: + case FI_DOUBLE_COMPLEX: + case FI_LONG_DOUBLE_COMPLEX: + SWITCH_COMPLEX_TYPES(datatype, FT_CHECK_COMPLEX, buf, cmp, + count); + break; + default: + return -FI_EOPNOTSUPP; + } + return 0; +} + +int ft_check_atomic(enum ft_atomic_opcodes atomic, enum fi_op op, + enum fi_datatype type, void *src, void *dst_cpy, void *dst, + void *cmp, void *res, size_t count) +{ + int ret = 0; + void *check_res = res, *check_buf, *check_comp = cmp; + + /* + * If we don't have the test function, return > 0 to indicate + * verification is unsupported. + */ + if (atomic == FT_ATOMIC_COMPARE) { + if (!ofi_atomic_swap_handler(op, type)) + return 1; + } else if (atomic == FT_ATOMIC_FETCH) { + if (!ofi_atomic_readwrite_handler(op, type)) + return 1; + } else { + if (!ofi_atomic_write_handler(op, type)) + return 1; + } + + if (atomic == FT_ATOMIC_COMPARE || atomic == FT_ATOMIC_FETCH) { + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_res); + ret = ft_hmem_copy_from(opts.iface, opts.device, + dev_host_res, res, + count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + } + + check_res = dev_host_res; + } else { + check_res = res; + } + if (ft_check_atomic_compare(dst_cpy, check_res, type, count)) { + printf("Data check error on atomic fetch buffer\n"); + return -1; + } + } + + if (atomic == FT_ATOMIC_COMPARE) { + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_comp); + ret = ft_hmem_copy_from(opts.iface, opts.device, + dev_host_comp, cmp, + count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + } + check_comp = dev_host_comp; + } else { + check_comp = cmp; + } + } + + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_buf); + ret = ft_hmem_copy_from(opts.iface, opts.device, dev_host_buf, + src, count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + } + + check_buf = dev_host_buf; + + } else { + check_buf = src; + } + + if (atomic == FT_ATOMIC_COMPARE) { + ofi_atomic_swap_op(op, type, dst_cpy, check_buf, check_comp, + check_res, count); + } else if (atomic == FT_ATOMIC_FETCH) { + ofi_atomic_readwrite_op(op, type, dst_cpy, check_buf, + check_res, count); + } else { + ofi_atomic_write_op(op, type, dst_cpy, check_buf, count); + } + + if (opts.iface != FI_HMEM_SYSTEM) { + ret = ft_hmem_copy_from(opts.iface, opts.device, + dev_host_buf, dst, + count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + } + + check_buf = dev_host_buf; + } else { + check_buf = dst; + } + + if (ft_check_atomic_compare(dst_cpy, check_buf, type, count)) { + printf("Data check error on atomic target buffer\n"); + return -1; + } + + return FI_SUCCESS; +} + int ft_check_buf(void *buf, size_t size) { char *recv_data; @@ -4231,6 +4498,8 @@ void ft_longopts_usage() "maximum untagged message size"); FT_PRINT_OPTS_USAGE("--use-fi-more", "Run tests with FI_MORE"); + FT_PRINT_OPTS_USAGE("--threading", + "threading model: safe|completion|domain (default:domain)"); } int debug_assert; @@ -4244,6 +4513,7 @@ struct option long_opts[] = { {"control-progress", required_argument, NULL, LONG_OPT_CONTROL_PROGRESS}, {"max-msg-size", required_argument, NULL, LONG_OPT_MAX_MSG_SIZE}, {"use-fi-more", no_argument, NULL, LONG_OPT_USE_FI_MORE}, + {"threading", required_argument, NULL, LONG_OPT_THREADING}, {NULL, 0, NULL, 0}, }; @@ -4261,6 +4531,20 @@ 
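Note: ft_check_atomic() above encodes three outcomes in its return value: FI_SUCCESS when the locally emulated result matches, a negative value on a data mismatch or a device-buffer copy failure, and a positive value when no emulation handler exists for the op/datatype pair. A caller sketch under those assumptions (the variable names are illustrative, not from this patch):

	ret = ft_check_atomic(FT_ATOMIC_BASE, op, datatype, tx_buf, dst_copy,
			      rx_buf, NULL, NULL, count);
	if (ret > 0)
		printf("skipping verification: op/datatype not emulated locally\n");
	else if (ret)
		return ret;	/* data mismatch or copy error */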
int ft_parse_progress_model_string(char* progress_str) return ret; } +static int ft_parse_threading_string(char* threading_str) +{ + int ret = -1; + + if (!strcasecmp("safe", threading_str)) + ret = FI_THREAD_SAFE; + else if (!strcasecmp("completion", threading_str)) + ret = FI_THREAD_COMPLETION; + else if (!strcasecmp("domain", threading_str)) + ret = FI_THREAD_DOMAIN; + + return ret; +} + int ft_parse_long_opts(int op, char *optarg) { switch (op) { @@ -4287,6 +4571,9 @@ int ft_parse_long_opts(int op, char *optarg) case LONG_OPT_USE_FI_MORE: opts.use_fi_more = 1; return 0; + case LONG_OPT_THREADING: + opts.threading = ft_parse_threading_string(optarg); + return 0; default: return EXIT_FAILURE; } diff --git a/fabtests/configure.ac b/fabtests/configure.ac index bfeeda6dda8..29f816b4993 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -5,7 +5,7 @@ dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.57) -AC_INIT([fabtests], [2.0.0b1], [ofiwg@lists.openfabrics.org]) +AC_INIT([fabtests], [2.1.0a1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) AC_CONFIG_HEADERS(config.h) @@ -54,6 +54,20 @@ AS_IF([test x"$enable_debug" != x"no"], AC_DEFINE_UNQUOTED([ENABLE_DEBUG], [$dbg], [defined to 1 if configured with --enable-debug]) +AC_ARG_ENABLE([efa], + [AS_HELP_STRING([--enable-efa], + [Enable efa provider specific tests - default YES])], + [], [enable_efa=yes]) + +AM_CONDITIONAL([ENABLE_EFA], [test x"$enable_efa" = x"yes"]) + +AC_ARG_ENABLE([lpp], + [AS_HELP_STRING([--enable-lpp], + [Enable lpp provider specific tests - default YES])], + [], [enable_lpp=yes]) + +AM_CONDITIONAL([ENABLE_LPP], [test x"$enable_lpp" = x"yes"]) + AC_DEFUN([FI_ARG_ENABLE_SANITIZER],[ AC_ARG_ENABLE([$1], [AS_HELP_STRING([--enable-$1], @@ -80,6 +94,9 @@ AS_IF([test -z "$CFLAGS"], # <3), but it is necessary in AM 1.12.x. m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) +dnl Call the provider's CONFIGURE and CONDITIONALS macros +m4_include([prov/efa/configure.m4]) + AM_PROG_LIBTOOL AC_ARG_WITH([valgrind], diff --git a/fabtests/fabtests.vcxproj b/fabtests/fabtests.vcxproj index 684fba50966..65b0af71be0 100644 --- a/fabtests/fabtests.vcxproj +++ b/fabtests/fabtests.vcxproj @@ -227,12 +227,10 @@ - - @@ -241,7 +239,7 @@ - + diff --git a/fabtests/fabtests.vcxproj.filters b/fabtests/fabtests.vcxproj.filters index ec7718e9c51..e113cbf898e 100644 --- a/fabtests/fabtests.vcxproj.filters +++ b/fabtests/fabtests.vcxproj.filters @@ -81,9 +81,6 @@ Source Files\functional - - Source Files\functional - Source Files\functional @@ -93,9 +90,6 @@ Source Files\functional - - Source Files\functional - Source Files\functional @@ -231,7 +225,7 @@ Source Files\functional - + Source Files\functional diff --git a/fabtests/functional/av_xfer.c b/fabtests/functional/av_xfer.c index d6e341b4381..cb3a6491993 100644 --- a/fabtests/functional/av_xfer.c +++ b/fabtests/functional/av_xfer.c @@ -235,7 +235,7 @@ int main(int argc, char **argv) hints->caps = hints->ep_attr->type == FI_EP_RDM ? 
FI_TAGGED : FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; base_hints = hints; diff --git a/fabtests/functional/cq_data.c b/fabtests/functional/cq_data.c index ca35ccb06d3..b700778a815 100644 --- a/fabtests/functional/cq_data.c +++ b/fabtests/functional/cq_data.c @@ -164,7 +164,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->domain_attr->cq_data_size = 4; /* required minimum */ - hints->mode |= FI_CONTEXT | FI_RX_CQ_DATA; + hints->mode |= FI_CONTEXT | FI_CONTEXT2 | FI_RX_CQ_DATA; hints->caps = FI_MSG; if (opts.cqdata_op == FT_CQDATA_WRITEDATA) diff --git a/fabtests/functional/dgram.c b/fabtests/functional/dgram.c index b9503cd65fc..47b77379d9a 100644 --- a/fabtests/functional/dgram.c +++ b/fabtests/functional/dgram.c @@ -74,7 +74,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_DGRAM; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/dgram_waitset.c b/fabtests/functional/dgram_waitset.c deleted file mode 100644 index 8b72d76e254..00000000000 --- a/fabtests/functional/dgram_waitset.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2013-2015 Intel Corporation. All rights reserved. - * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under the BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include - -#include - -#include - -static int alloc_ep_res(struct fi_info *fi) -{ - struct fi_wait_attr wait_attr; - int ret; - - memset(&wait_attr, 0, sizeof wait_attr); - wait_attr.wait_obj = FI_WAIT_UNSPEC; - ret = fi_wait_open(fabric, &wait_attr, &waitset); - if (ret) { - FT_PRINTERR("fi_wait_open", ret); - return ret; - } - - ret = ft_alloc_active_res(fi); - if (ret) - return ret; - - return 0; -} - -static int init_fabric(void) -{ - int ret; - - ret = ft_getinfo(hints, &fi); - if (ret) - return ret; - - ret = ft_open_fabric_res(); - if (ret) - return ret; - - ret = alloc_ep_res(fi); - if (ret) - return ret; - - ret = ft_enable_ep_recv(); - if (ret) - return ret; - - return 0; -} - -static int send_recv() -{ - struct fi_cq_entry comp; - int ret; - - ret = fi_recv(ep, rx_buf, rx_size + ft_rx_prefix_size(), - mr_desc, 0, &rx_ctx); - if (ret) - return ret; - - ft_sync(); - - fprintf(stdout, "Posting a send...\n"); - ret = ft_post_tx(ep, remote_fi_addr, tx_size, NO_CQ_DATA, &tx_ctx); - if (ret) - return ret; - - while ((tx_cq_cntr < tx_seq) || (rx_cq_cntr < rx_seq)) { - /* Wait for completion events on CQs */ - ret = fi_wait(waitset, -1); - if (ret < 0) { - FT_PRINTERR("fi_wait", ret); - return ret; - } - - /* Read the send completion entry */ - ret = fi_cq_read(txcq, &comp, 1); - if (ret > 0) { - tx_cq_cntr++; - fprintf(stdout, "Received send completion event!\n"); - } else if (ret < 0 && ret != -FI_EAGAIN) { - if (ret == -FI_EAVAIL) { - ret = ft_cq_readerr(txcq); - } else { - FT_PRINTERR("fi_cq_read", ret); - } - return ret; - } - - /* Read the recv completion entry */ - ret = fi_cq_read(rxcq, &comp, 1); - if (ret > 0) { - rx_cq_cntr++; - fprintf(stdout, "Received recv completion event!\n"); - } else if (ret < 0 && ret != -FI_EAGAIN) { - if (ret == -FI_EAVAIL) { - ret = ft_cq_readerr(rxcq); - } else { - FT_PRINTERR("fi_cq_read", ret); - } - return ret; - } - } - - return 0; -} - -static int run(void) -{ - int ret; - - ret = init_fabric(); - if (ret) - return ret; - - ret = ft_init_av(); - if (ret) - return ret; - - return send_recv(); -} - -int main(int argc, char **argv) -{ - int op, ret = 0; - - opts = INIT_OPTS; - opts.options |= FT_OPT_SIZE; - opts.comp_method = FT_COMP_WAITSET; - - hints = fi_allocinfo(); - if (!hints) - return EXIT_FAILURE; - - while ((op = getopt(argc, argv, "h" ADDR_OPTS INFO_OPTS)) != -1) { - switch (op) { - default: - ft_parse_addr_opts(op, optarg, &opts); - ft_parseinfo(op, optarg, hints, &opts); - break; - case '?': - case 'h': - ft_usage(argv[0], "A DGRAM client-server example that uses waitset.\n"); - return EXIT_FAILURE; - } - } - - if (optind < argc) - opts.dst_addr = argv[optind]; - - hints->ep_attr->type = FI_EP_DGRAM; - hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; - hints->domain_attr->mr_mode = opts.mr_mode; - hints->addr_format = opts.address_format; - - ret = run(); - - ft_free_res(); - return ft_exit_code(ret); -} diff --git a/fabtests/functional/bw.c b/fabtests/functional/flood.c similarity index 69% rename from fabtests/functional/bw.c rename to fabtests/functional/flood.c index 04745e61374..dccb06076c1 100644 --- a/fabtests/functional/bw.c +++ b/fabtests/functional/flood.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Intel Corporation. All rights reserved. + * Copyright (c) Intel Corporation. All rights reserved. 
* * This software is available to you under the BSD license * below: @@ -34,7 +34,7 @@ #include -int sleep_time = 0; +static int sleep_time = 0; static ssize_t post_one_tx(struct ft_context *msg) { @@ -99,27 +99,96 @@ static int post_rx_sync(void) return ret; } -static int run_loop(void) +static void mr_close_all(struct ft_context *ctx_arr, int window_size) +{ + int i; + + for (i = 0; i < window_size; i++) + FT_CLOSE_FID(ctx_arr[i].mr); +} + +static int run_seq_mr_send(void) { + + int ret; + int i; + + mr_close_all(tx_ctx_arr, opts.window_size); + mr_close_all(rx_ctx_arr, opts.window_size); + + printf("Sequential memory registration:"); + if (opts.dst_addr) { + for (i = 0; i < opts.window_size; i++) { + ret = ft_reg_mr(fi, tx_ctx_arr[i].buf, tx_mr_size, + ft_info_to_mr_access(fi), + FT_TX_MR_KEY + i, opts.iface, opts.device, + &(tx_ctx_arr[i].mr), &(tx_ctx_arr[i].desc)); + if (ret) + goto out; + + ret = post_one_tx(&tx_ctx_arr[i]); + if (ret) + goto out; + + ret = ft_get_tx_comp(tx_seq); + if (ret) + goto out; + + FT_CLOSE_FID(tx_ctx_arr[i].mr); + } + } else { + for (i = 0; i < opts.window_size; i++) { + ret = ft_reg_mr(fi, rx_ctx_arr[i].buf, rx_mr_size, + ft_info_to_mr_access(fi), FT_RX_MR_KEY + i, opts.iface, opts.device, + &(rx_ctx_arr[i].mr), + &(rx_ctx_arr[i].desc)); + if (ret) + goto out; + + ret = ft_post_rx_buf(ep, opts.transfer_size, + &(rx_ctx_arr[i].context), + rx_ctx_arr[i].buf, + rx_ctx_arr[i].desc, ft_tag); + if (ret) + goto out; + + ret = wait_check_rx_bufs(); + if (ret) + goto out; + + FT_CLOSE_FID(rx_ctx_arr[i].mr); + } + } + if (opts.options & FT_OPT_OOB_SYNC) + ret = ft_sync(); + else + ret = post_rx_sync(); +out: + printf("%s\n", ret ? "Fail" : "Pass"); + return ret; +} + +static int run_batch_mr_send(void) { int ret, i; /* Receive side delay is used in order to let the sender - get ahead of the receiver and post multiple sends - before the receiver begins processing them. */ + * get ahead of the receiver and post multiple sends + * before the receiver begins processing them. + */ if (!opts.dst_addr) sleep(sleep_time); - ft_start(); + printf("Batch memory registration:"); if (opts.dst_addr) { for (i = 0; i < opts.window_size; i++) { ret = post_one_tx(&tx_ctx_arr[i]); if (ret) - return ret; + goto out; } ret = ft_get_tx_comp(tx_seq); if (ret) - return ret; + goto out; } else { for (i = 0; i < opts.window_size; i++) { ret = ft_post_rx_buf(ep, opts.transfer_size, @@ -127,66 +196,39 @@ static int run_loop(void) rx_ctx_arr[i].buf, rx_ctx_arr[i].desc, 0); if (ret) - return ret; + goto out; } ret = wait_check_rx_bufs(); if (ret) - return ret; + goto out; } - ft_stop(); if (opts.options & FT_OPT_OOB_SYNC) ret = ft_sync(); else ret = post_rx_sync(); - if (ret) - return ret; - - if (opts.machr) - show_perf_mr(opts.transfer_size, opts.window_size, &start, &end, 1, - opts.argc, opts.argv); - else - show_perf(NULL, opts.transfer_size, opts.window_size, &start, &end, 1); - +out: + printf("%s\n", ret ? "Fail" : "Pass"); return ret; } static int run(void) { - int ret, i; + int ret; ret = hints->ep_attr->type == FI_EP_MSG ? 
ft_init_fabric_cm() : ft_init_fabric(); if (ret) return ret; - - ret = ft_tx(ep, remote_fi_addr, 1, &tx_ctx); - if (ret) - return ret; - ret = ft_get_tx_comp(tx_seq); + ret = run_batch_mr_send(); if (ret) - return ret; + goto out; - ret = ft_get_rx_comp(rx_seq); + ret = run_seq_mr_send(); if (ret) - return ret; - - if (!(opts.options & FT_OPT_SIZE)) { - for (i = 0; i < TEST_CNT; i++) { - if (!ft_use_size(i, opts.sizes_enabled)) - continue; - opts.transfer_size = test_size[i].size; - ret = run_loop(); - if (ret) - goto out; - } - } else { - ret = run_loop(); - if (ret) - goto out; - } + goto out; out: return ret; @@ -197,6 +239,8 @@ int main(int argc, char **argv) int op, ret; opts = INIT_OPTS; + opts.options |= FT_OPT_ALLOC_MULT_MR; + opts.options |= FT_OPT_NO_PRE_POSTED_RX; hints = fi_allocinfo(); if (!hints) @@ -225,7 +269,7 @@ int main(int argc, char **argv) break; case '?': case 'h': - ft_usage(argv[0], "A bandwidth test with data verification."); + ft_usage(argv[0], "A test to oversubscribe the MR cache and flood the receiver with unexpected messages."); FT_PRINT_OPTS_USAGE("-T sleep_time", "Receive side delay before starting"); FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); @@ -239,12 +283,10 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; - opts.options |= FT_OPT_ALLOC_MULT_MR; - if (hints->ep_attr->type == FI_EP_DGRAM) { fprintf(stderr, "This test does not support DGRAM endpoints\n"); return -FI_EINVAL; @@ -260,4 +302,4 @@ ft_free_res(); return ft_exit_code(ret); } diff --git a/fabtests/functional/inject_test.c b/fabtests/functional/inject_test.c index 057682f7c4d..a826257153d 100644 --- a/fabtests/functional/inject_test.c +++ b/fabtests/functional/inject_test.c @@ -86,7 +86,7 @@ static int send_msg(int sendmsg, size_t size) static int receive_msg(size_t size) { int ret; - struct fi_context inj_ctx; + struct fi_context2 inj_ctx; ft_tag = 0xabcd; ret = ft_post_rx(ep, size, &inj_ctx); @@ -194,7 +194,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->ep_attr->type = FI_EP_RDM; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->caps = FI_TAGGED; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->domain_attr->mr_mode = opts.mr_mode; diff --git a/fabtests/functional/loopback.c b/fabtests/functional/loopback.c index f66e0604bcb..5031d71327e 100644 --- a/fabtests/functional/loopback.c +++ b/fabtests/functional/loopback.c @@ -90,7 +90,7 @@ int main(int argc, char **argv) opts.src_addr = "127.0.0.1"; hints->caps = FI_LOCAL_COMM | FI_MSG | FI_TAGGED; hints->ep_attr->type = FI_EP_RDM; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; while ((op = getopt(argc, argv, "h" INFO_OPTS)) != -1) { switch (op) { diff --git a/fabtests/functional/mcast.c b/fabtests/functional/mcast.c index 7a486c868b0..64c95ae9308 100644 --- a/fabtests/functional/mcast.c +++ b/fabtests/functional/mcast.c @@ -102,7 +102,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_DGRAM; hints->caps = FI_MSG | FI_MULTICAST; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index 4e9b25edc76..836df9450e0 100644 ---
a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -46,18 +46,18 @@ #include #include "shared.h" +#include "hmem.h" static struct fid_ep **eps; -static char *data_bufs; -static char **send_bufs; -static char **recv_bufs; -static struct fi_context *recv_ctx; -static struct fi_context *send_ctx; +static char **send_bufs, **recv_bufs; +static struct fid_mr **send_mrs, **recv_mrs; +static void **send_descs, **recv_descs; +static struct fi_rma_iov *peer_iovs; +static struct fi_context2 *recv_ctx; +static struct fi_context2 *send_ctx; static struct fid_cq **txcqs, **rxcqs; static struct fid_av **avs; -static struct fid_mr *data_mr = NULL; -static void *data_desc = NULL; -static fi_addr_t *remote_addr; +static fi_addr_t *remote_fiaddr; static bool shared_cq = false; static bool shared_av = false; int num_eps = 3; @@ -71,9 +71,16 @@ static void free_ep_res() { int i; - FT_CLOSE_FID(data_mr); for (i = 0; i < num_eps; i++) { + if (fi->domain_attr->mr_mode & FI_MR_RAW) + (void) fi_mr_unmap_key(domain, peer_iovs[i].key); + + FT_CLOSE_FID(send_mrs[i]); + FT_CLOSE_FID(recv_mrs[i]); FT_CLOSE_FID(eps[i]); + + (void) ft_hmem_free(opts.iface, (void *) send_bufs[i]); + (void) ft_hmem_free(opts.iface, (void *) recv_bufs[i]); } for (i = 0; i < num_eps; i++) { @@ -84,48 +91,79 @@ static void free_ep_res() free(txcqs); free(rxcqs); - free(data_bufs); free(send_bufs); free(recv_bufs); + free(send_mrs); + free(recv_mrs); + free(peer_iovs); + free(send_descs); + free(recv_descs); free(send_ctx); free(recv_ctx); - free(remote_addr); + free(remote_fiaddr); free(eps); free(avs); } +static int reg_mrs(void) +{ + int i, ret; + + for (i = 0; i < num_eps; i++) { + ret = ft_reg_mr(fi, send_bufs[i], opts.transfer_size, + ft_info_to_mr_access(fi), + (FT_MR_KEY + 1) * (i + 1), opts.iface, + opts.device, &send_mrs[i], &send_descs[i]); + if (ret) + return ret; + + ret = ft_reg_mr(fi, recv_bufs[i], opts.transfer_size, + ft_info_to_mr_access(fi), + (FT_MR_KEY + 2) * (i + 2), opts.iface, + opts.device, &recv_mrs[i], &recv_descs[i]); + if (ret) + return ret; + } + + return FI_SUCCESS; +} + static int alloc_multi_ep_res() { - char *rx_buf_ptr; int i, ret; eps = calloc(num_eps, sizeof(*eps)); - remote_addr = calloc(num_eps, sizeof(*remote_addr)); - send_bufs = calloc(num_eps, sizeof(*send_bufs)); - recv_bufs = calloc(num_eps, sizeof(*recv_bufs)); + remote_fiaddr = calloc(num_eps, sizeof(*remote_fiaddr)); + send_mrs = calloc(num_eps, sizeof(*send_mrs)); + recv_mrs = calloc(num_eps, sizeof(*recv_mrs)); + send_descs = calloc(num_eps, sizeof(*send_descs)); + recv_descs = calloc(num_eps, sizeof(*recv_descs)); + peer_iovs = calloc(num_eps, sizeof(*peer_iovs)); send_ctx = calloc(num_eps, sizeof(*send_ctx)); recv_ctx = calloc(num_eps, sizeof(*recv_ctx)); - data_bufs = calloc(num_eps * 2, opts.transfer_size); + send_bufs = calloc(num_eps, sizeof(*send_bufs)); + recv_bufs = calloc(num_eps, sizeof(*recv_bufs)); + txcqs = calloc(num_eps, sizeof(*txcqs)); rxcqs = calloc(num_eps, sizeof(*rxcqs)); avs = calloc(num_eps, sizeof(*avs)); - if (!eps || !remote_addr || !send_bufs || !recv_bufs || - !send_ctx || !recv_ctx || !data_bufs || !txcqs || !rxcqs) + if (!eps || !remote_fiaddr || !send_bufs || !recv_bufs || + !send_ctx || !recv_ctx || + !send_mrs || !recv_mrs || !send_descs || !recv_descs || + !txcqs || !rxcqs || !peer_iovs) return -FI_ENOMEM; - rx_buf_ptr = data_bufs + opts.transfer_size * num_eps; for (i = 0; i < num_eps; i++) { - send_bufs[i] = data_bufs + opts.transfer_size * i; - recv_bufs[i]
= rx_buf_ptr + opts.transfer_size * i; - } + ret = ft_hmem_alloc(opts.iface, opts.device, + (void **) &send_bufs[i], opts.transfer_size); + if (ret) + return ret; - ret = ft_reg_mr(fi, data_bufs, num_eps * 2 * opts.transfer_size, - ft_info_to_mr_access(fi), FT_MR_KEY + 1, opts.iface, - opts.device, &data_mr, &data_desc); - if (ret) { - free_ep_res(); - return ret; + ret = ft_hmem_alloc(opts.iface, opts.device, + (void **) &recv_bufs[i], opts.transfer_size); + if (ret) + return ret; } return 0; @@ -140,7 +178,8 @@ static int ep_post_rx(int idx) do { ret = fi_recv(eps[idx], recv_bufs[idx], opts.transfer_size, - data_desc, FI_ADDR_UNSPEC, &recv_ctx[idx]); + recv_descs[idx], FI_ADDR_UNSPEC, + &recv_ctx[idx]); if (ret == -FI_EAGAIN) (void) fi_cq_read(rxcqs[cq_read_idx], NULL, 0); @@ -149,22 +188,37 @@ static int ep_post_rx(int idx) return ret; } -static int ep_post_tx(int idx) +static int ep_post_tx(int idx, size_t len) { int ret, cq_read_idx = idx; if (shared_cq) cq_read_idx = 0; - if (ft_check_opts(FT_OPT_VERIFY_DATA)) { - ret = ft_fill_buf(send_bufs[idx], opts.transfer_size); - if (ret) - return ret; - } + do { + ret = fi_send(eps[idx], send_bufs[idx], len, + send_descs[idx], remote_fiaddr[idx], + &send_ctx[idx]); + if (ret == -FI_EAGAIN) + (void) fi_cq_read(txcqs[cq_read_idx], NULL, 0); + + } while (ret == -FI_EAGAIN); + + return ret; +} + +static int ep_post_write(int idx) +{ + int ret, cq_read_idx = idx; + + if (shared_cq) + cq_read_idx = 0; do { - ret = fi_send(eps[idx], send_bufs[idx], opts.transfer_size, - data_desc, remote_addr[idx], &send_ctx[idx]); + ret = fi_write(eps[idx], send_bufs[idx], opts.transfer_size, + send_descs[idx], remote_fiaddr[idx], + peer_iovs[idx].addr, peer_iovs[idx].key, + &send_ctx[idx]); if (ret == -FI_EAGAIN) (void) fi_cq_read(txcqs[cq_read_idx], NULL, 0); @@ -173,10 +227,34 @@ static int ep_post_tx(int idx) return ret; } -static int do_transfers(void) +static int get_one_comp(struct fid_cq *cq) +{ + struct fi_cq_err_entry comp; + int ret, i; + + do { + ret = fi_cq_read(cq, &comp, 1); + if (ret > 0) + break; + + if (ret < 0 && ret != -FI_EAGAIN) + return ret; + + if (!shared_cq) { + /* Drive progress on all EPs in case peer is waiting on + * different EP pair + */ + for (i = 0; i < num_eps; i++) + (void) fi_cq_read(rxcqs[i], NULL, 0); + } + } while (1); + + return FI_SUCCESS; +} + +static int sync_all(void) { int i, ret, cq_read_idx; - uint64_t cur; for (i = 0; i < num_eps; i++) { ret = ep_post_rx(i); @@ -184,34 +262,120 @@ static int do_transfers(void) FT_PRINTERR("fi_recv", ret); return ret; } + + ret = ep_post_tx(i, 0); + if (ret) { + FT_PRINTERR("fi_send", ret); + return ret; + } + + cq_read_idx = shared_cq ? 
0 : i; + + ret = get_one_comp(txcqs[cq_read_idx]); + if (ret) + return ret; + + ret = get_one_comp(rxcqs[cq_read_idx]); + if (ret) + return ret; } + return FI_SUCCESS; +} + +static int do_sends(void) +{ + char temp[FT_MAX_CTRL_MSG]; + struct fi_rma_iov *rma_iov = (struct fi_rma_iov *) temp; + int i, ret, cq_read_idx; + size_t key_size, len; - printf("Send to all %d remote EPs\n", num_eps); for (i = 0; i < num_eps; i++) { - ret = ep_post_tx(i); + ret = ep_post_rx(i); + if (ret) { + FT_PRINTERR("fi_recv", ret); + return ret; + } + } + + memset(peer_iovs, 0, sizeof(*peer_iovs) * num_eps); + + printf("Send RMA info to all %d remote EPs\n", num_eps); + for (i = 0; i < num_eps; i++) { + len = opts.transfer_size; + ret = ft_fill_rma_info(recv_mrs[i], recv_bufs[i], rma_iov, + &key_size, &len); + if (ret) + return ret; + + ret = ft_hmem_copy_to(opts.iface, opts.device, send_bufs[i], + rma_iov, len); + if (ret) + return ret; + + ret = ep_post_tx(i, len); if (ret) { FT_PRINTERR("fi_send", ret); return ret; } + + cq_read_idx = shared_cq ? 0 : i; + + ret = get_one_comp(rxcqs[cq_read_idx]); + if (ret) + return ret; + + ret = get_one_comp(txcqs[cq_read_idx]); + if (ret) + return ret; } printf("Wait for all messages from peer\n"); for (i = 0; i < num_eps; i++) { - if (shared_cq) - cq_read_idx = 0; - else - cq_read_idx = i; - cur = 0; - ret = ft_get_cq_comp(txcqs[cq_read_idx], &cur, 1, -1); - if (ret < 0) + ret = ft_hmem_copy_from(opts.iface, opts.device, rma_iov, + recv_bufs[i], len); + if (ret) + return ret; + + ret = ft_get_rma_info(rma_iov, &peer_iovs[i], key_size); + if (ret) + return ret; + } + + ret = sync_all(); + if (ret) + return ret; + + printf("PASSED multi ep sends\n"); + return 0; +} + +static int do_rma(void) +{ + int i, ret, cq_read_idx; + + for (i = 0; i < num_eps; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = ft_fill_buf(send_bufs[i], opts.transfer_size); + if (ret) + return ret; + } + ret = ep_post_write(i); + if (ret) return ret; + } - cur = 0; - ret = ft_get_cq_comp(rxcqs[cq_read_idx], &cur, 1, -1); - if (ret < 0) + printf("Wait for all writes from peer\n"); + for (i = 0; i < num_eps; i++) { + cq_read_idx = shared_cq ? 
0 : i; + ret = get_one_comp(txcqs[cq_read_idx]); + if (ret) return ret; } + ret = sync_all(); + if (ret) + return ret; + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { for (i = 0; i < num_eps; i++) { ret = ft_check_buf(recv_bufs[i], opts.transfer_size); @@ -221,11 +385,7 @@ static int do_transfers(void) printf("Data check OK\n"); } - ret = ft_finalize_ep(ep); - if (ret) - return ret; - - printf("PASSED multi ep\n"); + printf("PASSED multi ep writes\n"); return 0; } @@ -347,7 +507,7 @@ static int enable_ep(int idx) if (ret) return ret; - ret = ft_init_av_addr(avs[av_bind_idx], eps[idx], &remote_addr[idx]); + ret = ft_init_av_addr(avs[av_bind_idx], eps[idx], &remote_fiaddr[idx]); if (ret) return ret; @@ -393,6 +553,10 @@ static int run_test(void) } } + ret = reg_mrs(); + if (ret) + goto out; + for (i = 0; i < num_eps; i++) { if (hints->ep_attr->type != FI_EP_MSG) { ret = enable_ep(i); @@ -401,8 +565,39 @@ static int run_test(void) } } - ret = do_transfers(); + ret = do_sends(); + if (ret) + goto out; + ret = do_rma(); + if (ret) + goto out; + + printf("Testing closing and re-registering all MRs and retesting\n"); + for (i = 0; i < num_eps; i++) { + if (fi->domain_attr->mr_mode & FI_MR_RAW) { + ret = fi_mr_unmap_key(domain, peer_iovs[i].key); + if (ret) + goto out; + } + + FT_CLOSE_FID(send_mrs[i]); + FT_CLOSE_FID(recv_mrs[i]); + } + + ret = reg_mrs(); + if (ret) + goto out; + + ret = do_sends(); + if (ret) + goto out; + + ret = do_rma(); + if (ret) + goto out; + + ret = ft_finalize_ep(ep); out: free_ep_res(); return ret; @@ -421,17 +616,12 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - int lopt_idx = 0; - struct option long_opts[] = { - {"shared-av", no_argument, NULL, LONG_OPT_SHARED_AV}, - {"shared-cq", no_argument, NULL, LONG_OPT_SHARED_CQ}, - {0, 0, 0, 0} - }; - - while ((op = getopt_long(argc, argv, "c:vh" ADDR_OPTS INFO_OPTS, + while ((op = getopt_long(argc, argv, "c:vhAQ" ADDR_OPTS INFO_OPTS, long_opts, &lopt_idx)) != -1) { switch (op) { default: + if (!ft_parse_long_opts(op, optarg)) + continue; ft_parse_addr_opts(op, optarg, &opts); ft_parseinfo(op, optarg, hints, &opts); break; @@ -441,10 +631,10 @@ int main(int argc, char **argv) case 'v': opts.options |= FT_OPT_VERIFY_DATA; break; - case LONG_OPT_SHARED_AV: + case 'A': shared_av = true; break; - case LONG_OPT_SHARED_CQ: + case 'Q': shared_cq = true; break; case '?': @@ -453,10 +643,10 @@ int main(int argc, char **argv) FT_PRINT_OPTS_USAGE("-c ", "number of endpoints to create and test (def 3)"); FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); - FT_PRINT_OPTS_USAGE("--shared-cq", + FT_PRINT_OPTS_USAGE("-Q", "Share tx/rx cq among endpoints. \n" "By default each ep has its own tx/rx cq"); - FT_PRINT_OPTS_USAGE("--shared-av", + FT_PRINT_OPTS_USAGE("-A", "Share the av among endpoints. 
\n" "By default each ep has its own av"); return EXIT_FAILURE; @@ -466,8 +656,8 @@ int main(int argc, char **argv) if (optind < argc) opts.dst_addr = argv[optind]; - hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->caps = FI_MSG | FI_RMA; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/multi_mr.c b/fabtests/functional/multi_mr.c index bc0e5521319..fefebc4fabc 100644 --- a/fabtests/functional/multi_mr.c +++ b/fabtests/functional/multi_mr.c @@ -178,7 +178,7 @@ static int init_multi_mr_res(void) static int mr_key_test() { int i, ret = 0; - struct fi_context rma_ctx; + struct fi_context2 rma_ctx; for (i = 0; i < mr_count; i++) { tx_buf = (char *)mr_res_array[i].buf; @@ -319,7 +319,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_RMA | FI_RMA_EVENT | FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/multi_recv.c b/fabtests/functional/multi_recv.c index 8b698e90f5d..672bb15c727 100644 --- a/fabtests/functional/multi_recv.c +++ b/fabtests/functional/multi_recv.c @@ -41,7 +41,7 @@ #define MAX_XFER_SIZE (1 << 20) static struct fid_mr *mr_multi_recv; -struct fi_context ctx_multi_recv[2]; +struct fi_context2 ctx_multi_recv[2]; static int use_recvmsg, comp_per_buf; @@ -263,11 +263,6 @@ static int run(void) if (ret) return ret; - ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, - &tx_size, sizeof(tx_size)); - if (ret) - return ret; - ret = post_multi_recv_buffer(); if (ret) return ret; @@ -327,8 +322,9 @@ int main(int argc, char **argv) return EIO; } + opts.min_multi_recv_size = opts.transfer_size; hints->caps = FI_MSG | FI_MULTI_RECV; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->rx_attr->op_flags = FI_MULTI_RECV; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/poll.c b/fabtests/functional/poll.c deleted file mode 100644 index f9a2079c94f..00000000000 --- a/fabtests/functional/poll.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2013-2015 Intel Corporation. All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under the BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include - -#include -#include - -#include - -#define MAX_POLL_CNT 10 - -static int alloc_ep_res(struct fi_info *fi) -{ - struct fi_poll_attr poll_attr; - int ret; - - ret = ft_alloc_active_res(fi); - if (ret) - return ret; - - memset(&poll_attr, 0, sizeof poll_attr); - ret = fi_poll_open(domain, &poll_attr, &pollset); - if (ret) { - FT_PRINTERR("fi_poll_open", ret); - return ret; - } - - if (txcq) { - ret = fi_poll_add(pollset, &txcq->fid, 0); - if (ret) - goto err; - } - - if (rxcq) { - ret = fi_poll_add(pollset, &rxcq->fid, 0); - if (ret) - goto err; - } - - if (txcntr) { - ret = fi_poll_add(pollset, &txcntr->fid, 0); - if (ret) - goto err; - } - - if (rxcntr) { - ret = fi_poll_add(pollset, &rxcntr->fid, 0); - if (ret) - goto err; - } - - return 0; -err: - FT_PRINTERR("fi_poll_add", ret); - return ret; -} - -static int free_poll_res(void) -{ - int ret; - - if (!pollset) - return 0; - - if (txcq) { - ret = fi_poll_del(pollset, &txcq->fid, 0); - if (ret) - goto err; - } - - if (rxcq) { - ret = fi_poll_del(pollset, &rxcq->fid, 0); - if (ret) - goto err; - } - - if (txcntr) { - ret = fi_poll_del(pollset, &txcntr->fid, 0); - if (ret) - goto err; - } - - if (rxcntr) { - ret = fi_poll_del(pollset, &rxcntr->fid, 0); - if (ret) - goto err; - } - return 0; -err: - FT_PRINTERR("fi_poll_del", ret); - return ret; -} - -static int init_fabric(void) -{ - int ret; - - ret = ft_init(); - if (ret) - return ret; - - ret = ft_init_oob(); - if (ret) - return ret; - - ret = ft_getinfo(hints, &fi); - if (ret) - return ret; - - ret = ft_open_fabric_res(); - if (ret) - return ret; - - ret = alloc_ep_res(fi); - if (ret) - return ret; - - ret = ft_enable_ep_recv(); - if (ret) - return ret; - return 0; -} - -static int send_recv() -{ - struct fid_cq *cq; - void *context[MAX_POLL_CNT]; - struct fi_cq_entry comp; - int ret; - int ret_count = 0; - int i, tx_cntr_val = 0, rx_cntr_val = 0; - - fprintf(stdout, "Posting a send...\n"); - ret = ft_post_tx(ep, remote_fi_addr, tx_size, NO_CQ_DATA, &tx_ctx); - if (ret) - return ret; - - while (((opts.options & FT_OPT_TX_CQ) && (tx_cq_cntr < tx_seq)) || - ((opts.options & FT_OPT_TX_CNTR) && (tx_cntr_val < tx_seq)) || - ((opts.options & FT_OPT_RX_CQ) && (rx_cq_cntr < rx_seq)) || - ((opts.options & FT_OPT_RX_CNTR) && (rx_cntr_val < rx_seq))) { - - /* Poll send and recv CQs/Cntrs */ - do { - ret_count = fi_poll(pollset, context, MAX_POLL_CNT); - if (ret_count < 0) { - FT_PRINTERR("fi_poll", ret_count); - return ret_count; - } - } while (!ret_count); - - fprintf(stdout, "Retrieved %d event(s)\n", ret_count); - - for (i = 0; i < ret_count; i++) { - if (context[i] == &txcq) { - printf("Send completion received\n"); - cq = txcq; - tx_cq_cntr++; - } else if (context[i] == &rxcq) { - printf("Recv completion received\n"); - cq = rxcq; - rx_cq_cntr++; - } else if (context[i] == &txcntr) { - printf("Send counter poll-event\n"); - tx_cntr_val = fi_cntr_read(txcntr); - if (tx_cntr_val > tx_seq) { - FT_ERR("Invalid tx counter event\n"); - FT_ERR("expected: %" PRIu64 ", found: " - "%d\n", tx_seq, tx_cntr_val); - return -1; - } - continue; - } else if (context[i] == &rxcntr) { - printf("Recv counter poll-event\n"); - rx_cntr_val = fi_cntr_read(rxcntr); - if (rx_cntr_val > rx_seq) { - FT_ERR("Invalid rx counter event\n"); - FT_ERR("expected: %" PRIu64 ", found: " - "%d\n", rx_seq, rx_cntr_val); - return -1; - } - continue; - } else { - FT_ERR("Unknown completion received\n"); - return -1; - } - - /* Read the completion entry */ - ret = 
fi_cq_read(cq, &comp, 1); - if (ret < 0) { - if (ret == -FI_EAVAIL) { - ret = ft_cq_readerr(cq); - } else { - FT_PRINTERR("fi_cq_read", ret); - } - return ret; - } - } - } - - return 0; -} - -static int run(void) -{ - int ret; - - ret = init_fabric(); - if (ret) - return ret; - - ret = ft_init_av(); - if (ret) - return ret; - - return send_recv(); -} - -int main(int argc, char **argv) -{ - int op, ret = 0; - - opts = INIT_OPTS; - opts.options |= FT_OPT_SIZE; - - hints = fi_allocinfo(); - if (!hints) - return EXIT_FAILURE; - - while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS)) != -1) { - switch (op) { - default: - ft_parse_addr_opts(op, optarg, &opts); - ft_parseinfo(op, optarg, hints, &opts); - ft_parsecsopts(op, optarg, &opts); - break; - case '?': - case 'h': - ft_usage(argv[0], "A client-server example that uses poll.\n"); - FT_PRINT_OPTS_USAGE("-t ", "completion type [queue, counter]"); - return EXIT_FAILURE; - } - } - - if (optind < argc) - opts.dst_addr = argv[optind]; - - hints->ep_attr->type = FI_EP_RDM; - hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; - hints->domain_attr->mr_mode = opts.mr_mode; - hints->addr_format = opts.address_format; - - ret = run(); - - free_poll_res(); - ft_free_res(); - return ft_exit_code(ret); -} diff --git a/fabtests/functional/rdm.c b/fabtests/functional/rdm.c index a887b70d418..666ca13c671 100644 --- a/fabtests/functional/rdm.c +++ b/fabtests/functional/rdm.c @@ -94,7 +94,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_atomic.c b/fabtests/functional/rdm_atomic.c index 638b9e1148b..915dba884d1 100644 --- a/fabtests/functional/rdm_atomic.c +++ b/fabtests/functional/rdm_atomic.c @@ -40,13 +40,13 @@ static enum fi_op op_type = FI_MIN; static void *result; static void *compare; +static void *cpy_dst; static struct fid_mr *mr_result; static struct fid_mr *mr_compare; -static struct fi_context fi_ctx_atomic; +static struct fi_context2 fi_ctx_atomic; static enum fi_datatype datatype; -static size_t *count; static int run_all_ops = 1, run_all_datatypes = 1; static enum fi_op get_fi_op(char *op) @@ -148,92 +148,15 @@ static void print_opts_usage(char *name) FT_PRINT_OPTS_USAGE("", "int32|uint32|int64|uint64|int128|uint128|" "float|double|float_complex|double_complex|"); FT_PRINT_OPTS_USAGE("", "long_double|long_double_complex (default: all)"); + FT_PRINT_OPTS_USAGE("-v", "enables data_integrity checks"); } -#define create_atomic_op_executor(type) \ -static inline int execute_atomic_ ## type ## _op(enum fi_op op_type, \ - enum fi_datatype datatype) \ -{ \ - int ret = FI_SUCCESS, len, i; \ - len = snprintf((test_name), sizeof(test_name), "%s_", \ - fi_tostr(&(datatype), FI_TYPE_ATOMIC_TYPE)); \ - snprintf((test_name) + len, sizeof(test_name) - len, "%s_"#type"_lat", \ - fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); \ - opts.transfer_size = datatype_to_size(datatype); \ - \ - ft_start(); \ - for (i = 0; i < opts.iterations; i++) { \ - ret = execute_ ## type ## _atomic_op(op_type); \ - if (ret) \ - break; \ - } \ - ft_stop(); \ - report_perf(); \ - \ - return ret; \ -} - -#define create_atomic_op_handler(type) \ -create_atomic_op_executor(type) \ -static inline int handle_atomic_ ## type ## _op(int run_all_datatypes, \ - enum fi_op op_type, \ - size_t *count) \ -{ \ - int ret = FI_SUCCESS; \ - \ - if (run_all_datatypes) { \ - 
for (datatype = 0; datatype < OFI_DATATYPE_CNT; datatype++) { \ - ret = check_ ## type ## _atomic_op(ep, op_type, \ - datatype, count); \ - if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP) { \ - fprintf(stderr, \ - "Provider doesn't support %s ", \ - fi_tostr(&op_type, \ - FI_TYPE_ATOMIC_OP)); \ - fprintf(stderr, \ - #type" atomic operation on %s\n", \ - fi_tostr(&datatype, \ - FI_TYPE_ATOMIC_TYPE)); \ - continue; \ - } else if (ret) { \ - goto fn; \ - } \ - \ - ret = execute_atomic_ ##type ## _op(op_type, datatype); \ - if (ret) \ - goto fn; \ - } \ - } else { \ - ret = check_ ## type ## _atomic_op(ep, op_type, \ - datatype, count); \ - if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP) { \ - fprintf(stderr, \ - "Provider doesn't support %s ", \ - fi_tostr(&op_type, \ - FI_TYPE_ATOMIC_OP)); \ - fprintf(stderr, \ - #type" atomic operation on %s\n", \ - fi_tostr(&datatype, \ - FI_TYPE_ATOMIC_TYPE)); \ - goto fn; \ - } else if (ret) { \ - goto fn; \ - } \ - \ - ret = execute_atomic_ ## type ##_op(op_type, datatype); \ - } \ - \ -fn: \ - return ret; \ -} - - -static inline int execute_base_atomic_op(enum fi_op op) +static inline int execute_base_atomic_op(void) { int ret; ret = ft_post_atomic(FT_ATOMIC_BASE, ep, NULL, NULL, NULL, NULL, - &remote, datatype, op, &fi_ctx_atomic); + &remote, datatype, op_type, &fi_ctx_atomic); if (ret) return ret; @@ -242,13 +165,13 @@ static inline int execute_base_atomic_op(enum fi_op op) return ret; } -static inline int execute_fetch_atomic_op(enum fi_op op) +static inline int execute_fetch_atomic_op(void) { int ret; ret = ft_post_atomic(FT_ATOMIC_FETCH, ep, NULL, NULL, result, fi_mr_desc(mr_result), &remote, datatype, - op, &fi_ctx_atomic); + op_type, &fi_ctx_atomic); if (ret) return ret; @@ -257,13 +180,13 @@ static inline int execute_fetch_atomic_op(enum fi_op op) return ret; } -static inline int execute_compare_atomic_op(enum fi_op op) +static inline int execute_compare_atomic_op(void) { int ret; ret = ft_post_atomic(FT_ATOMIC_COMPARE, ep, compare, fi_mr_desc(mr_compare), result, fi_mr_desc(mr_result), &remote, datatype, - op, &fi_ctx_atomic); + op_type, &fi_ctx_atomic); if (ret) return ret; @@ -272,8 +195,44 @@ static inline int execute_compare_atomic_op(enum fi_op op) return ret; } +static int fill_data(enum ft_atomic_opcodes opcode) +{ + int ret; + + switch (opcode) { + case FT_ATOMIC_COMPARE: + ft_fill_atomic(compare, 1, datatype); + /* fall through */ + case FT_ATOMIC_FETCH: + ft_hmem_memset(opts.iface, opts.device, result, 0, + datatype_to_size(datatype)); + /* fall through */ + case FT_ATOMIC_BASE: + ft_fill_atomic(tx_buf, 1, datatype); + ft_fill_atomic(rx_buf, 1, datatype); + break; + default: + break; + } + + ret = ft_hmem_copy_from(opts.iface, opts.device, cpy_dst, + rx_buf, datatype_to_size(datatype)); + if (ret) + return ret; + + ft_sync(); + return ret; +} + static void report_perf(void) { + int len; + + len = snprintf((test_name), sizeof(test_name), "%s_", + fi_tostr(&(datatype), FI_TYPE_ATOMIC_TYPE)); + snprintf((test_name) + len, sizeof(test_name) - len, "%s_lat", + fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); + if (opts.machr) show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 1, opts.argc, opts.argv); @@ -281,21 +240,117 @@ static void report_perf(void) show_perf(test_name, opts.transfer_size, opts.iterations, &start, &end, 1); } -create_atomic_op_handler(base) -create_atomic_op_handler(fetch) -create_atomic_op_handler(compare) +static int handle_atomic_base_op(void) +{ + int ret = FI_SUCCESS, i; + size_t count = 0; -static int 
run_op(void) + ret = check_base_atomic_op(ep, op_type, datatype, &count); + if (ret) + return ret; + + opts.transfer_size = datatype_to_size(datatype); + ft_start(); + for (i = 0; i < opts.iterations; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = fill_data(FT_ATOMIC_BASE); + if (ret) + return ret; + } + + ret = execute_base_atomic_op(); + if (ret) + break; + + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ft_sync(); + ret = ft_check_atomic(FT_ATOMIC_BASE, op_type, datatype, + tx_buf, cpy_dst, rx_buf, compare, + result, 1); + if (ret) + return ret; + } + } + ft_stop(); + report_perf(); + return ret; +} + +static int handle_atomic_fetch_op(void) { - int ret = -FI_EINVAL; + int ret = FI_SUCCESS, i; + size_t count = 0; + + ret = check_fetch_atomic_op(ep, op_type, datatype, &count); + if (ret) + return ret; - count = (size_t *)malloc(sizeof(*count)); - if (!count) { - ret = -FI_ENOMEM; - perror("malloc"); - goto fn; + opts.transfer_size = datatype_to_size(datatype); + ft_start(); + for (i = 0; i < opts.iterations; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = fill_data(FT_ATOMIC_FETCH); + if (ret) + return ret; + } + + ret = execute_fetch_atomic_op(); + if (ret) + break; + + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ft_sync(); + ret = ft_check_atomic(FT_ATOMIC_FETCH, op_type, datatype, + tx_buf, cpy_dst, rx_buf, compare, + result, 1); + if (ret) + return ret; + } } - ft_sync(); + ft_stop(); + report_perf(); + return ret; +} + +static int handle_atomic_compare_op(void) +{ + int ret = FI_SUCCESS, i; + size_t count = 0; + + ret = check_compare_atomic_op(ep, op_type, datatype, &count); + if (ret) + return ret; + + opts.transfer_size = datatype_to_size(datatype); + ft_start(); + for (i = 0; i < opts.iterations; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = fill_data(FT_ATOMIC_COMPARE); + if (ret) + return ret; + } + + ret = execute_compare_atomic_op(); + if (ret) + break; + + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ft_sync(); + ret = ft_check_atomic(FT_ATOMIC_COMPARE, op_type, datatype, + tx_buf, cpy_dst, rx_buf, compare, + result, 1); + if (ret) + return ret; + } + } + ft_stop(); + report_perf(); + return ret; +} + +static int run_dt(void) +{ + int ret = -FI_EINVAL; switch (op_type) { case FI_MIN: @@ -309,12 +364,10 @@ static int run_op(void) case FI_LXOR: case FI_BXOR: case FI_ATOMIC_WRITE: - ret = handle_atomic_base_op(run_all_datatypes, - op_type, count); + ret = handle_atomic_base_op(); break; case FI_ATOMIC_READ: - ret = handle_atomic_fetch_op(run_all_datatypes, - op_type, count); + ret = handle_atomic_fetch_op(); break; case FI_CSWAP: case FI_CSWAP_NE: @@ -323,39 +376,64 @@ case FI_CSWAP_GE: case FI_CSWAP_GT: case FI_MSWAP: - ret = handle_atomic_compare_op(run_all_datatypes, - op_type, count); + ret = handle_atomic_compare_op(); break; default: FT_WARN("Invalid atomic operation type %d\n", op_type); break; } - ft_sync(); - free(count); -fn: + + if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP) { + fprintf(stderr, "Provider doesn't support %s ", + fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); + fprintf(stderr, "atomic operation on %s\n", + fi_tostr(&datatype, FI_TYPE_ATOMIC_TYPE)); + return FI_SUCCESS; + } + if (ret) { + fprintf(stderr, "Failed atomic op %s ", + fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); + fprintf(stderr, "with datatype %s\n", + fi_tostr(&datatype, FI_TYPE_ATOMIC_TYPE)); + } return ret; } -static int run_ops(void) +static int run_op(void) { int ret; - for (op_type = FI_MIN; op_type < OFI_ATOMIC_OP_CNT;
op_type++) { - ret = run_op(); + if (!run_all_datatypes) + return run_dt(); + + for (datatype = 0; datatype < OFI_DATATYPE_CNT; datatype++) { + ret = run_dt(); if (ret && ret != -FI_ENOSYS && ret != -FI_EOPNOTSUPP) { FT_PRINTERR("run_op", ret); return ret; } } - - return 0; + return FI_SUCCESS; } static int run_test(void) { - return run_all_ops ? run_ops() : run_op(); + int ret; + + if (!run_all_ops) + return run_op(); + + for (op_type = FI_MIN; op_type < OFI_ATOMIC_OP_CNT; op_type++) { + ret = run_op(); + if (ret && ret != -FI_ENOSYS && ret != -FI_EOPNOTSUPP) { + FT_PRINTERR("run_op", ret); + return ret; + } + } + + return FI_SUCCESS; } static void free_res(void) @@ -370,15 +448,17 @@ static void free_res(void) ft_hmem_free(opts.iface, compare); compare = NULL; } + if (cpy_dst) { + ft_hmem_free_host(opts.iface, cpy_dst); + cpy_dst = NULL; + } } static uint64_t get_mr_key() { static uint64_t user_key = FT_MR_KEY + 1; - return ((fi->domain_attr->mr_mode == FI_MR_BASIC) || - (fi->domain_attr->mr_mode & FI_MR_PROV_KEY)) ? - 0 : user_key++; + return fi->domain_attr->mr_mode & FI_MR_PROV_KEY ? 0 : user_key++; } static int alloc_ep_res(struct fi_info *fi) @@ -398,6 +478,10 @@ static int alloc_ep_res(struct fi_info *fi) return -1; } + ret = ft_hmem_alloc_host(opts.iface, &cpy_dst, opts.transfer_size); + if (ret) + return ret; + // registers local data buffer that stores results ret = ft_reg_mr(fi, result, buf_size, (mr_local ? FI_READ : 0) | FI_REMOTE_WRITE, @@ -455,7 +539,7 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt_long(argc, argv, "ho:Uz:" CS_OPTS INFO_OPTS, + while ((op = getopt_long(argc, argv, "ho:Uz:v" CS_OPTS INFO_OPTS, long_opts, &lopt_idx)) != -1) { switch (op) { case 'o': @@ -485,6 +569,9 @@ int main(int argc, char **argv) } } break; + case 'v': + opts.options |= FT_OPT_VERIFY_DATA; + break; default: if (!ft_parse_long_opts(op, optarg)) continue; @@ -504,7 +591,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_ATOMICS; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; ret = run(); diff --git a/fabtests/functional/rdm_deferred_wq.c b/fabtests/functional/rdm_deferred_wq.c index 7526c709861..0f780bee56e 100644 --- a/fabtests/functional/rdm_deferred_wq.c +++ b/fabtests/functional/rdm_deferred_wq.c @@ -633,7 +633,7 @@ int main(int argc, char **argv) tested_op == FI_OP_COMPARE_ATOMIC) hints->caps |= FI_ATOMIC; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_multi_client.c b/fabtests/functional/rdm_multi_client.c index 332a19989f6..f3e5a5cd4dd 100644 --- a/fabtests/functional/rdm_multi_client.c +++ b/fabtests/functional/rdm_multi_client.c @@ -222,7 +222,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_multi_domain.c b/fabtests/functional/rdm_multi_domain.c index 71e0848eebf..14a8ebb440d 100644 --- a/fabtests/functional/rdm_multi_domain.c +++ b/fabtests/functional/rdm_multi_domain.c @@ -55,7 +55,7 @@ struct test_domain { struct fid_av *av; struct fid_mr *mr; struct fid_cq *tx_cq; - struct fi_context *rma_ctx; + struct fi_context2 *rma_ctx; }; struct test_domain 
*domain_res_array; @@ -274,7 +274,7 @@ static void free_domain_res() } static int write_data(void *buffer, size_t size, int dom_idx, - int remote_dom_idx, struct fi_context *rma_ctx) + int remote_dom_idx, struct fi_context2 *rma_ctx) { int ret = -FI_EAGAIN; @@ -427,7 +427,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_RMA | FI_RMA_EVENT | FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_rma_event.c b/fabtests/functional/rdm_rma_event.c index 8aaec557771..cb6786c2294 100644 --- a/fabtests/functional/rdm_rma_event.c +++ b/fabtests/functional/rdm_rma_event.c @@ -39,8 +39,8 @@ struct fi_rma_iov local; -struct fi_context fi_ctx_write; -struct fi_context fi_ctx_read; +struct fi_context2 fi_ctx_write; +struct fi_context2 fi_ctx_read; static int run_test(void) { @@ -126,7 +126,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_RMA | FI_RMA_EVENT; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_rma_trigger.c b/fabtests/functional/rdm_rma_trigger.c index d08191c7cdf..0fdba0c330c 100644 --- a/fabtests/functional/rdm_rma_trigger.c +++ b/fabtests/functional/rdm_rma_trigger.c @@ -154,7 +154,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_RMA | FI_RMA_EVENT | FI_TRIGGER; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_shared_av.c b/fabtests/functional/rdm_shared_av.c index b113f3354be..ce9d6b8c85b 100644 --- a/fabtests/functional/rdm_shared_av.c +++ b/fabtests/functional/rdm_shared_av.c @@ -189,7 +189,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_SHARED_AV; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_tagged_peek.c b/fabtests/functional/rdm_tagged_peek.c index c583d37013b..1cce508f3e3 100644 --- a/fabtests/functional/rdm_tagged_peek.c +++ b/fabtests/functional/rdm_tagged_peek.c @@ -42,7 +42,7 @@ #define BASE_TAG 0x900d #define SEND_CNT 10 -static struct fi_context fi_context; +static struct fi_context2 fi_context; static int wait_for_send_comp(int count) { @@ -355,7 +355,7 @@ int main(int argc, char **argv) hints->rx_attr->msg_order = FI_ORDER_SAS; hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_TAGGED; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/recv_cancel.c b/fabtests/functional/recv_cancel.c index dc7cf3d072a..376cb6f6076 100644 --- a/fabtests/functional/recv_cancel.c +++ b/fabtests/functional/recv_cancel.c @@ -76,7 +76,7 @@ static int recv_cancel_host(void) int ret = 0; int retries = 0; struct fi_cq_err_entry recv_completion, cancel_error_entry; - struct fi_context cancel_recv_ctx, standard_recv_ctx; + struct fi_context2 cancel_recv_ctx, standard_recv_ctx; memset(&cancel_error_entry, 0, sizeof(cancel_error_entry)); @@ -200,6 +200,7 @@ static int recv_cancel_host(void) static int 
run_test(void) { int ret; + if (hints->ep_attr->type == FI_EP_MSG) ret = ft_init_fabric_cm(); else @@ -245,7 +246,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_TAGGED; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/resmgmt_test.c b/fabtests/functional/resmgmt_test.c index ef2f3565e82..e27073e8e25 100644 --- a/fabtests/functional/resmgmt_test.c +++ b/fabtests/functional/resmgmt_test.c @@ -47,7 +47,7 @@ int delay, tagged; static int send_loop(size_t size) { int q_opts = 0; int ret; - struct fi_context send_ctx[max_opts]; + struct fi_context2 send_ctx[max_opts]; while (q_opts < max_opts) { do { @@ -91,7 +91,7 @@ static int receive_loop(size_t size) { int ret; int q_opts = 0; - struct fi_context recv_ctx[max_opts]; + struct fi_context2 recv_ctx[max_opts]; while (q_opts < max_opts) { do { @@ -262,7 +262,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/shared_ctx.c b/fabtests/functional/shared_ctx.c index 016a56e87fc..52b8711f75e 100644 --- a/fabtests/functional/shared_ctx.c +++ b/fabtests/functional/shared_ctx.c @@ -613,7 +613,7 @@ int main(int argc, char **argv) if (!(hints->caps & FI_TAGGED)) hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/unexpected_msg.c b/fabtests/functional/unexpected_msg.c index f01a029bdf1..70921913178 100644 --- a/fabtests/functional/unexpected_msg.c +++ b/fabtests/functional/unexpected_msg.c @@ -369,7 +369,7 @@ int main(int argc, char **argv) if (optind < argc) opts.dst_addr = argv[optind]; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->caps = FI_TAGGED; diff --git a/fabtests/functional/unmap_mem.c b/fabtests/functional/unmap_mem.c index 5ba36581da4..01de49dd5a0 100644 --- a/fabtests/functional/unmap_mem.c +++ b/fabtests/functional/unmap_mem.c @@ -170,7 +170,7 @@ int main(int argc, char **argv) if (optind < argc) opts.dst_addr = argv[optind]; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->caps = FI_MSG; hints->domain_attr->mr_mode = opts.mr_mode; diff --git a/fabtests/ubertest/ofi_atomic.h b/fabtests/include/ofi_atomic.h similarity index 71% rename from fabtests/ubertest/ofi_atomic.h rename to fabtests/include/ofi_atomic.h index a61a7bae432..abb87bc4620 100644 --- a/fabtests/ubertest/ofi_atomic.h +++ b/fabtests/include/ofi_atomic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Intel Corporation. All rights reserved. + * Copyright (c) Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,16 +33,13 @@ #ifndef _OFI_ATOMIC_H_ #define _OFI_ATOMIC_H_ -#include "fabtest.h" +#include "shared.h" +#include "ft_osd.h" #ifdef __cplusplus extern "C" { #endif -typedef long double long_double; -typedef float complex ofi_complex_float; -typedef double complex ofi_complex_double; -typedef long double complex ofi_complex_long_double; #define OFI_WRITE_OP_START FI_MIN #define OFI_WRITE_OP_LAST (FI_ATOMIC_WRITE + 1) @@ -83,42 +80,6 @@ extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][OFI_DATATYPE_CNT]) #define ofi_atomic_swap_op(op, datatype, dst, src, cmp, res, cnt) \ ofi_atomic_swap_handler(op, datatype)(dst, src, cmp, res, cnt) -#define OFI_DEF_COMPLEX_OPS(type) \ -static inline int ofi_complex_eq_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a == b; \ -} \ -static inline ofi_complex_## type ofi_complex_sum_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a + b; \ -} \ -static inline ofi_complex_## type ofi_complex_prod_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a * b; \ -} \ -static inline ofi_complex_## type ofi_complex_land_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a && b; \ -} \ -static inline ofi_complex_## type ofi_complex_lor_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a || b; \ -} \ -static inline int ofi_complex_lxor_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return (a && !b) || (!a && b); \ -} \ - -OFI_DEF_COMPLEX_OPS(float) -OFI_DEF_COMPLEX_OPS(double) -OFI_DEF_COMPLEX_OPS(long_double) - #ifdef __cplusplus } #endif diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index ae35106b5b8..b57a7dab0ed 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -43,11 +43,14 @@ #include #include #include +#include #include #include #include +#include "ofi_atomic.h" + #ifdef __cplusplus extern "C" { #endif @@ -69,6 +72,11 @@ extern "C" { ((type *) ((char *)ptr - offsetof(type, field))) #endif +/* + * Internal version of deprecated APIs. + * These are used internally to avoid compiler warnings. 
+ */ +#define OFI_MR_DEPRECATED (0x3) /* FI_MR_BASIC | FI_MR_SCALABLE */ #define OFI_MR_BASIC_MAP (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) /* exit codes must be 0-255 */ @@ -140,6 +148,7 @@ enum { FT_OPT_DISABLE_TAG_VALIDATION = 1 << 25, FT_OPT_ADDR_IS_OOB = 1 << 26, FT_OPT_REG_DMABUF_MR = 1 << 27, + FT_OPT_NO_PRE_POSTED_RX = 1 << 28, FT_OPT_OOB_CTRL = FT_OPT_OOB_SYNC | FT_OPT_OOB_ADDR_EXCH, }; @@ -182,6 +191,7 @@ struct ft_opts { size_t transfer_size; size_t max_msg_size; size_t inject_size; + size_t min_multi_recv_size; int window_size; int av_size; int verbose; @@ -210,6 +220,7 @@ struct ft_opts { int force_prefix; enum fi_hmem_iface iface; uint64_t device; + enum fi_threading threading; char **argv; }; @@ -240,7 +251,7 @@ extern size_t buf_size, tx_size, rx_size, tx_mr_size, rx_mr_size; extern int tx_fd, rx_fd; extern int timeout; -extern struct fi_context tx_ctx, rx_ctx; +extern struct fi_context2 tx_ctx, rx_ctx; extern uint64_t remote_cq_data; extern uint64_t tx_seq, rx_seq, tx_cq_cntr, rx_cq_cntr; @@ -269,7 +280,11 @@ void ft_mcusage(char *name, char *desc); void ft_csusage(char *name, char *desc); int ft_fill_buf(void *buf, size_t size); +int ft_fill_atomic(void *buf, size_t count, enum fi_datatype datatype); int ft_check_buf(void *buf, size_t size); +int ft_check_atomic(enum ft_atomic_opcodes atomic, enum fi_op op, + enum fi_datatype type, void *src, void *orig_dst, void *dst, + void *cmp, void *res, size_t count); int ft_check_opts(uint64_t flags); uint64_t ft_init_cq_data(struct fi_info *info); int ft_sock_listen(char *node, char *service); @@ -315,7 +330,8 @@ extern char default_port[8]; .iface = FI_HMEM_SYSTEM, \ .device = 0, \ .argc = argc, .argv = argv, \ - .address_format = FI_FORMAT_UNSPEC \ + .address_format = FI_FORMAT_UNSPEC, \ + .threading = FI_THREAD_DOMAIN \ } #define FT_STR_LEN 32 @@ -443,8 +459,8 @@ int ft_alloc_ep_res(struct fi_info *fi, struct fid_cq **new_txcq, struct fid_cntr **new_rma_cntr, struct fid_av **new_av); int ft_alloc_msgs(void); -int ft_alloc_host_tx_buf(size_t size); -void ft_free_host_tx_buf(void); +int ft_alloc_host_bufs(size_t size); +void ft_free_host_bufs(void); int ft_alloc_active_res(struct fi_info *fi); int ft_enable_ep_recv(void); int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av *bind_av, @@ -461,6 +477,11 @@ int ft_init_av_dst_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr, fi_addr_t *remote_addr); int ft_init_av_addr(struct fid_av *av, struct fid_ep *ep, fi_addr_t *addr); +int ft_fill_rma_info(struct fid_mr *mr, void *mr_buf, + struct fi_rma_iov *rma_iov, size_t *key_size, + size_t *rma_iov_len); +int ft_get_rma_info(struct fi_rma_iov *rma_iov, + struct fi_rma_iov *peer_iov, size_t key_size); int ft_exchange_keys(struct fi_rma_iov *peer_iov); void ft_fill_mr_attr(struct iovec *iov, struct fi_mr_dmabuf *dmabuf, int iov_count, uint64_t access, @@ -548,6 +569,8 @@ void *ft_get_aligned_addr(void *ptr, size_t alignment) int ft_read_cq(struct fid_cq *cq, uint64_t *cur, uint64_t total, int timeout, uint64_t tag); +int ft_sync_oob(void); +int ft_sync_inband(bool repost_rx); int ft_sync(void); int ft_sync_pair(int status); int ft_fork_and_pair(void); @@ -607,11 +630,9 @@ int ft_get_cq_comp(struct fid_cq *cq, uint64_t *cur, uint64_t total, int timeout int ft_get_cntr_comp(struct fid_cntr *cntr, uint64_t total, int timeout); int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, - size_t size, void *ctx, int flags); + void *buf, size_t size, void *ctx, uint64_t flags); int ft_sendmsg(struct fid_ep *ep, 
fi_addr_t fi_addr, - void *buf, size_t size, void *ctx, int flags); -int ft_writemsg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, - void *ctx, struct fi_rma_iov *remote, int flags); + void *buf, size_t size, void *ctx, uint64_t flags); int ft_tx_msg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, void *ctx, uint64_t flags); int ft_cq_read_verify(struct fid_cq *cq, void *op_context); @@ -651,6 +672,7 @@ enum { LONG_OPT_CONTROL_PROGRESS, LONG_OPT_MAX_MSG_SIZE, LONG_OPT_USE_FI_MORE, + LONG_OPT_THREADING, }; extern int debug_assert; @@ -729,4 +751,100 @@ static inline void *ft_get_page_end(const void *addr, size_t page_size) + page_size, page_size) - 1); } +/* + * Common validation functions and variables + */ + +#define integ_alphabet "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +#define integ_alphabet_length (sizeof(integ_alphabet) - 1) + +#define FT_FILL(dst,cnt,type) \ + do { \ + int i, a = 0; \ + type *d = (dst); \ + for (i = 0; i < cnt; i++) { \ + d[i] = integ_alphabet[a]; \ + if (++a >= integ_alphabet_length) \ + a = 0; \ + } \ + } while (0); + +#define FT_FILL_COMPLEX(dst,cnt,type) \ + do { \ + int i, a = 0; \ + OFI_COMPLEX(type) *d = (dst); \ + for (i = 0; i < cnt; i++) { \ + ofi_complex_fill_##type (&d[i], \ + (type) integ_alphabet[a]); \ + if (++a >= integ_alphabet_length) \ + a = 0; \ + } \ + } while (0); + +#define FT_CHECK(buf,cmp,cnt,type) \ + do { \ + int i; \ + type *b = (buf); \ + type *c = (cmp); \ + for (i = 0; i < cnt; i++) { \ + if (b[i] != c[i]) \ + return -FI_EIO; \ + } \ + } while (0); + +#define FT_CHECK_COMPLEX(buf,cmp,cnt,type) \ + do { \ + int i; \ + OFI_COMPLEX(type) *b = (buf); \ + OFI_COMPLEX(type) *c = (cmp); \ + for (i = 0; i < cnt; i++) { \ + if (!ofi_complex_eq_##type (b[i], c[i])) \ + return -FI_EIO; \ + } \ + } while (0); + + +#ifdef HAVE___INT128 + +/* If __int128 supported, things just work. */ +#define FT_FILL_INT128(...) FT_FILL(__VA_ARGS__) +#define FT_CHECK_INT128(...) FT_CHECK(__VA_ARGS__) + +#else + +/* If __int128, we're not going to fill/verify. */ +#define FT_FILL_INT128(...) +#define FT_CHECK_INT128(...) + +#endif + +#define EXPAND( x ) x + +#define SWITCH_REAL_TYPES(type,FUNC,...) \ + switch (type) { \ + case FI_INT8: EXPAND( FUNC(__VA_ARGS__,int8_t) ); break; \ + case FI_UINT8: EXPAND( FUNC(__VA_ARGS__,uint8_t) ); break; \ + case FI_INT16: EXPAND( FUNC(__VA_ARGS__,int16_t) ); break; \ + case FI_UINT16: EXPAND( FUNC(__VA_ARGS__,uint16_t) ); break; \ + case FI_INT32: EXPAND( FUNC(__VA_ARGS__,int32_t) ); break; \ + case FI_UINT32: EXPAND( FUNC(__VA_ARGS__,uint32_t) ); break; \ + case FI_INT64: EXPAND( FUNC(__VA_ARGS__,int64_t) ); break; \ + case FI_UINT64: EXPAND( FUNC(__VA_ARGS__,uint64_t) ); break; \ + case FI_INT128: EXPAND( FUNC##_INT128(__VA_ARGS__,ofi_int128_t) ); break; \ + case FI_UINT128: EXPAND( FUNC##_INT128(__VA_ARGS__,ofi_uint128_t) ); break; \ + case FI_FLOAT: EXPAND( FUNC(__VA_ARGS__,float) ); break; \ + case FI_DOUBLE: EXPAND( FUNC(__VA_ARGS__,double) ); break; \ + case FI_LONG_DOUBLE: EXPAND( FUNC(__VA_ARGS__,long double) ); break; \ + default: return -FI_EOPNOTSUPP; \ + } + +#define SWITCH_COMPLEX_TYPES(type,FUNC,...) 
\ + switch (type) { \ + case FI_FLOAT_COMPLEX: EXPAND( FUNC(__VA_ARGS__,float) ); break; \ + case FI_DOUBLE_COMPLEX: EXPAND( FUNC(__VA_ARGS__,double) ); break; \ + case FI_LONG_DOUBLE_COMPLEX: EXPAND( FUNC(__VA_ARGS__,long_double) ); break;\ + default: return -FI_EOPNOTSUPP; \ + } + + #endif /* _SHARED_H_ */ diff --git a/fabtests/include/unix/osd.h b/fabtests/include/unix/osd.h index ec8ca1020fb..0c3200b0468 100644 --- a/fabtests/include/unix/osd.h +++ b/fabtests/include/unix/osd.h @@ -83,9 +83,11 @@ static inline int ofi_sockerr(void) return errno; } +typedef long double long_double; + /* complex operations implementation */ -#define OFI_COMPLEX(name) ofi_##name##_complex -#define OFI_COMPLEX_OP(name, op) ofi_complex_##name##_##op +#define OFI_COMPLEX(name) ofi_complex_##name +#define OFI_COMPLEX_OP(name, op) ofi_complex_##op##_##name #define OFI_COMPLEX_TYPE_DECL(name, type) typedef type complex OFI_COMPLEX(name); OFI_COMPLEX_TYPE_DECL(float, float) @@ -97,11 +99,11 @@ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, sum)(OFI_COMPLEX(name) v1, { \ return v1 + v2; \ } \ -static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, mul)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, prod)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1 * v2; \ } \ -static inline int OFI_COMPLEX_OP(name, equ)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline int OFI_COMPLEX_OP(name, eq)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1 == v2; \ } \ @@ -112,6 +114,18 @@ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, land)(OFI_COMPLEX(name) v1, static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1 || v2; \ +} \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lxor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2)\ +{ \ + return (v1 && !v2) || (!v1 && v2); \ +} \ +static inline void OFI_COMPLEX_OP(name, set)(OFI_COMPLEX(name) *v1, OFI_COMPLEX(name) v2) \ +{ \ + *v1 = v2; \ +} \ +static inline void OFI_COMPLEX_OP(name, fill)(OFI_COMPLEX(name) *v1, name v2) \ +{ \ + *v1 = (OFI_COMPLEX(name))((name)(v2) + I * (name)(v2)); \ } OFI_COMPLEX_OPS(float) diff --git a/fabtests/include/windows/osd.h b/fabtests/include/windows/osd.h index 564f4453c16..bc9fd781977 100644 --- a/fabtests/include/windows/osd.h +++ b/fabtests/include/windows/osd.h @@ -724,11 +724,12 @@ ofi_send_socket(SOCKET fd, const void *buf, size_t count, int flags) return (ssize_t) send(fd, (const char*) buf, len, flags); } +typedef long double long_double; /* complex operations implementation */ -#define OFI_COMPLEX(name) ofi_##name##_complex +#define OFI_COMPLEX(name) ofi_complex_##name #define OFI_COMPLEX_BASE(name) OFI_COMPLEX(name)##_base -#define OFI_COMPLEX_OP(name, op) ofi_complex_##name##_##op +#define OFI_COMPLEX_OP(name, op) ofi_complex_##op##_##name #define OFI_COMPLEX_TYPE_DECL(name, type) \ typedef type OFI_COMPLEX_BASE(name); \ typedef struct { \ @@ -754,29 +755,47 @@ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, sum)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) ret = {.re = v1.re + v2.re, .im = v1.im + v2.im}; \ return ret; \ } \ -static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, mul)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, prod)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ OFI_COMPLEX(name) ret = {.re = (v1.re * v2.re) - (v1.im * v2.im), \ .im = (v1.re * v2.im) + (v1.im * v2.re)}; \ return ret; \ } \ -static inline int 
OFI_COMPLEX_OP(name, equ)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline int OFI_COMPLEX_OP(name, eq)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1.re == v2.re && v1.im == v2.im; \ } \ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, land)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ OFI_COMPLEX(name) zero = {.re = 0, .im = 0}; \ - int equ = !OFI_COMPLEX_OP(name, equ)(v1, zero) && !OFI_COMPLEX_OP(name, equ)(v2, zero); \ + int equ = !OFI_COMPLEX_OP(name, eq)(v1, zero) && !OFI_COMPLEX_OP(name, eq)(v2, zero); \ OFI_COMPLEX(name) ret = {.re = equ ? 1.f : 0, .im = 0}; \ return ret; \ } \ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ OFI_COMPLEX(name) zero = {.re = 0, .im = 0}; \ - int equ = !OFI_COMPLEX_OP(name, equ)(v1, zero) || !OFI_COMPLEX_OP(name, equ)(v2, zero); \ + int equ = !OFI_COMPLEX_OP(name, eq)(v1, zero) || !OFI_COMPLEX_OP(name, eq)(v2, zero); \ OFI_COMPLEX(name) ret = {.re = equ ? 1.f : 0, .im = 0}; \ return ret; \ +} \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lxor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +{ \ + OFI_COMPLEX(name) zero = {.re = 0, .im = 0}; \ + int equ = (!OFI_COMPLEX_OP(name, eq)(v1, zero) && OFI_COMPLEX_OP(name, eq)(v2, zero)) || \ + (OFI_COMPLEX_OP(name, eq)(v1, zero) && !OFI_COMPLEX_OP(name, eq)(v2, zero)); \ + OFI_COMPLEX(name) ret = {.re = equ ? 1.f : 0, .im = 0}; \ + return ret; \ +} \ +static inline void OFI_COMPLEX_OP(name, set)(OFI_COMPLEX(name) *v1, OFI_COMPLEX(name) v2) \ +{ \ + v1->re = v2.re; \ + v1->im = v2.im; \ +} \ +static inline void OFI_COMPLEX_OP(name, fill)(OFI_COMPLEX(name) *v1, char v2) \ +{ \ + v1->re = v2; \ + v1->im = v2; \ } OFI_COMPLEX_OPS(float) diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index 20b200e3123..2cbfab0e5bc 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -50,9 +50,6 @@ features of libfabric. *fi_dgram* : A basic datagram endpoint example. -*fi_dgram_waitset* -: Transfers datagrams using waitsets for completion notification. - *fi_inj_complete* : Sends messages using the FI_INJECT_COMPLETE operation flag. @@ -80,10 +77,6 @@ features of libfabric. completion counters of inbound writes as the notification mechanism. -*fi_poll* -: Exchanges data over RDM endpoints using poll sets to drive - completion notifications. - *fi_rdm* : A basic RDM endpoint example. @@ -147,10 +140,13 @@ features of libfabric. buffer tries to remain the same. This test is used to validate the correct behavior of memory registration caches. -*fi_bw* -: Performs a one-sided bandwidth test with an option for data verification. - A sleep time on the receiving side can be enabled in order to allow - the sender to get ahead of the receiver. +*fi_flood* +: The test performs a one-sided transfer by utilizing Bulk Memory Region (MR) + registration and flooding the receiver with unexpected messages. This is + followed by sequential MR registration transfers, which force the MR cache + to evict the least recently used MRs before making new transfers. An optional + sleep time can be enabled on the receiving side to allow the sender to get + ahead of the receiver. *fi_rdm_multi_client* : Tests a persistent server communicating with multiple clients, one at a @@ -343,7 +339,7 @@ The following keys and respective key values may be used in the config file. 
FI_WRITE, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_TAGGED, FI_DIRECTED_RECV *mode - values OR'ed together* -: FI_CONTEXT, FI_RX_CQ_DATA +: FI_CONTEXT, FI_CONTEXT2, FI_RX_CQ_DATA *ep_type* : FI_EP_MSG, FI_EP_DGRAM, FI_EP_RDM diff --git a/fabtests/man/man1/fi_bw.1 b/fabtests/man/man1/fi_flood.1 similarity index 100% rename from fabtests/man/man1/fi_bw.1 rename to fabtests/man/man1/fi_flood.1 diff --git a/fabtests/man/man7/fabtests.7 b/fabtests/man/man7/fabtests.7 index e71d9c1f1ee..acc82feb924 100644 --- a/fabtests/man/man7/fabtests.7 +++ b/fabtests/man/man7/fabtests.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fabtests" "7" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fabtests" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -48,9 +62,6 @@ Tranfers messages with CQ data. \f[I]fi_dgram\f[R] A basic datagram endpoint example. .TP -\f[I]fi_dgram_waitset\f[R] -Transfers datagrams using waitsets for completion notification. -.TP \f[I]fi_inj_complete\f[R] Sends messages using the FI_INJECT_COMPLETE operation flag. .TP @@ -79,10 +90,6 @@ Performs data transfers over multiple endpoints in parallel. Issues RMA write operations to multiple memory regions, using completion counters of inbound writes as the notification mechanism. .TP -\f[I]fi_poll\f[R] -Exchanges data over RDM endpoints using poll sets to drive completion -notifications. -.TP \f[I]fi_rdm\f[R] A basic RDM endpoint example. .TP @@ -149,10 +156,13 @@ tries to remain the same. This test is used to validate the correct behavior of memory registration caches. .TP -\f[I]fi_bw\f[R] -Performs a one-sided bandwidth test with an option for data -verification. -A sleep time on the receiving side can be enabled in order to allow the +\f[I]fi_flood\f[R] +The test performs a one-sided transfer by utilizing Bulk Memory Region +(MR) registration and flooding the receiver with unexpected messages. +This is followed by sequential MR registration transfers, which force +the MR cache to evict the least recently used MRs before making new +transfers. +An optional sleep time can be enabled on the receiving side to allow the sender to get ahead of the receiver. .TP \f[I]fi_rdm_multi_client\f[R] @@ -275,7 +285,7 @@ FI_ENORX) can be read by the application, if RNR happens. \f[I]fi_efa_rnr_queue_resend\f[R] This test modifies the RNR retry count (rnr_retry) to 0 via fi_setopt, and then tests RNR queue/re-send logic for different packet types. -To run the test, one needs to use \f[C]-c\f[R] option to specify the +To run the test, one needs to use \f[V]-c\f[R] option to specify the category of packet types. .SS Component tests .PP @@ -455,9 +465,9 @@ The default endpoint type is rdm. Allocate data buffers on the specified device, rather than in host memory. Valid options are ze, cuda and synapseai. +.TP *-a -.IP \[bu] 2 -: The name of a shared address vector. +The name of a shared address vector. This option only applies to tests that support shared address vectors. .TP \f[I]-B \f[R] @@ -469,9 +479,9 @@ endpoints to the server. .TP \f[I]-P \f[R] Specifies the port number of the peer endpoint, overriding the default. 
+.TP *-s -.IP \[bu] 2 -: Specifies the address of the local endpoint. +Specifies the address of the local endpoint. .TP *-F Specifies the address format. diff --git a/fabtests/multinode/src/core.c b/fabtests/multinode/src/core.c index dbc21cc42e6..d1770fe7a53 100644 --- a/fabtests/multinode/src/core.c +++ b/fabtests/multinode/src/core.c @@ -87,7 +87,7 @@ static int multi_setup_fabric(int argc, char **argv) struct fi_rma_iov remote; hints->ep_attr->type = FI_EP_RDM; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; if (pm_job.transfer_method == multi_msg) { diff --git a/fabtests/multinode/src/core_coll.c b/fabtests/multinode/src/core_coll.c index 7d6b5ddfcf1..d9fe4dc0d09 100644 --- a/fabtests/multinode/src/core_coll.c +++ b/fabtests/multinode/src/core_coll.c @@ -524,7 +524,7 @@ static inline void setup_hints(void) { hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_COLLECTIVE; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; } diff --git a/fabtests/multinode/src/harness.c b/fabtests/multinode/src/harness.c index 0df5417521e..959a5940387 100644 --- a/fabtests/multinode/src/harness.c +++ b/fabtests/multinode/src/harness.c @@ -355,7 +355,7 @@ int main(int argc, char **argv) opts = INIT_OPTS; opts.options |= FT_OPT_SIZE | FT_OPT_OOB_ADDR_EXCH | - FT_OPT_DISABLE_TAG_VALIDATION; + FT_OPT_ADDR_IS_OOB | FT_OPT_DISABLE_TAG_VALIDATION; pm_job.clients = NULL; pm_job.pattern = -1; diff --git a/fabtests/prov/efa/Makefile.include b/fabtests/prov/efa/Makefile.include index 895885e9e54..f9d2d343354 100644 --- a/fabtests/prov/efa/Makefile.include +++ b/fabtests/prov/efa/Makefile.include @@ -30,11 +30,15 @@ # SOFTWARE. # +if ENABLE_EFA bin_PROGRAMS += prov/efa/src/fi_efa_rnr_read_cq_error \ prov/efa/src/fi_efa_rnr_queue_resend \ prov/efa/src/fi_efa_info_test if HAVE_VERBS_DEVEL bin_PROGRAMS += prov/efa/src/fi_efa_exhaust_mr_reg_rdm_pingpong +if BUILD_EFA_RDMA_CHECKER +bin_PROGRAMS += prov/efa/src/fi_efa_rdma_checker +endif BUILD_EFA_RDMA_CHECKER endif HAVE_VERBS_DEVEL efa_rnr_srcs = \ @@ -65,4 +69,13 @@ prov_efa_src_fi_efa_exhaust_mr_reg_rdm_pingpong_SOURCES = \ $(efa_exhaust_mr_reg_srcs) \ $(benchmarks_srcs) prov_efa_src_fi_efa_exhaust_mr_reg_rdm_pingpong_LDADD = libfabtests.la + +if BUILD_EFA_RDMA_CHECKER +prov_efa_src_fi_efa_rdma_checker_SOURCES = \ + prov/efa/src/efa_rdma_checker.c +prov_efa_src_fi_efa_rdma_checker_LDADD = libfabtests.la +prov_efa_src_fi_efa_rdma_checker_LDFLAGS = -lefa +endif BUILD_EFA_RDMA_CHECKER + endif HAVE_VERBS_DEVEL +endif ENABLE_EFA diff --git a/fabtests/prov/efa/configure.m4 b/fabtests/prov/efa/configure.m4 new file mode 100644 index 00000000000..b8252b209f5 --- /dev/null +++ b/fabtests/prov/efa/configure.m4 @@ -0,0 +1,37 @@ +dnl +dnl SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +dnl SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. +dnl +dnl Configure specific to the fabtests Amazon EFA provider + + +dnl Checks for presence of efadv verbs. Needed for building tests that call efadv verbs.
+have_efadv=0 +AC_CHECK_HEADER([infiniband/efadv.h], + [AC_CHECK_LIB(efa, efadv_query_device, + [have_efadv=1])]) + +efa_rdma_checker_happy=0 +AS_IF([test x"$have_efadv" = x"1"], [ + efa_rdma_checker_happy=1 + AC_CHECK_MEMBER(struct efadv_device_attr.max_rdma_size, + [], + [efa_rdma_checker_happy=0], + [[#include <infiniband/efadv.h>]]) + + AC_CHECK_MEMBER(struct efadv_device_attr.device_caps, + [], + [efa_rdma_checker_happy=0], + [[#include <infiniband/efadv.h>]]) + + AC_CHECK_DECL(EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE, + [], + [efa_rdma_checker_happy=0], + [[#include <infiniband/efadv.h>]]) + + AC_CHECK_DECL(EFADV_DEVICE_ATTR_CAPS_UNSOLICITED_WRITE_RECV, + [], + [efa_rdma_checker_happy=0], + [[#include <infiniband/efadv.h>]]) +]) +AM_CONDITIONAL([BUILD_EFA_RDMA_CHECKER], [test $efa_rdma_checker_happy -eq 1]) diff --git a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c index 9cde8bc43a3..72da2063d7a 100644 --- a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c +++ b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c @@ -79,9 +79,8 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->addr_format = opts.address_format; ret = ft_init_fabric(); diff --git a/fabtests/prov/efa/src/efa_rdma_checker.c b/fabtests/prov/efa/src/efa_rdma_checker.c new file mode 100644 index 00000000000..9215cc63d91 --- /dev/null +++ b/fabtests/prov/efa/src/efa_rdma_checker.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <infiniband/verbs.h> +#include <infiniband/efadv.h> +#include <shared.h> + +enum rdma_op { + READ, + WRITE, + UNSOLICITED_WRITE_RECV, +}; + +/* + * Check whether rdma read/write is enabled on the instance by querying the rdma device. + * Return 0 if rdma read/write is enabled, otherwise return a non-zero value. + */ +int main(int argc, char *argv[]) +{ + struct ibv_device **device_list; + struct ibv_context *ibv_ctx; + struct ibv_device_attr_ex ibv_dev_attr = {0}; + struct efadv_device_attr efadv_attr = {0}; + int dev_cnt; + int err, opt; + enum rdma_op op = READ; + + while ((opt = getopt(argc, argv, "ho:")) != -1) { + switch (opt) { + case 'o': + if (!strcasecmp(optarg, "read")) { + op = READ; + } else if (!strcasecmp(optarg, "write")) { + op = WRITE; + } else if (!strcasecmp(optarg, "writedata")) { + op = UNSOLICITED_WRITE_RECV; + } else { + fprintf(stderr, "Unknown operation '%s'. 
Allowed: read | write | writedata\n", optarg); + return EXIT_FAILURE; + } + break; + case '?': + case 'h': + default: + fprintf(stderr, "Usage:\n"); + FT_PRINT_OPTS_USAGE("fi_efa_rdma_checker -o <operation>", "rdma operation type: read | write | writedata"); + return EXIT_FAILURE; + } + } + + device_list = ibv_get_device_list(&dev_cnt); + if (dev_cnt <= 0) { + fprintf(stderr, "No ibv device found!\n"); + return -ENODEV; + } + + ibv_ctx = ibv_open_device(device_list[0]); + if (!ibv_ctx) { + fprintf(stderr, "cannot open device %d\n", 0); + return EXIT_FAILURE; + } + + err = ibv_query_device_ex(ibv_ctx, NULL, &ibv_dev_attr); + if (!err) { + fprintf(stdout, "ibv_dev_attr.device_cap_flags_ex: %lx\n", ibv_dev_attr.device_cap_flags_ex); + } + + err = efadv_query_device(ibv_ctx, (struct efadv_device_attr *)&efadv_attr, sizeof(efadv_attr)); + ibv_close_device(ibv_ctx); + if (err) { + fprintf(stderr, "cannot query device\n"); + goto out; + } + + if (efadv_attr.max_rdma_size == 0) { + fprintf(stderr, "rdma is not enabled \n"); + err = EXIT_FAILURE; + goto out; + } + fprintf(stdout, "rdma read is enabled \n"); + fprintf(stdout, "efa_dev_attr.max_rdma_size: %d\n", efadv_attr.max_rdma_size); + + if (op == READ) + goto out; + + if (op == WRITE) { + if (efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE) { + fprintf(stdout, "rdma write is enabled \n"); + } else { + fprintf(stderr, "rdma write is NOT enabled \n"); + err = 1; + } + goto out; + } + + if (op == UNSOLICITED_WRITE_RECV) { + if (efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_UNSOLICITED_WRITE_RECV) { + fprintf(stdout, + "rdma unsolicited write recv is enabled \n"); + } else { + fprintf(stderr, "rdma unsolicited write recv is NOT " + "enabled \n"); + err = 1; + } + } + +out: + ibv_free_device_list(device_list); + return err; +} diff --git a/fabtests/prov/efa/src/rdm_rnr_queue_resend.c b/fabtests/prov/efa/src/rdm_rnr_queue_resend.c index 9a8889ca4cf..31139165d45 100644 --- a/fabtests/prov/efa/src/rdm_rnr_queue_resend.c +++ b/fabtests/prov/efa/src/rdm_rnr_queue_resend.c @@ -146,7 +146,7 @@ static int trigger_rnr_queue_resend(enum fi_op atomic_op, void *result, void *co struct fid_mr *mr_result, struct fid_mr *mr_compare) { int i, ret; - struct fi_context fi_ctx_atomic; + struct fi_context2 fi_ctx_atomic; if (opts.rma_op) { for (i = 0; i < global_expected_rnr_error; i++) { @@ -434,7 +434,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps |= FI_MSG | FI_RMA | FI_ATOMICS; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; /* FI_RM_ENABLED is required for queue/resend logic to happen in RNR case */ diff --git a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c index 1eb11acdbaa..85e0e67db1d 100644 --- a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c +++ b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c @@ -43,7 +43,6 @@ static int rnr_read_cq_error(void) { int total_send, expected_rnr_error; int ret, i, cnt, rnr_flag; - const char *prov_errmsg; expected_rnr_error = fi->rx_attr->size; rnr_flag = 0; @@ -89,16 +88,6 @@ static int rnr_read_cq_error(void) rnr_flag = 1; printf("Got RNR error CQ entry as expected: %d, %s\n", comp_err.err, fi_strerror(comp_err.err)); - prov_errmsg = fi_cq_strerror(txcq, comp_err.prov_errno, - comp_err.err_data, - comp_err.buf, - comp_err.len); - if (strstr(prov_errmsg, "Destination resource not ready") == NULL) { - printf("Got unexpected provider error message.\n"); - printf(" Expected 
error message to have \"Destination resource not ready\" in it\n"); - printf(" Got: %s\n", prov_errmsg); - return -FI_EINVAL; - } } else { printf("Got non-RNR error CQ entry: %d, %s\n", comp_err.err, fi_strerror(comp_err.err)); @@ -187,7 +176,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; /* FI_RM_DISABLED is required to get RNR error CQ entry */ diff --git a/fabtests/prov/lpp/Makefile.include b/fabtests/prov/lpp/Makefile.include index 1d831f2372d..ac7d69b0e30 100644 --- a/fabtests/prov/lpp/Makefile.include +++ b/fabtests/prov/lpp/Makefile.include @@ -30,6 +30,8 @@ # SOFTWARE. # +if ENABLE_LPP + LPP_REGRESSION_SRCS = prov/lpp/src/rcq_data.c \ prov/lpp/src/main.c \ prov/lpp/src/ipc.c \ @@ -64,3 +66,5 @@ endif prov_lpp_src_lpp_regression_SOURCES = $(LPP_REGRESSION_SRCS) prov_lpp_src_lpp_regression_LDADD = libfabtests.la + +endif ENABLE_LPP diff --git a/fabtests/prov/lpp/src/atomic.c b/fabtests/prov/lpp/src/atomic.c index 0603d045ab2..ba8b2966402 100644 --- a/fabtests/prov/lpp/src/atomic.c +++ b/fabtests/prov/lpp/src/atomic.c @@ -32,7 +32,7 @@ #include "test_util.h" -const static uint64_t context = 0xabce; +static const uint64_t context = 0xabce; int run_simple_atomic_write(struct rank_info *ri) { diff --git a/fabtests/prov/lpp/src/main.c b/fabtests/prov/lpp/src/main.c index 56eec613958..f27cce46234 100644 --- a/fabtests/prov/lpp/src/main.c +++ b/fabtests/prov/lpp/src/main.c @@ -78,7 +78,7 @@ enum node_id my_node; // Note: the two large RMA tests are intentionally far apart to reduce the // chances they run simultaneously. On configs with small IOVAs spaces, this // can be a problem. This only matters when running with -p > 1, of course. -const static struct test testlist[] = { +static const struct test testlist[] = { { run_simple_rma_write, "simple_rma_write" }, { run_offset_rma_write, "offset_rma_write" }, { run_inject_rma_write, "inject_rma_write" }, @@ -273,7 +273,7 @@ static void *worker_thread(void *arg) return (void*)1; } -static void inline populate_filtered_testlist(const struct test* tlist, +static inline void populate_filtered_testlist(const struct test* tlist, size_t num_tests) { for (int i = 0; i < num_tests; i++) { @@ -320,6 +320,7 @@ static void run_tests(int parallel) // iteration. 
ret = pthread_barrier_init(&_barrier, NULL, nthreads + 1); assert(ret == 0); + (void) ret; /* suppress compiler warning for non-debug build */ pthread_t *threads = calloc(nthreads, sizeof(pthread_t)); assert(threads); diff --git a/fabtests/prov/lpp/src/msg.c b/fabtests/prov/lpp/src/msg.c index f1e01b79ae2..a9fadead52d 100644 --- a/fabtests/prov/lpp/src/msg.c +++ b/fabtests/prov/lpp/src/msg.c @@ -34,7 +34,7 @@ #include "test_util.h" -const static uint64_t context = 0xabcd; +static const uint64_t context = 0xabcd; int run_simple_msg(struct rank_info *ri) { diff --git a/fabtests/prov/lpp/src/rcq_data.c b/fabtests/prov/lpp/src/rcq_data.c index 9a6c14c1472..a3ca76f8d03 100644 --- a/fabtests/prov/lpp/src/rcq_data.c +++ b/fabtests/prov/lpp/src/rcq_data.c @@ -1,6 +1,7 @@ #include "test_util.h" -const static uint64_t context = 0xabcd; +static const uint64_t context = 0xabcd; +#define BUF_NUM 4 int run_fi_tsenddata(struct rank_info *ri){ struct wait_tx_cq_params wait_tx_cq_params = { 0 }; @@ -8,23 +9,16 @@ int run_fi_tsenddata(struct rank_info *ri){ struct verify_buf_params verify_buf_params = { 0 }; struct mr_params mr_params = { 0 }; struct ep_params ep_params = { 0 }; - const size_t buff_lens[] = { (1<<15), (1<<14), 1024, 64 }; + const size_t buff_lens[BUF_NUM] = { (1<<15), (1<<14), 1024, 64 }; + const uint64_t tags[BUF_NUM] = {0xffff0001, 0xffff0002, 0xffff0003, 0xffff0004}; + uint64_t rcq_data[BUF_NUM] = { 0x1000, 0x2000, 0x3000, 0x4000}; struct rank_info *pri = NULL; - const uint64_t tags[] = {0xffff0001, 0xffff0002, 0xffff0003, 0xffff0004}; - uint64_t rcq_data[] = { 0x1000, 0x2000, 0x3000, 0x4000}; - size_t ndata = sizeof(rcq_data)/sizeof(*rcq_data); - size_t nbufflens = sizeof(buff_lens)/sizeof(*buff_lens); - - if (ndata == nbufflens) - return -EINVAL; - - for(int i = 0; i < ndata; i++){ - rcq_data[i] += ri->iteration; - } TRACE(ri, util_init(ri)); - for(int i = 0; i < ndata; i++){ + for (int i = 0; i < BUF_NUM; i++) { mr_params.idx = i; mr_params.length = buff_lens[i]; mr_params.access = FI_SEND | FI_RECV; @@ -34,10 +28,9 @@ int run_fi_tsenddata(struct rank_info *ri){ ep_params.idx = 0; TRACE(ri, util_create_ep(ri, &ep_params)); - TRACE(ri, util_sync(ri, &pri)); - for(int i= 0; i < ndata; i++){ - INSIST_FI_EQ(ri, fi_tsenddata(ri->ep_info[0].fid, ri->mr_info[i].uaddr, diff --git a/fabtests/prov/lpp/src/rma.c b/fabtests/prov/lpp/src/rma.c index 3ef52f54998..e7332155fbf 100644 --- a/fabtests/prov/lpp/src/rma.c +++ b/fabtests/prov/lpp/src/rma.c @@ -32,7 +32,7 @@ #include "test_util.h" -const static uint64_t context = 0xabcd; +static const uint64_t context = 0xabcd; static int simple_rma_write_common(struct rank_info *ri, size_t buffer_len) { diff --git a/fabtests/prov/lpp/src/test_util.c b/fabtests/prov/lpp/src/test_util.c index af54db7327f..2d3b55204f0 100644 --- a/fabtests/prov/lpp/src/test_util.c +++ b/fabtests/prov/lpp/src/test_util.c @@ -124,6 +124,7 @@ void util_init(struct rank_info *ri) hints.domain_attr = &domain_attr; hints.ep_attr->type = FI_EP_RDM; + hints.ep_attr->protocol = FI_PROTO_LPP; // TODO: Run some tests with more surgical application of caps (e.g., // only FI_MSG and FI_SEND for the sending side endpoint). 
hints.caps = FI_ATOMIC | FI_RMA | FI_MSG | FI_TAGGED | FI_READ | @@ -131,15 +132,16 @@ void util_init(struct rank_info *ri) hints.mode = 0; hints.fabric_attr->prov_name = "lpp"; - hints.domain_attr->mr_mode = FI_MR_BASIC; + hints.domain_attr->mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP; - rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &ri->fi); + rc = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, 0, &hints, &ri->fi); if (rc == -ENODATA) { warn("Failed to find provider with FI_HMEM, trying again without\n"); hints.caps &= ~FI_HMEM; INSIST_FI_EQ(ri, - fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, - &hints, &ri->fi), + fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, 0, &hints, &ri->fi), 0); } diff --git a/fabtests/prov/lpp/src/test_util.h b/fabtests/prov/lpp/src/test_util.h index 7946acc8596..6f4cb4143d0 100644 --- a/fabtests/prov/lpp/src/test_util.h +++ b/fabtests/prov/lpp/src/test_util.h @@ -167,8 +167,8 @@ static inline struct fi_context *get_ctx_simple(struct rank_info *ri, } void free_ctx_tree(struct rank_info *ri); -const static unsigned int seed_node_a = 1234; -const static unsigned int seed_node_b = 9876; +static const unsigned int seed_node_a = 1234; +static const unsigned int seed_node_b = 9876; #ifdef USE_HMEM void hmem_init(void); diff --git a/fabtests/pytest/common.py b/fabtests/pytest/common.py index a6f50fcc9f4..ef3ad8b22da 100644 --- a/fabtests/pytest/common.py +++ b/fabtests/pytest/common.py @@ -68,7 +68,7 @@ def num_cuda_devices(ip): @functools.lru_cache(10) @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) def num_neuron_devices(ip): - proc = run("ssh {} neuron-ls -j".format(ip), shell=True, + proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, encoding="utf-8") @@ -84,7 +84,7 @@ def num_neuron_devices(ip): @functools.lru_cache(10) @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) def num_neuron_cores_on_device(ip, device_id): - proc = run("ssh {} neuron-ls -j".format(ip), shell=True, + proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, encoding="utf-8") @@ -97,7 +97,7 @@ def num_neuron_cores_on_device(ip, device_id): @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) def is_neuron_device_available(ip, device_id): - proc = run("ssh {} neuron-ls -j".format(ip), shell=True, + proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, encoding="utf-8") @@ -455,19 +455,26 @@ def prepare_base_command(self, command_type, executable, if "PYTEST_XDIST_WORKER" in os.environ: worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", "")) hmem_device_id = worker_id % num_hmem - if host_memory_type == "cuda": - command += " -i {}".format(hmem_device_id) - else: - assert host_memory_type == "neuron" - num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id) + else: + hmem_device_id = 0 + + if host_memory_type == "cuda": + command += " -i {}".format(hmem_device_id) + else: + assert host_memory_type == "neuron" + num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id) + if command_type == "server": additional_environment = "NEURON_RT_VISIBLE_CORES={}".format( hmem_device_id * num_cores) - wait_until_neuron_device_available(host_ip, 
hmem_device_id) + else: + additional_environment = "NEURON_RT_VISIBLE_CORES={}".format( + hmem_device_id * num_cores + 1) + wait_until_neuron_device_available(host_ip, hmem_device_id) - if self._cmdline_args.provider == "efa": - import efa.efa_common - efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem) - command += " -d {}-rdm".format(efa_device) + if self._cmdline_args.provider == "efa": + import efa.efa_common + efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem) + command += " -d {}-rdm".format(efa_device) return command, additional_environment diff --git a/fabtests/pytest/default/test_dgram.py b/fabtests/pytest/default/test_dgram.py index af118f6fcab..3cffa601532 100644 --- a/fabtests/pytest/default/test_dgram.py +++ b/fabtests/pytest/default/test_dgram.py @@ -12,12 +12,6 @@ def test_dgram(cmdline_args): test = ClientServerTest(cmdline_args, "fi_dgram") test.run() -@pytest.mark.functional -def test_dgram_waitset(cmdline_args): - from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_dgram_waitset") - test.run() - @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) diff --git a/fabtests/pytest/default/test_poll.py b/fabtests/pytest/default/test_poll.py deleted file mode 100644 index a3aa6ec35fc..00000000000 --- a/fabtests/pytest/default/test_poll.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -@pytest.mark.functional -@pytest.mark.parametrize("poll_type", ["queue", "counter"]) -def test_poll(cmdline_args, poll_type): - from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_poll -t " + poll_type) - test.run() - diff --git a/fabtests/pytest/default/test_rdm.py b/fabtests/pytest/default/test_rdm.py index 1978006db21..0236fbd8b79 100644 --- a/fabtests/pytest/default/test_rdm.py +++ b/fabtests/pytest/default/test_rdm.py @@ -39,7 +39,7 @@ def test_rdm_shared_av(cmdline_args): @pytest.mark.functional def test_rdm_bw_functional(cmdline_args, completion_semantic): from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_bw -e rdm -v -T 1", completion_semantic=completion_semantic) + test = ClientServerTest(cmdline_args, "fi_flood -e rdm -v -T 1", completion_semantic=completion_semantic) test.run() @pytest.mark.parametrize("iteration_type", diff --git a/fabtests/pytest/efa/conftest.py b/fabtests/pytest/efa/conftest.py index 8874b53a0a7..2871a9b8ca9 100644 --- a/fabtests/pytest/efa/conftest.py +++ b/fabtests/pytest/efa/conftest.py @@ -1,16 +1,54 @@ import pytest +from efa_common import has_rdma +# The memory types for bi-directional tests. 
+memory_type_list_bi_dir = [ + pytest.param("host_to_host"), + pytest.param("host_to_cuda", marks=pytest.mark.cuda_memory), + pytest.param("cuda_to_cuda", marks=pytest.mark.cuda_memory), + pytest.param("host_to_neuron", marks=pytest.mark.neuron_memory), + pytest.param("neuron_to_neuron", marks=pytest.mark.neuron_memory), +] -@pytest.fixture(scope="module", params=["host_to_host", - pytest.param("host_to_cuda", marks=pytest.mark.cuda_memory), - pytest.param("cuda_to_host", marks=pytest.mark.cuda_memory), - pytest.param("cuda_to_cuda", marks=pytest.mark.cuda_memory), - pytest.param("neuron_to_neuron", marks=pytest.mark.neuron_memory), - pytest.param("neuron_to_host", marks=pytest.mark.neuron_memory), - pytest.param("host_to_neuron", marks=pytest.mark.neuron_memory)]) +# Add more memory types that are useful for uni-directional tests. +memory_type_list_all = memory_type_list_bi_dir + [ + pytest.param("cuda_to_host", marks=pytest.mark.cuda_memory), + pytest.param("neuron_to_host", marks=pytest.mark.neuron_memory), +] + +@pytest.fixture(scope="module", params=memory_type_list_all) def memory_type(request): return request.param +@pytest.fixture(scope="module", params=memory_type_list_bi_dir) +def memory_type_bi_dir(request): + return request.param + +@pytest.fixture(scope="module", params=["read", "writedata", "write"]) +def rma_operation_type(request): + return request.param + +@pytest.fixture(scope="module") +def rma_bw_memory_type(memory_type, rma_operation_type): + is_test_bi_dir = False if rma_operation_type == "writedata" else True + if is_test_bi_dir and (memory_type not in [_.values[0] for _ in memory_type_list_bi_dir]): + pytest.skip("Duplicated memory type for bi-directional test") + return memory_type + +@pytest.fixture(scope="function") +def rma_bw_completion_semantic(cmdline_args, completion_semantic, rma_operation_type): + if completion_semantic != 'delivery_complete': + # There is no difference between DC and non-DC for read as it's + # not a transmission + if rma_operation_type == 'read': + pytest.skip("Duplicate completion semantic for fi_read test") + assert rma_operation_type in ['write', 'writedata'] + # If the device supports rdma write, all the transmissions are DC + if has_rdma(cmdline_args, 'write'): + pytest.skip("Duplicate completion semantic for fi_write* test") + return completion_semantic + + @pytest.fixture(scope="module", params=["r:0,4,64", "r:4048,4,4148", "r:8000,4,9000", diff --git a/fabtests/pytest/efa/efa_common.py b/fabtests/pytest/efa/efa_common.py index 4f5da4faf02..6f5e311a97f 100644 --- a/fabtests/pytest/efa/efa_common.py +++ b/fabtests/pytest/efa/efa_common.py @@ -1,3 +1,4 @@ +import os import subprocess import functools from common import SshConnectionError, is_ssh_connection_error, has_ssh_connection_err_msg, ClientServerTest @@ -66,6 +67,29 @@ def has_gdrcopy(hostname): process = subprocess.run(command, shell=True, check=False, stdout=subprocess.PIPE) return process.returncode == 0 +def has_rdma(cmdline_args, operation): + """ + determine whether the efa device on the server host has the given rdma operation enabled + cmdline_args: the test command line arguments, used for the server host, binpath and timeout + operation: rdma operation name, allowed values are read, write and writedata + return: a boolean + """ + assert operation in ["read", "write", "writedata"] + binpath = cmdline_args.binpath or "" + cmd = "timeout " + str(cmdline_args.timeout) \ + + " " + os.path.join(binpath, f"fi_efa_rdma_checker -o {operation}") + if cmdline_args.environments: + cmd = cmdline_args.environments + " " + cmd + proc = subprocess.run("ssh {} {}".format(cmdline_args.server_id, cmd),
stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + universal_newlines=True) + if has_ssh_connection_err_msg(proc.stdout): + raise SshConnectionError() + + return proc.returncode == 0 + def efa_retrieve_gid(hostname): """ return the GID of efa device on a host diff --git a/fabtests/pytest/efa/test_efa_protocol_selection.py b/fabtests/pytest/efa/test_efa_protocol_selection.py index 76212febc10..949f2982304 100644 --- a/fabtests/pytest/efa/test_efa_protocol_selection.py +++ b/fabtests/pytest/efa/test_efa_protocol_selection.py @@ -1,6 +1,6 @@ import pytest -from efa.efa_common import has_gdrcopy +from efa.efa_common import has_gdrcopy, has_rdma # TODO Expand this test to run on all memory types (and rename) @@ -17,6 +17,9 @@ def test_transfer_with_read_protocol_cuda(cmdline_args, fabtest_name, cntrl_env_ from common import has_cuda, has_hmem_support from efa.efa_common import efa_run_client_server_test, efa_retrieve_hw_counter_value + if cntrl_env_var == "FI_EFA_INTER_MIN_READ_WRITE_SIZE" and has_rdma(cmdline_args, "write"): + pytest.skip("FI_EFA_INTER_MIN_READ_WRITE_SIZE is only applied to emulated write protocols") + if cmdline_args.server_id == cmdline_args.client_id: pytest.skip("No read for intra-node communication") diff --git a/fabtests/pytest/efa/test_flood_peer.py b/fabtests/pytest/efa/test_flood_peer.py index d49cfdd1c63..ee321e007f2 100644 --- a/fabtests/pytest/efa/test_flood_peer.py +++ b/fabtests/pytest/efa/test_flood_peer.py @@ -3,6 +3,6 @@ @pytest.mark.functional def test_flood_peer(cmdline_args): from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_bw -e rdm -W 6400 -S 512 -T 5", + test = ClientServerTest(cmdline_args, "fi_flood -e rdm -W 6400 -S 512 -T 5", timeout=300) test.run() diff --git a/fabtests/pytest/efa/test_multi_ep.py b/fabtests/pytest/efa/test_multi_ep.py index 561919f1446..634529f0067 100644 --- a/fabtests/pytest/efa/test_multi_ep.py +++ b/fabtests/pytest/efa/test_multi_ep.py @@ -6,6 +6,6 @@ def test_multi_ep(cmdline_args, shared_cq): from common import ClientServerTest cmd = "fi_multi_ep -e rdm" if shared_cq: - cmd += " --shared-cq" + cmd += " -Q" test = ClientServerTest(cmdline_args, cmd) test.run() diff --git a/fabtests/pytest/efa/test_rdm.py b/fabtests/pytest/efa/test_rdm.py index d1a553abca7..d42dc6dea56 100644 --- a/fabtests/pytest/efa/test_rdm.py +++ b/fabtests/pytest/efa/test_rdm.py @@ -9,21 +9,21 @@ @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): +def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type_bi_dir, completion_type): command = "fi_rdm_pingpong" + " " + perf_progress_model_cli efa_run_client_server_test(cmdline_args, command, iteration_type, - completion_semantic, memory_type, "all", completion_type=completion_type) + completion_semantic, memory_type_bi_dir, "all", completion_type=completion_type) @pytest.mark.functional @pytest.mark.serial -def test_mr_exhaustion_rdm_pingpong(cmdline_args): +def test_mr_exhaustion_rdm_pingpong(cmdline_args, completion_semantic): efa_run_client_server_test(cmdline_args, "fi_efa_exhaust_mr_reg_rdm_pingpong", "short", - "transmit_complete", "host_to_host", "all", timeout=1000) + completion_semantic, "host_to_host", "all", timeout=1000) @pytest.mark.functional -def test_rdm_pingpong_range(cmdline_args, completion_semantic, memory_type, 
message_size): +def test_rdm_pingpong_range(cmdline_args, completion_semantic, memory_type_bi_dir, message_size): efa_run_client_server_test(cmdline_args, "fi_rdm_pingpong", "short", - completion_semantic, memory_type, message_size) + completion_semantic, memory_type_bi_dir, message_size) @pytest.mark.functional def test_rdm_pingpong_no_inject_range(cmdline_args, completion_semantic, inject_message_size): @@ -33,15 +33,15 @@ def test_rdm_pingpong_no_inject_range(cmdline_args, completion_semantic, inject_ @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rdm_tagged_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): +def test_rdm_tagged_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type_bi_dir, completion_type): command = "fi_rdm_tagged_pingpong" + " " + perf_progress_model_cli efa_run_client_server_test(cmdline_args, command, iteration_type, - completion_semantic, memory_type, "all", completion_type=completion_type) + completion_semantic, memory_type_bi_dir, "all", completion_type=completion_type) @pytest.mark.functional -def test_rdm_tagged_pingpong_range(cmdline_args, completion_semantic, memory_type, message_size): +def test_rdm_tagged_pingpong_range(cmdline_args, completion_semantic, memory_type_bi_dir, message_size): efa_run_client_server_test(cmdline_args, "fi_rdm_tagged_pingpong", "short", - completion_semantic, memory_type, message_size) + completion_semantic, memory_type_bi_dir, message_size) @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), @@ -116,13 +116,13 @@ def test_rdm_pingpong_1G(cmdline_args, completion_semantic): memory_type="host_to_host", warmup_iteration_type=0) @pytest.mark.functional -def test_rdm_pingpong_zcpy_recv(cmdline_args, memory_type, zcpy_recv_max_msg_size, zcpy_recv_message_size): +def test_rdm_pingpong_zcpy_recv(cmdline_args, memory_type_bi_dir, zcpy_recv_max_msg_size, zcpy_recv_message_size): if cmdline_args.server_id == cmdline_args.client_id: pytest.skip("no zero copy recv for intra-node communication") cmdline_args_copy = copy.copy(cmdline_args) cmdline_args_copy.append_environ("FI_EFA_ENABLE_SHM_TRANSFER=0") efa_run_client_server_test(cmdline_args_copy, f"fi_rdm_pingpong --max-msg-size {zcpy_recv_max_msg_size}", - "short", "transmit_complete", memory_type, zcpy_recv_message_size) + "short", "transmit_complete", memory_type_bi_dir, zcpy_recv_message_size) @pytest.mark.functional def test_rdm_bw_zcpy_recv(cmdline_args, memory_type, zcpy_recv_max_msg_size, zcpy_recv_message_size): @@ -132,3 +132,12 @@ def test_rdm_bw_zcpy_recv(cmdline_args, memory_type, zcpy_recv_max_msg_size, zcp cmdline_args_copy.append_environ("FI_EFA_ENABLE_SHM_TRANSFER=0") efa_run_client_server_test(cmdline_args_copy, f"fi_rdm_bw --max-msg-size {zcpy_recv_max_msg_size}", "short", "transmit_complete", memory_type, zcpy_recv_message_size) + +@pytest.mark.functional +def test_rdm_bw_zcpy_recv_use_fi_more(cmdline_args, memory_type, zcpy_recv_max_msg_size, zcpy_recv_message_size): + if cmdline_args.server_id == cmdline_args.client_id: + pytest.skip("no zero copy recv for intra-node communication") + cmdline_args_copy = copy.copy(cmdline_args) + cmdline_args_copy.append_environ("FI_EFA_ENABLE_SHM_TRANSFER=0") + efa_run_client_server_test(cmdline_args_copy, f"fi_rdm_bw --use-fi-more --max-msg-size {zcpy_recv_max_msg_size}", + "short", "transmit_complete", memory_type, 
zcpy_recv_message_size) diff --git a/fabtests/pytest/efa/test_rma_bw.py b/fabtests/pytest/efa/test_rma_bw.py index 58f26367c7f..98ff0a3b0d2 100644 --- a/fabtests/pytest/efa/test_rma_bw.py +++ b/fabtests/pytest/efa/test_rma_bw.py @@ -4,48 +4,44 @@ import copy -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rma_bw(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): +def test_rma_bw(cmdline_args, iteration_type, rma_operation_type, rma_bw_completion_semantic, rma_bw_memory_type): command = "fi_rma_bw -e rdm" - command = command + " -o " + operation_type + " " + perf_progress_model_cli + command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) + efa_run_client_server_test(cmdline_args, command, iteration_type, rma_bw_completion_semantic, rma_bw_memory_type, "all", timeout=timeout) -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) @pytest.mark.parametrize("env_vars", [["FI_EFA_TX_SIZE=64"], ["FI_EFA_RX_SIZE=64"], ["FI_EFA_TX_SIZE=64", "FI_EFA_RX_SIZE=64"]]) -def test_rma_bw_small_tx_rx(cmdline_args, operation_type, completion_semantic, memory_type, env_vars): +def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, rma_bw_completion_semantic, rma_bw_memory_type, env_vars): cmdline_args_copy = copy.copy(cmdline_args) for env_var in env_vars: cmdline_args_copy.append_environ(env_var) # Use a window size larger than tx/rx size command = "fi_rma_bw -e rdm -W 128" - command = command + " -o " + operation_type + " " + perf_progress_model_cli + command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args_copy.timeout) - efa_run_client_server_test(cmdline_args_copy, command, "short", completion_semantic, memory_type, "all", timeout=timeout) + efa_run_client_server_test(cmdline_args_copy, command, "short", rma_bw_completion_semantic, rma_bw_memory_type, "all", timeout=timeout) @pytest.mark.functional -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) -def test_rma_bw_range(cmdline_args, operation_type, completion_semantic, message_size, memory_type): +def test_rma_bw_range(cmdline_args, rma_operation_type, rma_bw_completion_semantic, message_size, rma_bw_memory_type): command = "fi_rma_bw -e rdm" - command = command + " -o " + operation_type + command = command + " -o " + rma_operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, message_size, timeout=timeout) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, rma_bw_memory_type, message_size, timeout=timeout) @pytest.mark.functional -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) -def test_rma_bw_range_no_inject(cmdline_args, operation_type, completion_semantic, inject_message_size): +def test_rma_bw_range_no_inject(cmdline_args, rma_operation_type, rma_bw_completion_semantic, inject_message_size): command = 
"fi_rma_bw -e rdm -j 0" - command = command + " -o " + operation_type + command = command + " -o " + rma_operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, "host_to_host", inject_message_size, timeout=timeout) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, "host_to_host", inject_message_size, timeout=timeout) # This test is run in serial mode because it takes a lot of memory @@ -53,22 +49,22 @@ def test_rma_bw_range_no_inject(cmdline_args, operation_type, completion_semanti @pytest.mark.functional # TODO Add "writedata", "write" back in when EFA firmware bug is fixed @pytest.mark.parametrize("operation_type", ["read"]) -def test_rma_bw_1G(cmdline_args, operation_type, completion_semantic): +def test_rma_bw_1G(cmdline_args, operation_type, rma_bw_completion_semantic): # Default window size is 64 resulting in 128GB being registered, which # exceeds max number of registered host pages timeout = max(540, cmdline_args.timeout) command = "fi_rma_bw -e rdm -W 1" command = command + " -o " + operation_type efa_run_client_server_test(cmdline_args, command, 2, - completion_semantic=completion_semantic, message_size=1073741824, + completion_semantic=rma_bw_completion_semantic, message_size=1073741824, memory_type="host_to_host", warmup_iteration_type=0, timeout=timeout) @pytest.mark.functional @pytest.mark.parametrize("operation_type", ["writedata", "write"]) -def test_rma_bw_use_fi_more(cmdline_args, operation_type, completion_semantic, inject_message_size): +def test_rma_bw_use_fi_more(cmdline_args, operation_type, rma_bw_completion_semantic, inject_message_size): command = "fi_rma_bw -e rdm -j 0 --use-fi-more" command = command + " -o " + operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, "host_to_host", inject_message_size, timeout=timeout) diff --git a/fabtests/pytest/efa/test_rma_pingpong.py b/fabtests/pytest/efa/test_rma_pingpong.py index 29afcf4e062..7d028f9a09a 100644 --- a/fabtests/pytest/efa/test_rma_pingpong.py +++ b/fabtests/pytest/efa/test_rma_pingpong.py @@ -14,23 +14,23 @@ def rma_pingpong_message_size(request): @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): +def test_rma_pingpong(cmdline_args, iteration_type, operation_type, rma_bw_completion_semantic, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm" command = command + " -o " + operation_type + " " + perf_progress_model_cli - efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all") + efa_run_client_server_test(cmdline_args, command, iteration_type, rma_bw_completion_semantic, memory_type_bi_dir, "all") @pytest.mark.functional @pytest.mark.parametrize("operation_type", ["writedata"]) -def test_rma_pingpong_range(cmdline_args, operation_type, completion_semantic, rma_pingpong_message_size, memory_type): +def test_rma_pingpong_range(cmdline_args, operation_type, rma_bw_completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): command = "fi_rma_pingpong 
-e rdm" command = command + " -o " + operation_type - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, rma_pingpong_message_size) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) @pytest.mark.functional @pytest.mark.parametrize("operation_type", ["writedata"]) -def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, completion_semantic, rma_pingpong_message_size, memory_type): +def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, rma_bw_completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm -j 0" command = command + " -o " + operation_type - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, rma_pingpong_message_size) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) diff --git a/fabtests/pytest/efa/test_runt.py b/fabtests/pytest/efa/test_runt.py index 8735298ad49..701406be26d 100644 --- a/fabtests/pytest/efa/test_runt.py +++ b/fabtests/pytest/efa/test_runt.py @@ -74,16 +74,15 @@ def test_runt_read_functional(cmdline_args, memory_type, copy_method): if copy_method == "localread": # when local read copy is used, server issue RDMA requests to copy received data # - # so in this case, total read wr is 11, which is + # so in this case, total read wr is at least 9, which is # 1 remote read of 192k # 8 local read for the 64k data transfer by send - # 2 local read for 2 fabtests control messages + # More local reads for fabtests control messages # - # and total read_bytes will be 262149, which is: - # 256k message + 2 fabtests control messages (1 byte and 4 byte each) + # and total read_bytes will be >= 256K including the control messages # - assert server_read_wrs == 11 - assert server_read_bytes == 262149 + assert server_read_wrs >= 9 + assert server_read_bytes >= 262144 else: # The other 192 KB is transfer by RDMA read # for which the server (receiver) will issue 1 read request. 
diff --git a/fabtests/pytest/efa/test_unexpected_msg.py b/fabtests/pytest/efa/test_unexpected_msg.py index f183a0a7566..dc1f93e3c3c 100644 --- a/fabtests/pytest/efa/test_unexpected_msg.py +++ b/fabtests/pytest/efa/test_unexpected_msg.py @@ -20,6 +20,12 @@ def test_unexpected_msg(cmdline_args, msg_size, msg_count, memory_type, completi neuron_maximal_buffer_size = 2**32 if "neuron" in memory_type and allocated_memory >= neuron_maximal_buffer_size: pytest.skip("Cannot hit neuron allocation limit") + + # The EFA limit for single MR that enables remote write is 1M pages aka 4GB for regular pages + maximal_mr_size = 2**32 + if allocated_memory >= maximal_mr_size: + pytest.skip("Cannot hit EFA MR limit") + efa_run_client_server_test(cmdline_args, f"fi_unexpected_msg -e rdm -M {msg_count}", iteration_type="short", completion_semantic=completion_semantic, memory_type=memory_type, message_size=msg_size, completion_type="queue", timeout=1800) diff --git a/fabtests/regression/sighandler_test.c b/fabtests/regression/sighandler_test.c index 84cf532fd4a..dc54fd98a2c 100644 --- a/fabtests/regression/sighandler_test.c +++ b/fabtests/regression/sighandler_test.c @@ -80,7 +80,7 @@ int main(int argc, char **argv) } } hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; if (ft_init_fabric()) { ft_freehints(hints); exit(EXIT_FAILURE); diff --git a/fabtests/scripts/runfabtests.cmd b/fabtests/scripts/runfabtests.cmd index 2086ac42340..5fb9b3833da 100644 --- a/fabtests/scripts/runfabtests.cmd +++ b/fabtests/scripts/runfabtests.cmd @@ -66,8 +66,6 @@ set functional_tests=^ "msg"^ "msg_epoll"^ "msg_sockets"^ - "poll -t queue"^ - "poll -t counter"^ "rdm"^ "rdm -U"^ "rdm_tagged_peek"^ @@ -76,9 +74,9 @@ set functional_tests=^ "inject_test -N -A inject -v"^ "inject_test -A inj_complete -v"^ "inject_test -N -A inj_complete -v"^ - "bw -e rdm -v -T 1"^ - "bw -e rdm -v -T 1 -U"^ - "bw -e msg -v -T 1"^ + "flood -e rdm -v -T 1"^ + "flood -e rdm -v -T 1 -U"^ + "flood -e msg -v -T 1"^ "rdm_multi_client -C 10 -I 5"^ "rdm_multi_client -C 10 -I 5 -U" @@ -96,6 +94,8 @@ set short_tests=^ "rma_bw -e rdm -o read -I 5 -U"^ "rma_bw -e rdm -o writedata -I 5"^ "rma_bw -e rdm -o writedata -I 5 -U"^ + "rdm_atomic -I 5 -o all"^ + "rdm_atomic -I 5 -o all -v"^ "rdm_cntr_pingpong -I 5"^ "multi_recv -e rdm -I 5"^ "rdm_pingpong -I 5"^ @@ -127,6 +127,8 @@ set standard_tests=^ "rma_bw -e rdm -o read -U"^ "rma_bw -e rdm -o writedata"^ "rma_bw -e rdm -o writedata -U"^ + "rdm_atomic -o all"^ + "rdm_atomic -o all -v"^ "rdm_cntr_pingpong"^ "multi_recv -e rdm"^ "rdm_pingpong"^ diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index 92f85482d9b..a6c3b075576 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -108,12 +108,9 @@ functional_tests=( "fi_cq_data -e rdm -o writedata" "fi_cq_data -e dgram -o writedata" "fi_dgram" - "fi_dgram_waitset" "fi_msg" "fi_msg_epoll" "fi_msg_sockets" - "fi_poll -t queue" - "fi_poll -t counter" "fi_rdm" "fi_rdm -U" "fi_rdm_rma_event" @@ -132,8 +129,20 @@ functional_tests=( "fi_rdm_shared_av" "fi_multi_mr -e msg -V" "fi_multi_mr -e rdm -V" - "fi_multi_ep -e msg -v --shared-av" - "fi_multi_ep -e rdm -v --shared-av" + "fi_multi_ep -e msg -v -A" + "fi_multi_ep -e rdm -v -A" + "fi_multi_ep -e msg -v -Q" + "fi_multi_ep -e rdm -v -Q" + "fi_multi_ep -e msg -v -A -Q" + "fi_multi_ep -e rdm -v -A -Q" + "fi_multi_ep -e msg -v --threading completion" + "fi_multi_ep -e rdm -v --threading completion" + "fi_multi_ep -e msg -v -A 
--threading completion" + "fi_multi_ep -e rdm -v -A --threading completion" + "fi_multi_ep -e msg -v -Q --threading completion" + "fi_multi_ep -e rdm -v -Q --threading completion" + "fi_multi_ep -e msg -v -A -Q --threading completion" + "fi_multi_ep -e rdm -v -A -Q --threading completion" "fi_recv_cancel -e rdm -V" "fi_unexpected_msg -e msg -I 10 -v" "fi_unexpected_msg -e rdm -I 10 -v" @@ -141,9 +150,9 @@ functional_tests=( "fi_inject_test -N -A inject -v" "fi_inject_test -A inj_complete -v" "fi_inject_test -N -A inj_complete -v" - "fi_bw -e rdm -v -T 1" - "fi_bw -e rdm -v -T 1 -U" - "fi_bw -e msg -v -T 1" + "fi_flood -e rdm -v -T 1" + "fi_flood -e rdm -v -T 1 -U" + "fi_flood -e msg -v -T 1" "fi_rdm_multi_client -C 10 -I 5" "fi_rdm_multi_client -C 10 -I 5 -U" ) @@ -164,6 +173,8 @@ short_tests=( "fi_rma_bw -e rdm -o writedata -I 5 -U" "fi_rdm_atomic -I 5 -o all" "fi_rdm_atomic -I 5 -o all -U" + "fi_rdm_atomic -I 5 -o all -v" + "fi_rdm_atomic -I 5 -o all -U -v" "fi_rdm_cntr_pingpong -I 5" "fi_multi_recv -e rdm -I 5" "fi_multi_recv -e msg -I 5" @@ -200,6 +211,8 @@ standard_tests=( "fi_rma_bw -e rdm -o writedata -U" "fi_rdm_atomic -o all -I 1000" "fi_rdm_atomic -o all -I 1000 -U" + "fi_rdm_atomic -o all -I 1000 -v" + "fi_rdm_atomic -o all -I 1000 -U -v" "fi_rdm_cntr_pingpong" "fi_multi_recv -e rdm" "fi_multi_recv -e msg" diff --git a/fabtests/scripts/runmultinode.py b/fabtests/scripts/runmultinode.py index a8c836f532e..b1749990dd1 100644 --- a/fabtests/scripts/runmultinode.py +++ b/fabtests/scripts/runmultinode.py @@ -6,7 +6,7 @@ def parse_args(): parser = argparse.ArgumentParser(description="libfabric multinode test with slurm") parser.add_argument('--dry-run', action='store_true', help='Perform a dry run without making any changes.') parser.add_argument("--ci", type=str, help="Commands to prepend to test call. Only used with the internal launcher option", default="") - parser.add_argument("-C", "--capability", type=str, help="libfabric capability", default="msg") + parser.add_argument("-x", "--capability", type=str, help="libfabric capability", default="msg") parser.add_argument("-i", "--iterations", type=int , help="Number of iterations", default=1) parser.add_argument("-l", "--launcher", type=str, choices=['internal', 'srun', 'mpirun'], help="launcher to use for running job. If nothing is specified, test manages processes internally. 
Available options: internal, srun and mpirun", default="internal") @@ -172,11 +172,11 @@ def is_srun_pm_supported(): if args.provider in no_addr_prov: cmd = f"fi_multinode -n {args.num_procs} -s {socket.gethostname()} " \ - f"-p {args.provider} -C {args.capability} -z {mnode['pattern']} " \ - f"-I {args.iterations} -u {args.launcher.lower()} -E -T" + f"-p {args.provider} -x {args.capability} -z {mnode['pattern']} " \ + f"-I {args.iterations} -u {args.launcher.lower()} -T" else: cmd = f"fi_multinode -n {args.num_procs} -s {socket.gethostname()} " \ - f"-p {args.provider} -C {args.capability} -z '{mnode['pattern']}' " \ + f"-p {args.provider} -x {args.capability} -z '{mnode['pattern']}' " \ f"-I {args.iterations} -u {args.launcher.lower()} -T" if args.launcher.lower() == 'mpirun': @@ -196,7 +196,7 @@ def is_srun_pm_supported(): exit() hl = ",".join(expand_host_list(os.environ['SLURM_NODELIST'])) mpi = f"runmultinode.sh -h {hl} -n {args.procs_per_node} -p {args.provider} " \ - f"-C {args.capability} -I {args.iterations} -z {mnode['pattern']}" + f"-x {args.capability} -I {args.iterations} -z {mnode['pattern']}" if args.ci: mpi += f" --ci '{args.ci}'" else: diff --git a/fabtests/scripts/runmultinode.sh b/fabtests/scripts/runmultinode.sh index d4491de48b8..ebe564d0a8e 100755 --- a/fabtests/scripts/runmultinode.sh +++ b/fabtests/scripts/runmultinode.sh @@ -1,7 +1,7 @@ #!/bin/bash -Options=$(getopt --options h:,n:,p:,I:,-x:,z: \ - --longoptions hosts:,processes-per-node:,provider:,xfer-method:,iterations:,ci:,cleanup,help \ +Options=$(getopt --options h:,n:,p:,I:,-x:,-E:,z: \ - --longoptions hosts:,processes-per-node:,provider:,xfer-method:,env:,iterations:,ci:,cleanup,help \ -- "$@") eval set -- "$Options" @@ -10,7 +10,7 @@ hosts=[] ppn=1 iterations=1 pattern="" -xfer-method="msg" +xfer_method="msg" cleanup=false help=false ci="" @@ -19,7 +19,7 @@ while true; do case "$1" in -h|--hosts) IFS=',' read -r -a hosts <<< "$2"; shift 2 ;; - -n|--processes-per-node) + -n|--processes-per-node) ppn=$2; shift 2 ;; -p|--provider) provider="$2"; shift 2 ;; @@ -30,10 +30,17 @@ while true; do --cleanup) cleanup=true; shift ;; -x|--xfer-method) - xfer-method="$2"; shift 2 ;; + xfer_method="$2"; shift 2 ;; + -E|--env) + delimiter="=" + value=${2#*$delimiter} + var=${2:0:$(( ${#2} - ${#value} - ${#delimiter} ))} + EXPORT_STRING="export $var=\"$value\"" + EXPORT_ENV="${EXPORT_ENV}${EXPORT_STRING}; " + shift 2 ;; --ci) ci="$2"; shift 2 ;; - --help) + --help) help=true; shift ;; --) shift; break ;; @@ -41,21 +48,21 @@ done if $help ; then - echo "Run the multinode test suite on the nodes provided for many procceses" + echo "Run the multinode test suite on the nodes provided for many processes" echo "multinode tests are run in performance mode" echo "Options" echo "\t-h,--hosts list of host names to run the tests on" - echo "\t-n,--processes-per-node number of processes to be run on each node.\ - Total number of fi_mulinode tests run will be n*number of hosts" + echo "\t-n,--processes-per-node number of processes to be run on each node. 
Total number of fi_multinode tests run will be n*number of hosts" echo "\t-p,--provider libfabric provider to run the multinode tests on" - echo "\t-C,--cabability multinode cabability to use (rma or default: msg)" - echo "\t-I,-- iterations number of iterations for the multinode test \ + echo "\t-x,--xfer-method multinode transfer method/capability to use (rma or default: msg)" + echo "\t-E,--env export provided variable name and value" + echo "\t-I,--iterations number of iterations for the multinode test \ to run each pattern on" echo "\t--cleanup end straggling processes. Does not rerun tests" echo "\t--help show this message" exit 1 fi - + num_hosts=${#hosts[@]} max_ranks=$(($num_hosts*$ppn)) ranks=$max_ranks; @@ -65,7 +72,7 @@ output="multinode_server_${num_hosts}_${ppn}.log" ret=0 if ! $cleanup ; then - cmd="${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer-method $pattern -I $iterations -T" + cmd="${EXPORT_ENV} ${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer_method $pattern -I $iterations -T" echo $cmd for node in "${hosts[@]}"; do for i in $(seq 1 $ppn); do @@ -73,7 +80,7 @@ if ! $cleanup ; then echo STARTING SERVER if [ "$ci" == "" ]; then ssh $node $cmd &> $output & - else + else ssh $node $cmd | tee $output & fi server_pid=$! @@ -104,4 +111,4 @@ if ! $cleanup ; then echo "Output: $PWD/$output" fi -exit $ret +exit $ret diff --git a/fabtests/test_configs/efa/efa-neuron.exclude b/fabtests/test_configs/efa/efa-neuron.exclude index c5a8fd706c3..49aa4408e33 100644 --- a/fabtests/test_configs/efa/efa-neuron.exclude +++ b/fabtests/test_configs/efa/efa-neuron.exclude @@ -69,9 +69,6 @@ cmatose # shared AV isn't supported shared_av -# wait isn't supported -dgram_waitset - # Remove this once ubertest supports setting MR modes ubertest diff --git a/fabtests/test_configs/efa/efa.exclude b/fabtests/test_configs/efa/efa.exclude index 6743d1d3f77..6798f678936 100644 --- a/fabtests/test_configs/efa/efa.exclude +++ b/fabtests/test_configs/efa/efa.exclude @@ -74,9 +74,6 @@ cmatose # shared AV isn't supported shared_av -# wait isn't supported -dgram_waitset - # Remove this once ubertest supports setting MR modes ubertest diff --git a/fabtests/test_configs/ofi_rxm/tcp.test b/fabtests/test_configs/ofi_rxm/tcp.test index 6087f7ed588..f03c74e1a19 100644 --- a/fabtests/test_configs/ofi_rxm/tcp.test +++ b/fabtests/test_configs/ofi_rxm/tcp.test @@ -59,7 +59,6 @@ ], comp_type: [ FT_COMP_QUEUE, - FT_COMP_CNTR, ], mr_mode: [], progress: [ diff --git a/fabtests/test_configs/psm3/psm3.exclude b/fabtests/test_configs/psm3/psm3.exclude index 418ba8a1b5d..b2288415605 100644 --- a/fabtests/test_configs/psm3/psm3.exclude +++ b/fabtests/test_configs/psm3/psm3.exclude @@ -15,6 +15,5 @@ scalable_ep shared_av rdm_cntr_pingpong multi_recv -dgram_waitset multinode rdm_tagged_peek diff --git a/fabtests/test_configs/tcp/io_uring.exclude b/fabtests/test_configs/tcp/io_uring.exclude index d84ee2fde6a..9b7a5c73176 100644 --- a/fabtests/test_configs/tcp/io_uring.exclude +++ b/fabtests/test_configs/tcp/io_uring.exclude @@ -70,13 +70,14 @@ fi_msg_sockets # fi_unexpected_msg -e rdm fails with no message fi_unexpected_msg -e rdm -# fi_bw -e msg fails with # fi_eq_sread(): common/shared.c:1165, ret=-4 (Interrupted system call) -fi_bw -e msg +# fi_flood -e msg fails with # fi_eq_sread(): common/shared.c:1165, ret=-4 (Interrupted system call) +fi_flood -e msg -# fi_bw fails by hanging +# fi_flood fails by hanging +# fi_flood fails by runfabtests timeout only on the CI. 
# This is a suspected race condition -fi_bw +fi_flood # fi_msg_pingpong fails with # fi_eq_sread(): common/shared.c:1127, ret=-4 (Interrupted system call) diff --git a/fabtests/test_configs/tcp/tcp.exclude b/fabtests/test_configs/tcp/tcp.exclude index ec4b67d507c..63f13f99a24 100644 --- a/fabtests/test_configs/tcp/tcp.exclude +++ b/fabtests/test_configs/tcp/tcp.exclude @@ -6,8 +6,6 @@ atomic # dgram endpoints not supported dgram -multi_ep - # tests use counters, but counters not supported multi_mr rma_event diff --git a/fabtests/ubertest/test_ctrl.c b/fabtests/ubertest/test_ctrl.c index 4b4ee462813..30b43749499 100644 --- a/fabtests/ubertest/test_ctrl.c +++ b/fabtests/ubertest/test_ctrl.c @@ -870,9 +870,9 @@ static int ft_unit_atomic(void) ft_atom_ctrl.count = ft_tx_ctrl.rma_msg_size / ft_atom_ctrl.datatype_size; if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP || - ft_atom_ctrl.count > count || ft_atom_ctrl.count == 0) { + ft_atom_ctrl.count > count || ft_atom_ctrl.count == 0) return 0; - } + if (ret) return ret; @@ -1018,7 +1018,7 @@ void ft_cleanup(void) FT_CLOSE_FID(ft_atom_ctrl.comp_mr); ft_cleanup_xcontrol(&ft_rx_ctrl); ft_cleanup_xcontrol(&ft_tx_ctrl); - ft_free_host_tx_buf(); + ft_free_host_bufs(); ft_cleanup_mr_control(&ft_mr_ctrl); ft_cleanup_atomic_control(&ft_atom_ctrl); ft_cleanup_random(); diff --git a/fabtests/ubertest/verify.c b/fabtests/ubertest/verify.c index 1503d9ca2e3..ebedc8d4ed7 100644 --- a/fabtests/ubertest/verify.c +++ b/fabtests/ubertest/verify.c @@ -35,79 +35,14 @@ #include "ofi_atomic.h" #include "fabtest.h" -static const char integ_alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; -static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_alphabet)) - 1; - -#define CHECK_LOCAL(res,local,cnt,ret,TYPE) \ - do { \ - int i; \ - TYPE *r = (res); \ - TYPE *l = (local); \ - for (i = 0; i < cnt; i++) { \ - if (r[i] != l[i]) { \ - ret = -FI_EIO; \ - break; \ - } \ - } \ - } while (0) \ - - -#define FT_FILL(dst,cnt,TYPE) \ - do { \ - int i, a = 0; \ - TYPE *d = (dst); \ - for (i = 0; i < cnt; i++) { \ - d[i] = (TYPE) (integ_alphabet[a]); \ - if (++a >= integ_alphabet_length) \ - a = 0; \ - } \ - } while (0) - -#ifdef HAVE___INT128 - -/* If __int128 supported, things just work. */ -#define FT_FILL_INT128(...) FT_FILL(__VA_ARGS__) -#define CHECK_LOCAL_INT128(...) CHECK_LOCAL(__VA_ARGS__) - -#else - -/* If __int128, we're not going to fill/verify. */ -#define FT_FILL_INT128(...) -#define CHECK_LOCAL_INT128(...) - -#endif - -#define SWITCH_TYPES(type,FUNC,...) 
\ - switch (type) { \ - case FI_INT8: FUNC(__VA_ARGS__,int8_t); break; \ - case FI_UINT8: FUNC(__VA_ARGS__,uint8_t); break; \ - case FI_INT16: FUNC(__VA_ARGS__,int16_t); break; \ - case FI_UINT16: FUNC(__VA_ARGS__,uint16_t); break; \ - case FI_INT32: FUNC(__VA_ARGS__,int32_t); break; \ - case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break; \ - case FI_INT64: FUNC(__VA_ARGS__,int64_t); break; \ - case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break; \ - case FI_INT128: FUNC##_INT128(__VA_ARGS__,ofi_int128_t); break; \ - case FI_UINT128: FUNC##_INT128(__VA_ARGS__,ofi_uint128_t); break; \ - case FI_FLOAT: FUNC(__VA_ARGS__,float); break; \ - case FI_DOUBLE: FUNC(__VA_ARGS__,double); break; \ - case FI_LONG_DOUBLE: FUNC(__VA_ARGS__,long_double); break; \ - case FI_FLOAT_COMPLEX: FUNC(__VA_ARGS__,ofi_complex_float); break; \ - case FI_DOUBLE_COMPLEX: FUNC(__VA_ARGS__,ofi_complex_double); break; \ - case FI_LONG_DOUBLE_COMPLEX: FUNC(__VA_ARGS__,ofi_complex_long_double); break;\ - default: return -FI_EOPNOTSUPP; \ - } - int ft_sync_fill_bufs(size_t size) { int ret; ft_sock_sync(sock, 0); if (test_info.caps & FI_ATOMIC) { - SWITCH_TYPES(ft_atom_ctrl.datatype, FT_FILL, ft_tx_ctrl.buf, - ft_atom_ctrl.count); - SWITCH_TYPES(ft_atom_ctrl.datatype, FT_FILL, ft_mr_ctrl.buf, - ft_atom_ctrl.count); + (void)ft_fill_atomic(ft_tx_ctrl.buf, ft_atom_ctrl.count, ft_atom_ctrl.datatype); + (void)ft_fill_atomic(ft_mr_ctrl.buf, ft_atom_ctrl.count, ft_atom_ctrl.datatype); memcpy(ft_atom_ctrl.orig_buf, ft_mr_ctrl.buf, size); memcpy(ft_tx_ctrl.cpy_buf, ft_tx_ctrl.buf, size); } else if (is_read_func(test_info.class_function)) { @@ -131,67 +66,26 @@ int ft_sync_fill_bufs(size_t size) return 0; } -static int verify_atomic(void) -{ - int ret = 0; - void *dst, *src, *cmp, *tmp, *res; - enum fi_datatype type; - enum fi_op op; - size_t count; - - dst = ft_atom_ctrl.orig_buf; - src = ft_tx_ctrl.cpy_buf; - - cmp = ft_atom_ctrl.comp_buf; - tmp = ft_rx_ctrl.buf; - res = ft_atom_ctrl.res_buf; - - type = ft_atom_ctrl.datatype; - op = ft_atom_ctrl.op; - count = ft_atom_ctrl.count; - - /* - * If we don't have the test function, return > 0 to indicate - * verification is unsupported. 
- */ - if (is_compare_func(test_info.class_function)) { - if (!ofi_atomic_swap_handler(op, type)) - return 1; - } else if (is_fetch_func(test_info.class_function)) { - if (!ofi_atomic_readwrite_handler(op, type)) - return 1; - } else { - if (!ofi_atomic_write_handler(op, type)) - return 1; - } - - if (is_fetch_func(test_info.class_function) || - is_compare_func(test_info.class_function)) { - SWITCH_TYPES(type, CHECK_LOCAL, dst, res, count, ret); - if (ret) - return ret; - } - - if (is_compare_func(test_info.class_function)) { - ofi_atomic_swap_op(op, type, dst, src, cmp, tmp, count); - } else if (is_fetch_func(test_info.class_function)) { - ofi_atomic_readwrite_op(op, type, dst, src, tmp, count); - } else { - ofi_atomic_write_op(op, type, dst, src, count); - } - - SWITCH_TYPES(type, CHECK_LOCAL, dst, ft_mr_ctrl.buf, count, ret); - - return ret; -} - int ft_verify_bufs() { char *compare_buf; size_t compare_size; + enum ft_atomic_opcodes opcode; - if (test_info.caps & FI_ATOMIC) - return verify_atomic(); + if (test_info.caps & FI_ATOMIC) { + if (is_compare_func(test_info.class_function)) + opcode = FT_ATOMIC_COMPARE; + else if (is_fetch_func(test_info.class_function)) + opcode = FT_ATOMIC_FETCH; + else + opcode = FT_ATOMIC_BASE; + + return ft_check_atomic(opcode, ft_atom_ctrl.op, + ft_atom_ctrl.datatype, ft_tx_ctrl.cpy_buf, + ft_atom_ctrl.orig_buf, ft_mr_ctrl.buf, + ft_atom_ctrl.comp_buf, ft_atom_ctrl.res_buf, + ft_atom_ctrl.count); + } if (test_info.caps & FI_RMA) { compare_size = ft_tx_ctrl.rma_msg_size; diff --git a/fabtests/unit/av_test.c b/fabtests/unit/av_test.c index 3c4f06ce773..72da2313a39 100644 --- a/fabtests/unit/av_test.c +++ b/fabtests/unit/av_test.c @@ -718,7 +718,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->addr_format = FI_SOCKADDR; ret = fi_getinfo(FT_FIVERSION, opts.src_addr, 0, FI_SOURCE, hints, &fi); diff --git a/fabtests/unit/cntr_test.c b/fabtests/unit/cntr_test.c index 45306040f7b..f881ec2a452 100644 --- a/fabtests/unit/cntr_test.c +++ b/fabtests/unit/cntr_test.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); if (ret) { diff --git a/fabtests/unit/cq_test.c b/fabtests/unit/cq_test.c index a80fd16a415..32188e304d3 100644 --- a/fabtests/unit/cq_test.c +++ b/fabtests/unit/cq_test.c @@ -249,7 +249,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); if (ret) { diff --git a/fabtests/unit/dom_test.c b/fabtests/unit/dom_test.c index 7116b78e282..8f82abc5571 100644 --- a/fabtests/unit/dom_test.c +++ b/fabtests/unit/dom_test.c @@ -90,7 +90,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); if (ret) { diff --git a/fabtests/unit/eq_test.c b/fabtests/unit/eq_test.c index 80cfeb4a720..d5cbbedc114 100644 --- a/fabtests/unit/eq_test.c +++ b/fabtests/unit/eq_test.c @@ -611,7 +611,7 @@ int 
main(int argc, char **argv) hints->mode = FI_CONTEXT | FI_CONTEXT2 | FI_MSG_PREFIX | FI_ASYNC_IOV | FI_RX_CQ_DATA; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); if (ret) { diff --git a/fabtests/unit/getinfo_test.c b/fabtests/unit/getinfo_test.c index b888a942619..37518a94b92 100644 --- a/fabtests/unit/getinfo_test.c +++ b/fabtests/unit/getinfo_test.c @@ -538,53 +538,6 @@ static int init_invalid_rma_WAW_ordering_size(struct fi_info *hints) /* * MR mode checks */ -static int init_mr_basic(struct fi_info *hints) -{ - hints->caps |= FI_RMA; - hints->domain_attr->mr_mode = FI_MR_BASIC; - return 0; -} - -static int check_mr_basic(struct fi_info *info) -{ - return (info->domain_attr->mr_mode != FI_MR_BASIC) ? - EXIT_FAILURE : 0; -} - -static int init_mr_scalable(struct fi_info *hints) -{ - hints->caps |= FI_RMA; - hints->domain_attr->mr_mode = FI_MR_SCALABLE; - return 0; -} - -static int check_mr_scalable(struct fi_info *info) -{ - return (info->domain_attr->mr_mode != FI_MR_SCALABLE) ? - EXIT_FAILURE : 0; -} - -static int init_mr_unspec(struct fi_info *hints) -{ - hints->caps |= FI_RMA; - hints->domain_attr->mr_mode = FI_MR_UNSPEC; - return 0; -} - -static int test_mr_v1_0(char *node, char *service, uint64_t flags, - struct fi_info *test_hints, struct fi_info **info) -{ - return fi_getinfo(FI_VERSION(1, 0), node, service, flags, - test_hints, info); -} - -static int check_mr_unspec(struct fi_info *info) -{ - return (info->domain_attr->mr_mode != FI_MR_BASIC && - info->domain_attr->mr_mode != FI_MR_SCALABLE) ? - EXIT_FAILURE : 0; -} - static int init_mr_mode(struct fi_info *hints, uint64_t mode) { hints->domain_attr->mr_mode = (uint32_t) mode; @@ -692,7 +645,7 @@ static int test_caps_regression(char *node, char *service, uint64_t flags, /* Limit mode bits to common, older options only */ hints->caps |= fi->caps; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP; fi_freeinfo(*info); @@ -906,18 +859,7 @@ getinfo_test(bad_waw_ordering, 1, "Test invalid rma WAW ordering size", NULL, NULL, -FI_ENODATA) /* MR mode tests */ -getinfo_test(mr_mode, 1, "Test FI_MR_BASIC", NULL, NULL, 0, - hints, init_mr_basic, NULL, check_mr_basic, -FI_ENODATA) -getinfo_test(mr_mode, 2, "Test FI_MR_SCALABLE", NULL, NULL, 0, - hints, init_mr_scalable, NULL, check_mr_scalable, -FI_ENODATA) -getinfo_test(mr_mode, 3, "Test FI_MR_UNSPEC (v1.0)", NULL, NULL, 0, - hints, init_mr_unspec, test_mr_v1_0, check_mr_unspec, -FI_ENODATA) -getinfo_test(mr_mode, 4, "Test FI_MR_BASIC (v1.0)", NULL, NULL, 0, - hints, init_mr_basic, test_mr_v1_0, check_mr_basic, -FI_ENODATA) -getinfo_test(mr_mode, 5, "Test FI_MR_SCALABLE (v1.0)", NULL, NULL, 0, - hints, init_mr_scalable, test_mr_v1_0, check_mr_scalable, - -FI_ENODATA) -getinfo_test(mr_mode, 6, "Test mr_mode bits", NULL, NULL, 0, +getinfo_test(mr_mode, 1, "Test mr_mode bits", NULL, NULL, 0, hints, NULL, validate_mr_modes, NULL, 0) /* Progress tests */ @@ -1008,11 +950,6 @@ int main(int argc, char **argv) TEST_ENTRY_GETINFO(bad_waw_ordering1), TEST_ENTRY_GETINFO(neg1), TEST_ENTRY_GETINFO(mr_mode1), - TEST_ENTRY_GETINFO(mr_mode2), - TEST_ENTRY_GETINFO(mr_mode3), - TEST_ENTRY_GETINFO(mr_mode4), - TEST_ENTRY_GETINFO(mr_mode5), - TEST_ENTRY_GETINFO(mr_mode6), TEST_ENTRY_GETINFO(progress1), TEST_ENTRY_GETINFO(progress2), TEST_ENTRY_GETINFO(caps1), diff --git a/fabtests/unit/mr_cache_evict.c 
b/fabtests/unit/mr_cache_evict.c index a12f9c31372..4a3c16ac9a0 100644 --- a/fabtests/unit/mr_cache_evict.c +++ b/fabtests/unit/mr_cache_evict.c @@ -806,7 +806,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->caps |= FI_MSG | FI_RMA; if (opts.options & FT_OPT_ENABLE_HMEM) diff --git a/fabtests/unit/mr_test.c b/fabtests/unit/mr_test.c index d071a8d74f7..df4caf66992 100644 --- a/fabtests/unit/mr_test.c +++ b/fabtests/unit/mr_test.c @@ -324,7 +324,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE | FI_MR_LOCAL); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->caps |= FI_MSG | FI_RMA; if (opts.options & FT_OPT_ENABLE_HMEM) diff --git a/fabtests/unit/setopt_test.c b/fabtests/unit/setopt_test.c index 40487aaa3de..5f7b2ddc5be 100644 --- a/fabtests/unit/setopt_test.c +++ b/fabtests/unit/setopt_test.c @@ -152,7 +152,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->caps |= FI_MSG; failed = run_tests(test_array, err_buf); diff --git a/include/ofi.h b/include/ofi.h index 7592281c766..9661a7553d9 100644 --- a/include/ofi.h +++ b/include/ofi.h @@ -297,6 +297,7 @@ enum ofi_prov_type { OFI_PROV_UTIL, OFI_PROV_HOOK, OFI_PROV_OFFLOAD, + OFI_PROV_LNX, }; /* Restrict to size of struct fi_provider::context (struct fi_context) */ diff --git a/include/ofi_abi.h b/include/ofi_abi.h index 66f76e93909..e16a900f55a 100644 --- a/include/ofi_abi.h +++ b/include/ofi_abi.h @@ -46,31 +46,16 @@ extern "C" { /* * ABI version support definitions. * - * CURRENT_ABI: - * This defines the current ABI version. The ABI version is separate from - * the packaging or interface versions. Whenever a change is - * added to the interfaces that breaks the ABI, this definition should be - * updated. If you don't know if a change breaks the ABI, then you shouldn't - * be modifying the header files under include/rdma! :P - * * DEFAULT_SYMVER_PRE: * This macro appends an underscore to a function name. It should be used * around any function that is exported from the library as the default call * that applications invoke. * - * CURRENT_SYMVER: - * This macro is placed after a function definition. It should be used with - * any function that is exported by the library and was added as part of the - * current ABI (identified by CURRENT_ABI) version. It results in the function - * being exported at the current ABI version. This is the macro to use when - * exporting new functions. - * * DEFAULT_SYMVER: - * This macro is similar to CURRENT_SYMVER, but is used to specify that a - * function, while the default interface that applications call, was added - * in a previous version of the ABI. Any function that was not impacted by - * an ABI change should use this macro. This often means converting functions - * marked as CURRENT_SYMVER to DEFAULT_SYMVER as part of the ABI update. + * This macro is placed after a function definition. It should be used to + * specify that a function is the default interface that applications call + * and is/was added in the specified ABI version. Any function that is new + * or is not impacted by an ABI change should use this macro. 
* * COMPAT_SYMVER: * The compatibility symbols are used to mark interfaces which were exported @@ -83,21 +68,19 @@ extern "C" { * ABI version 1.1 modified the behavior for function foo(). * This scenario would result in the following definitions. * - * CURRENT_ABI "MYLIB_1.1" - * * This function is the main entry point for function bar. * int DEFAULT_SYMVER_PRE(bar)(void) * { * ... * } - * DEFAULT_SYMVER(bar_, bar, MYLIB_1.0); + * DEFAULT_SYMVER(bar_, bar, FABRIC_1.0); * * This function is the main entry point for function foo. * int DEFAULT_SYMVER_PRE(foo)(void) * { * ... * } - * CURRENT_SYMVER(foo_, foo); + * DEFAULT_SYMVER(foo_, foo, FABRIC_1.1); * * This function is the old entry point for function foo, provided for * backwards compatibility. @@ -105,14 +88,12 @@ extern "C" { * { * ... * } - * COMPAT_SYMVER(foo_1_0, foo, MYLIB_1.0); + * COMPAT_SYMVER(foo_1_0, foo, FABRIC_1.0); * * By convention, the name of compatibility functions is the exported function * name appended with the ABI version that it is compatible with. */ -#define CURRENT_ABI "FABRIC_1.7" - #if HAVE_ALIAS_ATTRIBUTE == 1 #define DEFAULT_SYMVER_PRE(a) a##_ #else @@ -126,8 +107,6 @@ extern "C" { asm(".symver " #name "," #api "@" #ver "\n") #define DEFAULT_SYMVER(name, api, ver) \ asm(".symver " #name "," #api "@@" #ver "\n") -#define CURRENT_SYMVER(name, api) \ - asm(".symver " #name "," #api "@@" CURRENT_ABI "\n") #else @@ -136,11 +115,8 @@ extern "C" { #if HAVE_ALIAS_ATTRIBUTE == 1 #define DEFAULT_SYMVER(name, api, ver) \ extern typeof (name) api __attribute__((alias(#name))); -#define CURRENT_SYMVER(name, api) \ - extern typeof (name) api __attribute__((alias(#name))); #else #define DEFAULT_SYMVER(name, api, ver) -#define CURRENT_SYMVER(name, api) #endif /* HAVE_ALIAS_ATTRIBUTE == 1*/ #endif /* HAVE_SYMVER_SUPPORT */ diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h index 9db6d94cd70..72fab37a650 100644 --- a/include/ofi_hmem.h +++ b/include/ofi_hmem.h @@ -131,6 +131,7 @@ struct ofi_hmem_ops { const void *src, size_t size); int (*get_dmabuf_fd)(const void *addr, uint64_t size, int *fd, uint64_t *offset); + int (*put_dmabuf_fd)(int fd); }; extern struct ofi_hmem_ops hmem_ops[]; @@ -167,6 +168,7 @@ int rocr_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src, size_t size); int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd, uint64_t *offset); +int rocr_hmem_put_dmabuf_fd(int fd); int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size); int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size); @@ -193,6 +195,7 @@ bool cuda_is_gdrcopy_enabled(void); bool cuda_is_dmabuf_supported(void); int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd, uint64_t *offset); +int cuda_put_dmabuf_fd(int fd); void cuda_gdrcopy_to_dev(uint64_t handle, void *dev, const void *host, size_t size); @@ -357,6 +360,11 @@ static inline int ofi_hmem_no_get_dmabuf_fd(const void *addr, uint64_t size, return -FI_ENOSYS; } +static inline int ofi_hmem_no_put_dmabuf_fd(int fd) +{ + return -FI_ENOSYS; +} + static inline bool ofi_hmem_p2p_disabled(void) { return ofi_hmem_disable_p2p; @@ -450,5 +458,6 @@ int ofi_hmem_dev_reg_copy_from_hmem(enum fi_hmem_iface iface, uint64_t handle, void *dest, const void *src, size_t size); int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface, const void *addr, uint64_t size, int *fd, uint64_t *offset); +int ofi_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd); #endif /* _OFI_HMEM_H_ */ diff --git a/include/ofi_mr.h 
b/include/ofi_mr.h index 1ebb07a8e11..b0556eee019 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -2,7 +2,7 @@ * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. * All rights reserved. - * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,6 +40,8 @@ # include #endif /* HAVE_CONFIG_H */ +struct ofi_mr; + #include #include @@ -48,6 +50,15 @@ #include #include #include +#include + +#if HAVE_KDREG2_MONITOR +#if HAVE_KDREG2_INCLUDE_PATH +#include "kdreg2.h" +#else +#include +#endif +#endif int ofi_open_mr_cache(uint32_t version, void *attr, size_t attr_len, uint64_t flags, struct fid **fid, void *context); @@ -107,9 +118,12 @@ static inline uint64_t ofi_mr_get_prov_mode(uint32_t version, } } - /* Single lock used by all memory monitors and MR caches. */ extern pthread_mutex_t mm_lock; + +/* Lock used to coordinate monitor states. */ +extern pthread_mutex_t mm_state_lock; + /* The read-write lock is an additional lock used to protect the dlist_entry * list of ofi_mem_monitor. Due to the necessity of releasing the mm_lock * while walking the dlist in ofi_monitor_notify, we need a separate lock to @@ -128,6 +142,12 @@ struct ofi_mr_cache; union ofi_mr_hmem_info { uint64_t cuda_id; uint64_t ze_id; +#if HAVE_KDREG2_MONITOR + struct { + kdreg2_cookie_t cookie; + struct kdreg2_monitoring_params monitoring_params; + } kdreg2; +#endif }; struct ofi_mr_entry { @@ -214,6 +234,7 @@ struct ofi_uffd { struct ofi_mem_monitor monitor; pthread_t thread; int fd; + int exit_pipe[2]; }; extern struct ofi_mem_monitor *uffd_monitor; @@ -228,6 +249,23 @@ struct ofi_memhooks { extern struct ofi_mem_monitor *memhooks_monitor; +/* + * Kdreg2 monitor + */ + +struct kdreg2_status_data; + +struct ofi_kdreg2 { + struct ofi_mem_monitor monitor; + pthread_t thread; + int fd; + int exit_pipe[2]; + const struct kdreg2_status_data *status_data; + ofi_atomic64_t next_cookie; +}; + +extern struct ofi_mem_monitor *kdreg2_monitor; + extern struct ofi_mem_monitor *cuda_monitor; extern struct ofi_mem_monitor *cuda_ipc_monitor; extern struct ofi_mem_monitor *rocr_monitor; @@ -368,7 +406,7 @@ struct ofi_mr_cache { struct ofi_rbmap tree; struct dlist_entry lru_list; struct dlist_entry dead_region_list; - pthread_mutex_t lock; + pthread_mutex_t lock; size_t cached_cnt; size_t cached_size; @@ -417,14 +455,15 @@ bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru); * a new ofi_mr_entry and assign it to entry. * * @param[in] cache The cache the entry belongs to - * @param[in] info Information about the mr entry to search + * @param[in out] info Information about the mr entry to search. Info IOV may + * be updated by providers to reflect region registered by + * the provider. * @param[out] entry The registered entry corresponding to the * region described in info. * @returns On success, returns 0. On failure, returns a negative error code. 
*/ -int ofi_mr_cache_search(struct ofi_mr_cache *cache, - const struct ofi_mr_info *info, - struct ofi_mr_entry **entry); +int ofi_mr_cache_search(struct ofi_mr_cache *cache, struct ofi_mr_info *info, + struct ofi_mr_entry **entry); /** * Given an attr (with an iov range), if the iov range is already registered, diff --git a/include/ofi_net.h b/include/ofi_net.h index 1eeaea980d7..c9f4df00774 100644 --- a/include/ofi_net.h +++ b/include/ofi_net.h @@ -903,14 +903,14 @@ uint32_t ofi_addr_format(const char *str); int ofi_str_toaddr(const char *str, uint32_t *addr_format, void **addr, size_t *len); -void ofi_straddr_log_internal(const char *func, int line, +void ofi_straddr_log_internal(const char *func, int line, uint32_t addr_format, const struct fi_provider *prov, enum fi_log_level level, enum fi_log_subsys subsys, char *log_str, const void *addr); #define ofi_straddr_log(...) \ - ofi_straddr_log_internal(__func__, __LINE__, __VA_ARGS__) + ofi_straddr_log_internal(__func__, __LINE__, FI_FORMAT_UNSPEC, __VA_ARGS__) #if ENABLE_DEBUG #define ofi_straddr_dbg(prov, subsystem, ...) \ diff --git a/include/ofi_prov.h b/include/ofi_prov.h index ccb3fbf616d..7ffcda76268 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -211,6 +211,17 @@ MRAIL_INI ; # define MRAIL_INIT NULL #endif +#if (HAVE_LNX) && (HAVE_LNX_DL) +# define LNX_INI FI_EXT_INI +# define LNX_INIT NULL +#elif (HAVE_LNX) +# define LNX_INI INI_SIG(fi_lnx_ini) +# define LNX_INIT fi_lnx_ini() +LNX_INI ; +#else +# define LNX_INIT NULL +#endif + #if (HAVE_PERF) && (HAVE_PERF_DL) # define HOOK_PERF_INI FI_EXT_INI # define HOOK_PERF_INIT NULL diff --git a/include/ofi_util.h b/include/ofi_util.h index e90b09ea058..bc590bb4d1a 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -835,6 +835,10 @@ static inline void ofi_ep_peer_rx_cntr_incerr(struct util_ep *ep, uint8_t op) * AV / addressing */ +#define ofi_av_straddr_log(av, level, ...) \ + ofi_straddr_log_internal(__func__, __LINE__, av->domain->addr_format, \ + av->prov, level, FI_LOG_AV, __VA_ARGS__) + struct util_av; struct util_av_set; struct util_peer_addr; @@ -951,12 +955,15 @@ struct rxm_av { struct fid_peer_av peer_av; struct fid_av *util_coll_av; struct fid_av *offload_coll_av; + void (*foreach_ep)(struct util_av *av, struct util_ep *util_ep); }; int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context, size_t conn_size, void (*remove_handler)(struct util_ep *util_ep, - struct util_peer_addr *peer)); + struct util_peer_addr *peer), + void (*foreach_ep)(struct util_av *av, + struct util_ep *ep)); size_t rxm_av_max_peers(struct rxm_av *av); void rxm_ref_peer(struct util_peer_addr *peer); void *rxm_av_alloc_conn(struct rxm_av *av); @@ -1168,9 +1175,11 @@ void ofi_fabric_remove(struct util_fabric *fabric); * Utility Providers */ -#define OFI_NAME_DELIM ';' +#define OFI_NAME_LNX_DELIM ':' +#define OFI_NAME_DELIM ';' #define OFI_UTIL_PREFIX "ofi_" #define OFI_OFFLOAD_PREFIX "off_" +#define OFI_LNX "lnx" static inline int ofi_has_util_prefix(const char *str) { @@ -1182,6 +1191,16 @@ static inline int ofi_has_offload_prefix(const char *str) return !strncasecmp(str, OFI_OFFLOAD_PREFIX, strlen(OFI_OFFLOAD_PREFIX)); } +static inline int ofi_is_lnx(const char *str) +{ + return !strncasecmp(str, OFI_LNX, strlen(OFI_LNX)); +} + +static inline int ofi_is_linked(const char *str) +{ + return (strcasestr(str, OFI_LNX)) ? 
1 : 0; +} + int ofi_get_core_info(uint32_t version, const char *node, const char *service, uint64_t flags, const struct util_prov *util_prov, const struct fi_info *util_hints, @@ -1197,6 +1216,7 @@ int ofi_get_core_info_fabric(const struct fi_provider *prov, struct fi_info **core_info); +char *ofi_strdup_link_append(const char *head, const char *tail); char *ofi_strdup_append(const char *head, const char *tail); // char *ofi_strdup_head(const char *str); // char *ofi_strdup_tail(const char *str); diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index 420d2eacc05..42c50532797 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -65,7 +65,6 @@ #if defined(_WIN32) #include -#include typedef SSIZE_T ssize_t; #endif @@ -158,14 +157,15 @@ typedef struct fid *fid_t; #define FI_MATCH_COMPLETE (1ULL << 31) #define FI_PEER_TRANSFER (1ULL << 36) -#define FI_MR_DMABUF (1ULL << 40) +/* #define FI_MR_DMABUF (1ULL << 40) */ #define FI_AV_USER_ID (1ULL << 41) #define FI_PEER (1ULL << 43) /* #define FI_XPU_TRIGGER (1ULL << 44) */ -#define FI_HMEM_HOST_ALLOC (1ULL << 45) -#define FI_HMEM_DEVICE_ONLY (1ULL << 46) + +#define FI_TAGGED_DIRECTED_RECV (1ULL << 45) +#define FI_TAGGED_MULTI_RECV (1ULL << 46) #define FI_HMEM (1ULL << 47) -/* #define FI_VARIABLE_MSG (1ULL << 48) */ +#define FI_EXACT_DIRECTED_RECV (1ULL << 48) #define FI_RMA_PMEM (1ULL << 49) #define FI_SOURCE_ERR (1ULL << 50) #define FI_LOCAL_COMM (1ULL << 51) @@ -339,6 +339,7 @@ enum { FI_PROTO_SM2, FI_PROTO_CXI_RNR, FI_PROTO_LPP, + FI_PROTO_LNX, }; enum { diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index 548e4b6ad3e..0e6d0acb605 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -122,6 +122,12 @@ struct fid_av { * Tracks registered memory regions, primarily for remote access, * but also for local access until we can remove that need. */ + +#define FI_MR_DMABUF (1ULL << 40) +#define FI_MR_SINGLE_USE (1ULL << 41) +#define FI_HMEM_HOST_ALLOC (1ULL << 45) +#define FI_HMEM_DEVICE_ONLY (1ULL << 46) + struct fid_mr { struct fid fid; void *mem_desc; @@ -176,6 +182,8 @@ struct fi_mr_attr { } device; void *hmem_data; size_t page_size; + const struct fid_mr *base_mr; + size_t sub_mr_cnt; }; struct fi_mr_modify { diff --git a/include/rdma/fi_errno.h b/include/rdma/fi_errno.h index f5af121ec79..b90dbd5f42d 100644 --- a/include/rdma/fi_errno.h +++ b/include/rdma/fi_errno.h @@ -114,7 +114,7 @@ extern "C" { //#define FI_EADV EADV /* Advertise error */ //#define FI_ESRMNT ESRMNT /* Srmount error */ //#define FI_ECOMM ECOMM /* Communication error on send */ -//#define FI_EPROTO EPROTO /* Protocol error */ +#define FI_EPROTO EPROTO /* Protocol error */ //#define FI_EMULTIHOP EMULTIHOP /* Multihop attempted */ //#define FI_EDOTDOT EDOTDOT /* RFS specific error */ //#define FI_EBADMSG EBADMSG /* Not a data message */ diff --git a/include/windows/config.h b/include/windows/config.h index 920f06b9ca5..b3676930873 100644 --- a/include/windows/config.h +++ b/include/windows/config.h @@ -256,7 +256,7 @@ #define PACKAGE_TARNAME PACKAGE /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.0.0b1" +#define PACKAGE_VERSION "2.1.0a1" /* Define to the full name and version of this package. 
*/ #define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION diff --git a/include/windows/osd.h b/include/windows/osd.h index 88b1754979e..efd3bf0d125 100644 --- a/include/windows/osd.h +++ b/include/windows/osd.h @@ -930,11 +930,6 @@ static inline char *strcasestr(const char *haystack, const char *needle) return pos; } -static inline char *strtok_r(char *str, const char *delimiters, char **saveptr) -{ - return strtok_s(str, delimiters, saveptr); -} - #ifndef _SC_PAGESIZE #define _SC_PAGESIZE 0 #endif @@ -1054,12 +1049,16 @@ static inline ofi_complex_## type ofi_complex_prod_## type \ res.imag = a.real * b.imag + a.imag * b.real; \ return res; \ } \ +static inline bool ofi_complex_is_true_## type (ofi_complex_ ## type a)\ +{ \ + return a.real != 0 || a.imag != 0; \ +} \ static inline ofi_complex_## type ofi_complex_land_## type \ (ofi_complex_## type a, ofi_complex_## type b) \ { \ ofi_complex_## type res; \ - res.real = (type)(((a.real != 0) || (a.imag != 0)) && \ - ((b.real != 0) || (b.imag != 0))); \ + res.real = (type)(ofi_complex_is_true_## type (a) && \ + ofi_complex_is_true_## type (b)); \ res.imag = 0; \ return res; \ } \ @@ -1067,8 +1066,8 @@ static inline ofi_complex_## type ofi_complex_lor_## type \ (ofi_complex_## type a, ofi_complex_## type b) \ { \ ofi_complex_## type res; \ - res.real = (type)(((a.real != 0) || (a.imag != 0)) && \ - ((b.real != 0) || (b.imag != 0))); \ + res.real = (type)(ofi_complex_is_true_## type (a) || \ + ofi_complex_is_true_## type (b)); \ res.imag = 0; \ return res; \ } \ @@ -1076,10 +1075,10 @@ static inline ofi_complex_## type ofi_complex_lxor_## type \ (ofi_complex_## type a, ofi_complex_## type b) \ { \ ofi_complex_## type res; \ - res.real = (type)((((a.real != 0) || (a.imag != 0)) && \ - !((b.real != 0) || (b.imag != 0))) || \ - (!((a.real != 0) || (a.imag != 0)) && \ - ((b.real != 0) || (b.imag != 0)))); \ + res.real = (type)((ofi_complex_is_true_## type (a) && \ + !ofi_complex_is_true_## type (b)) || \ + (!ofi_complex_is_true_## type (a) && \ + ofi_complex_is_true_## type (b))); \ res.imag = 0; \ return res; \ } diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 1bc35fb93b5..2921b8316ca 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -759,6 +759,9 @@ + + + @@ -883,10 +886,10 @@ - - - - + + + + @@ -1010,7 +1013,6 @@ - diff --git a/man/fabric.7.md b/man/fabric.7.md index 789dd1ef2f7..a25190a46b0 100644 --- a/man/fabric.7.md +++ b/man/fabric.7.md @@ -447,6 +447,14 @@ attributes: *fi_domain_attr* : Added max_ep_auth_key +## ABI 1.8 + +ABI version starting with libfabric 2.0. Added new fi_fabric2 API call. +Added new fields to the following attributes: + +*fi_domain_attr* +: Added max_group_id + # SEE ALSO [`fi_info`(1)](fi_info.1.html), diff --git a/man/fi_av.3.md b/man/fi_av.3.md index 006ce3f9d73..7aeba1802ea 100644 --- a/man/fi_av.3.md +++ b/man/fi_av.3.md @@ -384,8 +384,9 @@ Upon successful insert with FI_AUTH_KEY flag, the returned fi_addr_t's will map endpoint address against the specified authorization keys. These fi_addr_t's can be used as the target for local data transfer operations. -If the endpoint supports `FI_DIRECTED_RECV`, these fi_addr_t's can be used to -restrict receive buffers to a specific endpoint address and authorization key. +If the endpoint supports `FI_DIRECTED_RECV` or `FI_TAGGED_DIRECTED_RECV`, these +fi_addr_t's can be used to restrict receive buffers to a specific endpoint address +and authorization key. 
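To make the paragraph above concrete, here is a minimal hedged sketch (not part of this patch) of how a returned fi_addr_t can restrict a tagged receive to a single peer. The helper name, the tag value, and the pre-initialized ep/av objects are illustrative assumptions; the endpoint's info is assumed to include FI_TAGGED | FI_TAGGED_DIRECTED_RECV:

```c
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_tagged.h>

/* Hypothetical helper: ep, av, and the peer's raw address are assumed to be
 * already set up by the application. */
static ssize_t post_directed_trecv(struct fid_ep *ep, struct fid_av *av,
				   const void *peer_raw_addr, void *buf,
				   size_t len, void *rx_ctx)
{
	fi_addr_t peer;

	/* fi_av_insert() returns the number of addresses inserted. */
	if (fi_av_insert(av, peer_raw_addr, 1, &peer, 0, NULL) != 1)
		return -FI_EINVAL;

	/* With directed receive enabled, only messages whose source matches
	 * "peer" (including its authorization key when FI_AV_AUTH_KEY is in
	 * use) can match this buffer. */
	return fi_trecv(ep, buf, len, NULL, peer, 0xbeef /* tag */,
			0 /* ignore */, rx_ctx);
}
```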
For address vectors configured with FI_AV_USER_ID, all subsequent target events corresponding to the address being inserted will return FI_ADDR_NOTAVAIL until diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index 0a32850cee8..0109d19e9ed 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -80,7 +80,9 @@ The CXI provider supports FI_THREAD_SAFE and FI_THREAD_DOMAIN threading models. The CXI provider supports FI_WAIT_FD and FI_WAIT_POLLFD CQ wait object types. FI_WAIT_UNSPEC will default to FI_WAIT_FD. However FI_WAIT_NONE should achieve -the lowest latency and reduce interrupt overhead. +the lowest latency and reduce interrupt overhead. NOTE: A process may return +from an epoll_wait/poll when provider progress is required and a CQ event may +not be available. ## Additional Features @@ -380,39 +382,24 @@ increase Request buffer space using the variables *FI_CXI_REQ_\**. ## Message Ordering -The CXI provider supports the following ordering rules: +Supported message ordering: FI_ORDER_SAS, FI_ORDER_WAW, FI_ORDER_RMA_WAW, +FI_ORDER_RMA_RAR, FI_ORDER_ATOMIC_WAW, and FI_ORDER_ATOMIC_RAR. -* All message Send operations are always ordered. -* RMA Writes may be ordered by specifying *FI_ORDER_RMA_WAW*. -* AMOs may be ordered by specifying *FI_ORDER_AMO_{WAW|WAR|RAW|RAR}*. -* RMA Writes may be ordered with respect to AMOs by specifying *FI_ORDER_WAW*. - Fetching AMOs may be used to perform short reads that are ordered with - respect to RMA Writes. +Note: FI_ORDER_*_{WAR,RAW} orderings are not supported. + +Note: Relaxing the message ordering may result in improved performance. + +## Target Ordering Ordered RMA size limits are set as follows: -* *max_order_waw_size* is -1. RMA Writes and non-fetching AMOs of any size are - ordered with respect to each other. -* *max_order_raw_size* is -1. Fetching AMOs of any size are ordered with - respect to RMA Writes and non-fetching AMOs. -* *max_order_war_size* is -1. RMA Writes and non-fetching AMOs of any size are - ordered with respect to fetching AMOs. - -## PCIe Ordering - -Generally, PCIe writes are strictly ordered. As an optimization, PCIe TLPs may -have the Relaxed Order (RO) bit set to allow writes to be reordered. Cassini -sets the RO bit in PCIe TLPs when possible. Cassini sets PCIe RO as follows: - -* Ordering of messaging operations is established using completion events. - Therefore, all PCIe TLPs related to two-sided message payloads will have RO - set. -* Every PCIe TLP associated with an unordered RMA or AMO operation will have RO - cleared. -* PCIe TLPs associated with the last packet of an ordered RMA or AMO operation - will have RO cleared. -* PCIe TLPs associated with the body packets (all except the last packet of an - operation) of an ordered RMA operation will have RO set. +* *max_order_waw_size* is -1. RMA Writes and AMO writes of any size are ordered with + respect to each other. + +Note: Due to FI_ORDER_\*\_{WAR,RAW} not being supported, max_order_{raw,war}_size +are forced to zero. + +Note: Relaxing the target ordering may result in improved performance. ## Translation @@ -445,14 +432,14 @@ faults but requires all buffers to be backed by physical memory. Copy-on-write semantics are broken when using pinned memory. See the Fork section for more information. -The CXI provider supports DMABUF for device memory registration. If the ROCR -and CUDA libraries support it, the CXI provider will default to use DMA-buf. +The CXI provider supports DMABUF for device memory registration. 
+DMABUF is supported in ROCm 5.6+ and CUDA 11.7+ with the NVIDIA open source driver +525+. +Both *FI_HMEM_ROCR_USE_DMABUF* and *FI_HMEM_CUDA_USE_DMABUF* are disabled by +default in libfabric core, but the CXI provider enables +*FI_HMEM_ROCR_USE_DMABUF* by default if not specifically set. There may be situations with CUDA that may double the BAR consumption. -Until this is fixed in the CUDA stack, the environment variable -*FI_CXI_DISABLE_DMABUF_CUDA* can be used to fall back to the nvidia -peer-memory interface. -Also, *FI_CXI_DISABLE_DMABUF_ROCR* can be used to fall back to the amdgpu -peer-memory interface. +Until this is fixed in the CUDA stack, CUDA DMABUF will be disabled by default. ## Translation Cache @@ -974,6 +961,10 @@ offloading are met. The CXI provider checks for the following environment variables: +*FI_CXI_MR_TARGET_ORDERING* +: MR target ordering (i.e., PCI ordering). Options: default, strict, or relaxed. + The recommendation is to leave this at the default behavior. + *FI_CXI_ODP* : Enables on-demand paging. If disabled, all DMA buffers are pinned. If enabled and mr_mode bits in the hints exclude FI_MR_ALLOCATED, @@ -1294,6 +1285,17 @@ The CXI provider checks for the following environment variables: : Enable enforcement of triggered operation limit. Doing this can prevent fi_control(FI_QUEUE_WORK) deadlocking at the cost of performance. +*FI_CXI_MR_CACHE_EVENTS_DISABLE_POLL_NSECS* +: Max amount of time to poll when disabling an MR configured with MR match events. + +*FI_CXI_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS* +: Max amount of time to poll when LE invalidate disabling an MR configured with MR + match events. + +*FI_CXI_FORCE_DEV_REG_COPY* +: Force the CXI provider to use the HMEM device register copy routines. If not + supported, RDMA operations or memory registration will fail. + Note: Use the fi_info utility to query provider environment variables: fi_info -p cxi -e @@ -1373,10 +1375,9 @@ struct fi_cxi_dom_ops { }; ``` -*cntr_read* extension is used to read hardware counter values. Valid values -of the cntr argument are found in the Cassini-specific header file -cassini_cntr_defs.h. Note that Counter accesses by applications may be -rate-limited to 1HZ. +*cntr_read* extension is used to read Cassini Telemetry items that consist of +counters and gauges. The items available and their content are dependent upon +the Cassini ASIC version and Cassini Driver version. *topology* extension is used to return CXI NIC address topology information for the domain. Currently only a dragonfly fabric topology is reported. @@ -1523,11 +1524,6 @@ if (ret) error; ``` -When an endpoint does not support FI_FENCE (e.g. optimized MR), a provider -specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on an alias EP -to issue a FENCE operation to create a data ordering point for the alias. -This is supported for one-sided operations only. - Alias EP must be closed prior to closing the original EP. ## PCIe Atomics @@ -1578,7 +1574,7 @@ To enable PCIe fetch add for libfabric, the following CXI driver kernel module parameter must be set to non-zero. ``` -/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +/sys/module/cxi_ss1/parameters/amo_remap_to_pcie_fadd ``` The following are the possible values for this kernel module and the impact of diff --git a/man/fi_efa.7.md b/man/fi_efa.7.md index 02ef1d80b73..fe62d820b1e 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -39,7 +39,9 @@ The following features are supported: message size of the MTU of the underlying hardware (approximately 8 KiB). 
*Address vectors* -: The provider supports *FI_AV_TABLE* and *FI_AV_MAP* address vector types. +: The provider supports *FI_AV_TABLE*. *FI_AV_MAP* was deprecated in Libfabric 2.x. + Applications can still use *FI_AV_MAP* to create an address vector, but the EFA + provider will print a warning and switch to *FI_AV_TABLE*. *FI_EVENT* is unsupported. *Completion events* @@ -113,7 +115,8 @@ provider for AWS Neuron or Habana SynapseAI. these operations are assisted by hardware support (return value is false). *FI_OPT_EFA_USE_DEVICE_RDMA - bool* -: Only available if the application selects a libfabric API version >= 1.18. +: This option only applies to the fi_setopt() call. + Only available if the application selects a libfabric API version >= 1.18. This option allows an application to change libfabric's behavior with respect to RDMA transfers. Note that there is also an environment variable FI_EFA_USE_DEVICE_RDMA which the user may set as well. If the @@ -129,7 +132,8 @@ provider for AWS Neuron or Habana SynapseAI. revisions. *FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES - bool* -: It is used to force the endpoint to use in-order send/recv operation for each 128 bytes +: This option only applies to the fi_setopt() call. + It is used to force the endpoint to use in-order send/recv operation for each 128 bytes aligned block. Enabling the option will guarantee data inside each 128 bytes aligned block being sent and received in order, it will also guarantee data to be delivered to the receive buffer only once. If endpoint is not able to @@ -137,7 +141,8 @@ provider for AWS Neuron or Habana SynapseAI. *FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES - bool* -: It is used to set the endpoint to use in-order RDMA write operation for each 128 bytes +: This option only applies to the fi_setopt() call. + It is used to set the endpoint to use in-order RDMA write operation for each 128 bytes aligned block. Enabling the option will guarantee data inside each 128 bytes aligned block being written in order, it will also guarantee data to be delivered to the target buffer only once. If endpoint is not able to support @@ -205,6 +210,8 @@ struct fi_efa_mr_attr { **query_mr()** returns 0 on success, or the value of errno on failure (which indicates the failure reason). +# Traffic Class (tclass) in EFA +To prioritize the messages from a given endpoint, the user can specify `fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY` in the fi_endpoint() call to set the service level in rdma-core. All other tclass values will be ignored. # RUNTIME PARAMETERS @@ -336,6 +343,11 @@ for details. : Use device's unsolicited write recv functionality when it's available. (Default: 1). Setting this environment variable to 0 can disable this feature. +*FI_EFA_INTERNAL_RX_REFILL_THRESHOLD* +: The threshold at which the EFA provider will refill the internal rx pkt pool. (Default: 8). +When the number of internal rx pkts to post is lower than this threshold, +the refill will be skipped. + # SEE ALSO [`fabric`(7)](fabric.7.html), diff --git a/man/fi_endpoint.3.md b/man/fi_endpoint.3.md index 146d9c7fcc9..e74049c2857 100644 --- a/man/fi_endpoint.3.md +++ b/man/fi_endpoint.3.md @@ -537,7 +537,8 @@ The following option levels and option names and parameters are defined. All providers that support FI_HMEM capability implement this option. - *FI_OPT_SHARED_MEMORY_PERMITTED - bool* -: This option controls the use of shared memory for intra-node communication. +: This option only applies to the fi_setopt call. 
+ This option controls the use of shared memory for intra-node communication. Setting it to true will allow the use of shared memory. When set to false, shared memory will not be used and the implementation of intra-node communication is provider dependent. @@ -1340,6 +1341,7 @@ capability bits from the fi_info structure will be used. The following capabilities apply to the receive attributes: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, +FI_TAGGED_DIRECTED_RECV, FI_TAGGED_MULTI_RECV, FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. diff --git a/man/fi_getinfo.3.md b/man/fi_getinfo.3.md index 6219792257e..1f5b6e8b5ae 100644 --- a/man/fi_getinfo.3.md +++ b/man/fi_getinfo.3.md @@ -290,6 +290,17 @@ additional optimizations. capability is not set, then the src_addr parameter for msg and tagged receive operations is ignored. +*FI_TAGGED_DIRECTED_RECV* +: Similar to FI_DIRECTED_RECV, but only applies to tagged receive + operations. + +*FI_EXACT_DIRECTED_RECV* +: Similar to FI_DIRECTED_RECV, but requires the source address to be + exact, i.e., FI_ADDR_UNSPEC is not allowed. This capability can + be used alone, or in conjunction with FI_DIRECTED_RECV or + FI_TAGGED_DIRECTED_RECV as a modifier to disallow FI_ADDR_UNSPEC + being used as the source address. + *FI_FENCE* : Indicates that the endpoint support the FI_FENCE flag on data transfer operations. Support requires tracking that all previous @@ -333,6 +344,10 @@ additional optimizations. : Specifies that the endpoint must support the FI_MULTI_RECV flag when posting receive buffers. +*FI_TAGGED_MULTI_RECV* +: Specifies that the endpoint must support the FI_MULTI_RECV flag when + posting tagged receive buffers. + *FI_NAMED_RX_CTX* : Requests that endpoints which support multiple receive contexts allow an initiator to target (or name) a specific receive context as @@ -462,14 +477,15 @@ may optionally report non-selected secondary capabilities if doing so would not compromise performance or security. Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_MULTICAST, -FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, FI_XPU, -FI_AV_USER_ID, FI_PEER +FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_TAGGED_DIRECTED_RECV, FI_HMEM, +FI_COLLECTIVE, FI_XPU, FI_AV_USER_ID, FI_PEER Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE -Secondary capabilities: FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SHARED_AV, -FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, FI_REMOTE_COMM, FI_SOURCE_ERR, FI_RMA_PMEM. +Secondary capabilities: FI_MULTI_RECV, FI_TAGGED_MULTI_RECV, FI_SOURCE, +FI_RMA_EVENT, FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, +FI_REMOTE_COMM, FI_SOURCE_ERR, FI_RMA_PMEM. # MODE diff --git a/man/fi_lnx.7.md b/man/fi_lnx.7.md new file mode 100644 index 00000000000..6d83e914c34 --- /dev/null +++ b/man/fi_lnx.7.md @@ -0,0 +1,157 @@ +--- +layout: page +title: fi_lnx(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_lnx \- The LINKx (LNX) Provider + +# OVERVIEW + +The LNX provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. This provider uses +the libfabric peer infrastructure to aid in the use of the underlying providers. +This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. 
Future releases of the provider will allow linking any +number of providers and provide the users with the ability to influence +the way the providers are utilized for traffic load. + +# SUPPORTED FEATURES + +This release contains an initial implementation of the LNX provider that +offers the following support: + +*Endpoint types* +: The provider supports only endpoint type *FI_EP_RDM*. + +*Endpoint capabilities* +: LNX is a passthrough layer on the send path. On the receive path LNX + utilizes the peer infrastructure to create shared receive queues (SRQ). + Receive requests are placed on the SRQ instead of on the core provider + receive queue. When the provider receives a message, it queries the SRQ for + a match. If one is found, the receive request is completed; otherwise the + message is placed on the LNX shared unexpected queue (SUQ). Further receive + requests query the SUQ for matches. + The first release of the provider only supports tagged and RMA operations. + Other message types will be supported in future releases. + +*Modes* +: The provider does not require the use of any mode bits. + +*Progress* +: LNX utilizes the peer infrastructure to provide a shared completion + queue. Each linked provider still needs to handle its own progress. + Completion events will, however, be placed on the shared completion queue, + which is passed to the application for access. + +*Address Format* +: LNX wraps the linked providers' addresses in one common binary blob. + It does not alter or change the linked providers' address format. It wraps + them into a LNX structure which is then flattened and returned to the + application. This is passed between different nodes. The LNX provider + is able to parse the flattened format and operate on the different links. + This assumes that nodes in the same group are all using the same version of + the provider with the exact same links, i.e., you cannot have one node linking + SHM+CXI while another links SHM+RXM. + +*Message Operations* +: LNX is designed to intercept message operations such as fi_tsenddata + and, based on specific criteria, forward the operation to the appropriate + provider. For the first release, LNX will only support linking the SHM + provider for intra-node traffic and another provider (e.g., CXI) for + inter-node traffic. The LNX send operation looks at the destination and, + based on whether the destination is local or remote, selects the provider + to forward the operation to. The receive case has been described earlier. + +*Using the Provider* +: In order to use the provider, the user needs to set the FI_LNX_PROV_LINKS + environment variable to the linked providers in the following format: + shm+<prov>. This will allow LNX to report back to the application in the + fi_getinfo() call the different links which can be selected. Since there are + multiple domains per provider, LNX reports a permutation of all the + possible links. For example, if there are two CXI interfaces on the machine, + LNX will report back shm+cxi0 and shm+cxi1. The application can then + select, based on its own criteria, the link it wishes to use. + The application typically uses the PCI information in the fi_info + structure to select the interface to use. A common selection criterion is + the interface nearest the core the process is bound to. In order to make + this determination, the application requires the PCI information about the + interface. For this reason LNX forwards the PCI information for the + inter-node provider in the link to the application. A sketch of this + selection flow follows this section. 
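As a hedged illustration of the selection flow above, the sketch below enumerates the links LNX reports once FI_LNX_PROV_LINKS is set. The "shm+cxi" link string, the requested API version, and the use of "lnx" as the fabric prov_name are assumptions for illustration; actual link names depend on the local interfaces:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

/* Hedged sketch: list the LNX link permutations returned by fi_getinfo(). */
static int print_lnx_links(void)
{
	struct fi_info *hints, *info, *cur;
	int ret;

	/* Must be exported before the first fi_getinfo() call. */
	setenv("FI_LNX_PROV_LINKS", "shm+cxi", 1);

	hints = fi_allocinfo();
	if (!hints)
		return -FI_ENOMEM;
	hints->ep_attr->type = FI_EP_RDM;	/* LNX supports FI_EP_RDM only */
	hints->caps = FI_TAGGED | FI_RMA;	/* first-release operation types */
	hints->fabric_attr->prov_name = strdup("lnx"); /* assumed selector */

	ret = fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info);
	if (!ret) {
		/* One fi_info per link permutation, e.g. shm+cxi0, shm+cxi1.
		 * A real application would compare the PCI information against
		 * the core it is bound to instead of just printing names. */
		for (cur = info; cur; cur = cur->next)
			printf("link: %s, domain: %s\n",
			       cur->fabric_attr->prov_name,
			       cur->domain_attr->name);
		fi_freeinfo(info);
	}
	fi_freeinfo(hints);
	return ret;
}
```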
+ +# LIMITATIONS AND FUTURE WORK + +*Hardware Support* +: LNX does not support hardware offload, e.g., hardware tag matching. This is + an inherent limitation when using the peer infrastructure. Due to the use + of a shared receive queue which linked providers need to query when + a message is received, any hardware offload which requires sending the + receive buffers to the hardware directly will not work with the shared + receive queue. The shared receive queue provides two advantages: 1) reduced + memory usage, and 2) coordinated receive operations. For #2, this is needed + when receiving from FI_ADDR_UNSPEC. In this case both providers which are + part of the link can race to gain access to the receive buffer. It is + a future effort to determine a way to use hardware tag matching and other + hardware offload capability with LNX. + +*Limited Linking* +: This release of the provider supports linking the SHM provider for intra-node + operations and another provider which supports the FI_PEER capability for + inter-node operations. It is a future effort to expand linking to any + multiple sets of providers. + +*Memory Registration* +: As part of the memory registration operation, varying hardware can perform + hardware-specific steps such as memory pinning. Due to the fact that + memory registration APIs do not specify the source or destination + addresses, it is not possible for LNX to determine which provider to + forward the memory registration to. LNX, therefore, registers the memory + with all linked providers. This might not be efficient and might have + unforeseen side effects. A better method is needed to support memory + registration. One option is to have a memory registration cache in LNX + to avoid expensive operations. + +*Operation Types* +: This release of LNX supports tagged and RMA operations only. Future + releases will expand the support to other operation types. + +*Multi-Rail* +: A future design effort is planned to support utilizing multiple interfaces + for traffic simultaneously. This can be over homogeneous interfaces or over + heterogeneous interfaces. + +# RUNTIME PARAMETERS + +The *LNX* provider checks for the following environment variables: + +*FI_LNX_PROV_LINKS* +: This environment variable is used to specify which providers to link. This + must be set in order for the LNX provider to return a list of fi_info + blocks in the fi_getinfo() call. The format which must be used is: + <prov1>+<prov2>+... As mentioned earlier, LNX currently supports linking + only two providers, the first of which is SHM, followed by one other + provider for inter-node operations. + +*FI_LNX_DISABLE_SHM* +: By default this environment variable is set to 0. However, the user can + set it to 1, in which case the SHM provider will not be used. This can be + useful for debugging and performance analysis. The SHM provider will + naturally be used for all intra-node operations. Therefore, to test SHM in + isolation with LNX, the processes can be limited to the same node only. + +*FI_LNX_USE_SRQ* +: Shared Receive Queues are an integral part of the peer infrastructure, but + they have the limitation of not using hardware offload, such as tag + matching. SRQ is needed to support the FI_ADDR_UNSPEC case. If the application + is sure this will never be the case, then it can turn off SRQ support by + setting this environment variable to 0. It is 1 by default. 
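A short, hedged companion sketch of driving the three variables above from application code rather than the shell; the values shown are illustrative only and must be set before the first fi_getinfo() call:

```c
#include <stdlib.h>

/* Illustrative values only; see the variable descriptions above. */
static void configure_lnx_env(void)
{
	setenv("FI_LNX_PROV_LINKS", "shm+cxi", 1); /* required to get LNX links */
	setenv("FI_LNX_DISABLE_SHM", "0", 1);	/* keep SHM for intra-node traffic */
	setenv("FI_LNX_USE_SRQ", "0", 1);	/* only safe if FI_ADDR_UNSPEC is never used */
}
```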
+
+# SEE ALSO
+
+[`fabric`(7)](fabric.7.html),
+[`fi_provider`(7)](fi_provider.7.html),
+[`fi_getinfo`(3)](fi_getinfo.3.html)
diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md
index 3a8e1fcd554..be43f409c8e 100644
--- a/man/fi_mr.3.md
+++ b/man/fi_mr.3.md
@@ -139,6 +139,14 @@ attributes (mr_mode field).
 Each mr_mode bit requires that an application take specific steps in order to
 use memory buffers with libfabric interfaces.
 
+As a special case, a new memory region can be created from an existing
+memory region. Such a new memory region is called a sub-MR, and the existing
+memory region is called the base MR. Sub-MRs may be used to share hardware
+resources, such as virtual to physical address translations and page pinning.
+This can improve performance when creating and destroying sub-regions that
+need different access rights. The base MR itself can also be a sub-MR,
+allowing for a hierarchy of memory regions.
+
 The following apply to memory registration.
 
 *Default Memory Registration*
@@ -575,8 +583,8 @@ into calls as function parameters.
 
 ```c
 struct fi_mr_attr {
 	union {
-		const struct iovec *mr_iov;
-		const struct fi_mr_dmabuf *dmabuf;
+		const struct iovec        *mr_iov;
+		const struct fi_mr_dmabuf *dmabuf;
 	};
 	size_t iov_count;
 	uint64_t access;
@@ -595,6 +603,8 @@ struct fi_mr_attr {
 	} device;
 	void *hmem_data;
 	size_t page_size;
+	const struct fid_mr *base_mr;
+	size_t sub_mr_cnt;
 };
 
 struct fi_mr_auth_key {
@@ -810,6 +820,31 @@ or from the region.
 
 Providers may choose to ignore page size. This will result in a provider
 selected page size always being used.
 
+## base_mr
+
+If non-NULL, create a sub-MR from an existing memory region specified by
+the base_mr field.
+
+The sub-MR must be fully contained within the base MR; however, the sub-MR
+has its own authorization keys and access rights. The following attributes
+are inherited from the base MR, and as a result, are ignored when creating the
+sub-MR:
+
+iface, device, hmem_data, page_size
+
+The sub-MR should hold a reference to the base MR. When fi_close is called
+on the base MR, the call will fail if there are any outstanding sub-MRs.
+
+The base_mr field must be NULL if the FI_MR_DMABUF flag is set.
+
+## sub_mr_cnt
+
+The number of sub-MRs expected to be created from the memory region. This
+value is not a limit. Instead, it is a hint to the provider to allow provider
+specific optimizations for sub-MR creation. For example, the provider may
+reserve access keys or pre-allocate fid_mr objects. The provider may
+ignore this hint.
+
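As a hedged illustration of the base_mr and sub_mr_cnt attributes above (editorial, not part of the fi_mr.3 page), the following sketch registers a base MR and then a read-only sub-MR over its first half; buffer sizes and access flags are arbitrary assumptions, and error handling is trimmed.

```c
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Sketch: create a sub-MR covering part of a base MR.
 * 'domain' is an opened fid_domain. */
static int make_sub_mr(struct fid_domain *domain, void *buf, size_t len,
		       struct fid_mr **base, struct fid_mr **sub)
{
	struct fi_mr_attr attr;
	struct iovec iov;
	int ret;

	memset(&attr, 0, sizeof(attr));
	iov.iov_base = buf;
	iov.iov_len = len;
	attr.mr_iov = &iov;
	attr.iov_count = 1;
	attr.access = FI_REMOTE_READ | FI_REMOTE_WRITE;
	attr.sub_mr_cnt = 1;	/* hint: one sub-MR expected */

	ret = fi_mr_regattr(domain, &attr, 0, base);
	if (ret)
		return ret;

	/* Sub-MR: first half of the buffer, read-only. It inherits
	 * iface, device, hmem_data, and page_size from the base MR. */
	iov.iov_len = len / 2;
	attr.access = FI_REMOTE_READ;
	attr.base_mr = *base;
	attr.sub_mr_cnt = 0;
	return fi_mr_regattr(domain, &attr, 0, sub);
}
```

Note that fi_close on the base MR will fail until the sub-MR is closed, per the text above.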
 ## fi_hmem_ze_device
 
 Returns an hmem device identifier for a level zero tuple.
 
@@ -900,6 +935,12 @@ The follow flag may be specified to any memory registration call.
 
+*FI_MR_SINGLE_USE*
+: This flag indicates that the memory region is only used for a single
+  operation. After the operation is complete, the key associated with the
+  memory region is automatically invalidated and can no longer be used for
+  remote access.
+
 *FI_AUTH_KEY*
 : Only valid with domains configured with FI_AV_AUTH_KEY. When used with
   fi_mr_regattr, this flag denotes that the fi_mr_auth_key::src_addr field
@@ -1013,12 +1054,13 @@ configure registration caches.
 : The cache monitor is responsible for detecting system memory (FI_HMEM_SYSTEM)
   changes made between the virtual addresses used by an application and the
   underlying physical pages. Valid monitor options are: userfaultfd, memhooks,
-  and disabled. Selecting disabled will turn off the registration cache.
+  kdreg2, and disabled. Selecting disabled will turn off the registration cache.
   Userfaultfd is a Linux kernel feature used to report virtual to physical
   address mapping changes to user space. Memhooks operates by intercepting
   relevant memory allocation and deallocation calls which may result in the
   mappings changing, such as malloc, mmap, free, etc. Note that memhooks
-  operates at the elf linker layer, and does not use glibc memory hooks.
+  operates at the elf linker layer, and does not use glibc memory hooks. Kdreg2
+  is supplied as a loadable Linux kernel module.
 
 *FI_MR_CUDA_CACHE_MONITOR_ENABLED*
 : The CUDA cache monitor is responsible for detecting CUDA device memory
diff --git a/man/fi_msg.3.md b/man/fi_msg.3.md
index 4b6e67cf876..1dd5ecd5ae5 100644
--- a/man/fi_msg.3.md
+++ b/man/fi_msg.3.md
@@ -173,6 +173,11 @@ to write CQ entries for all successful completions.
 See the flags discussion below for more details. The requested message size
 that can be used with fi_inject is limited by inject_size.
 
+If FI_HMEM is enabled, the fi_inject call can only accept buffers with
+iface equal to FI_HMEM_SYSTEM if the provider requires the FI_MR_HMEM
+mr_mode. This limitation applies to all the fi_\*inject\* calls and
+does not affect how inject_size is reported.
+
 ## fi_senddata
 
 The send data call is similar to fi_send, but allows for the sending
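To illustrate the FI_HMEM inject restriction just added (an editorial sketch, not page content), an application can gate fi_inject on the domain's mr_mode and the buffer's iface; the helper name and fallback policy are assumptions:

```c
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>

/* Sketch: honor the FI_HMEM inject restriction described above.
 * 'info' is the fi_info used to open the endpoint; 'desc' is the
 * registered descriptor for 'buf'; error handling trimmed. */
static ssize_t send_small(struct fid_ep *ep, const struct fi_info *info,
			  const void *buf, size_t len, void *desc,
			  fi_addr_t dest, enum fi_hmem_iface iface)
{
	int mr_hmem = info->domain_attr->mr_mode & FI_MR_HMEM;

	/* fi_inject may only take FI_HMEM_SYSTEM buffers when the
	 * provider requires the FI_MR_HMEM mr_mode. */
	if (len <= info->tx_attr->inject_size &&
	    (!mr_hmem || iface == FI_HMEM_SYSTEM))
		return fi_inject(ep, buf, len, dest);

	/* Fall back to a regular send for device buffers. */
	return fi_send(ep, buf, len, desc, dest, NULL);
}
```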
diff --git a/man/fi_opx.7.md b/man/fi_opx.7.md
index 4aa6f60a482..da3a638e21f 100644
--- a/man/fi_opx.7.md
+++ b/man/fi_opx.7.md
@@ -134,6 +134,18 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL.
 
   Default setting is 64.
 
+*FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS*
+: Integer. This setting controls how many PING requests the reliability/replay
+  function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX in
+  situations with less contending outgoing traffic from the HFI.
+  Default setting is 128. Range of valid values is 1-65535.
+
+*FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS*
+: Integer. This setting controls how many PING requests the reliability/replay
+  function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX in
+  situations with more contending outgoing traffic from the HFI.
+  Default setting is 4. Range of valid values is 1-65535.
+
 *FI_OPX_SELINUX*
 : Boolean (0/1, on/off, true/false, yes/no). Set to true if you're running a
   security-enhanced Linux. This enables updating the Jkey used based on system
@@ -207,6 +219,11 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL.
   For messages smaller than this threshold, the send will be completed using
   PIO. Value must be between 64 and 2147483646. Defaults to 16385.
 
+*FI_OPX_TID_MIN_PAYLOAD_BYTES*
+: Integer. The minimum length in bytes where TID (Expected Receive) will be
+  used. For messages smaller than this threshold, the send will be completed
+  using Eager Receive. Value must be between 4096 and 2147483646. Defaults to
+  4096.
+
 *FI_OPX_RZV_MIN_PAYLOAD_BYTES*
 : Integer. The minimum length in bytes where rendezvous will be used. For
   messages smaller than this threshold, the send will first try to be
   completed using eager or multi-packet eager.
@@ -247,6 +264,10 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL.
   The default threshold is 8192. This has no meaning if Libfabric was not
   configured with GDRCopy or ROCR support.
 
+*FI_OPX_MIXED_NETWORK*
+: Integer. Indicates that the network is a mix of OPA100 and CN5000. Must be
+  set to 1 on a mixed network. Default is 0.
+
 # SEE ALSO
 
 [`fabric`(7)](fabric.7.html),
diff --git a/man/fi_peer.3.md b/man/fi_peer.3.md
index 0dc4dd43077..fc58c16f507 100644
--- a/man/fi_peer.3.md
+++ b/man/fi_peer.3.md
@@ -83,6 +83,20 @@ similar, independent from the object being shared. However, because
 the goal of using peer providers is to avoid overhead, providers must be
 explicitly written to support the peer provider mechanisms.
 
+When importing any shared fabric object into a peer, the owner will create a
+separate fid_peer_* for each peer provider it intends to import into. The owner
+will pass this unique fid_peer_* into each peer through the context parameter of
+the init call for the resource (i.e., fi_cq_open, fi_srx_context, fi_cntr_open,
+etc.). The fi_peer_*_context will indicate the owner-allocated fid_peer_* for
+the peer to use but is temporary for the init call and may not be accessed by
+the peer after initialization. The peer will set just the peer_ops of the
+owner-allocated fid and save a reference to the imported fid_peer_* for use in
+the peer API flow. The peer will allocate its own fid for internal uses and
+return that fid to the owner through the regular fid parameter of the init call
+(as if it were just another opened resource). The owner is responsible for
+saving the returned peer fid from the open call in order to close it later
+(or to drive progress in the case of the cq_fid), as sketched after this hunk.
+
 There are two peer provider models. In the example listed above, both peers
 are full providers in their own right and usable in a stand-alone fashion. In
 a second model, one of the peers is known as an offload provider. An
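The owner-side import flow described in the added paragraph might look roughly like the following sketch (editorial, not page content). The FI_PEER flag placement and the fi_peer_cq_context layout shown are assumptions drawn from the peer CQ description in fi_peer(3); treat this as guidance rather than a definitive implementation.

```c
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_ext.h>

/* Sketch: owner imports its fid_peer_cq into a peer provider.
 * Assumes 'peer_domain' was opened from the peer's fi_info and
 * 'peer_cq' is the owner-allocated fid_peer_cq with owner_ops set. */
static int import_cq(struct fid_domain *peer_domain,
		     struct fid_peer_cq *peer_cq,
		     struct fid_cq **peer_cq_fid)
{
	struct fi_cq_attr attr = {
		.flags = FI_PEER,	/* CQ is being imported */
	};
	struct fi_peer_cq_context cq_ctx = {
		.size = sizeof(cq_ctx),
		.cq = peer_cq,		/* valid for this call only */
	};

	/* The peer returns its own fid through the regular parameter;
	 * the owner must save it to close it and drive progress later. */
	return fi_cq_open(peer_domain, &attr, peer_cq_fid, &cq_ctx);
}
```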
diff --git a/man/fi_provider.7.md b/man/fi_provider.7.md
index ba820906a12..9d6684a5543 100644
--- a/man/fi_provider.7.md
+++ b/man/fi_provider.7.md
@@ -77,6 +77,10 @@ operating system support is available, etc. This list is not exhaustive.
 hardware interface for inter-instance communication on EC2. See
 [`fi_efa`(7)](fi_efa.7.html) for more information.
 
+*LPP*
+: A provider that runs on FabreX PCIe networks. See
+  [`fi_lpp`(7)](fi_lpp.7.html) for more information.
+
 *OPX*
 : Supports Omni-Path networking from Cornelis Networks. See
   [`fi_opx`(7)](fi_opx.7.html) for more information.
@@ -156,6 +160,19 @@ An offload provider is intended to accelerate specific types of communication,
 generally by taking advantage of network services that have been offloaded
 into hardware, though actual hardware offload support is not a requirement.
 
+# LINKx (LNX) provider (Technology Preview)
+
+The LNX provider is designed to link two or more providers, allowing
+applications to seamlessly use multiple providers or NICs. This provider uses
+the libfabric peer infrastructure to aid in the use of the underlying providers.
+This version of the provider currently supports linking the libfabric
+shared memory provider for intra-node traffic and another provider for
+inter-node traffic. Future releases of the provider will allow linking any
+number of providers and give users the ability to influence how traffic
+is distributed across the providers.
+
+See [`fi_lnx`(7)](fi_lnx.7.html) for more information.
+
 # SEE ALSO
 
 [`fabric`(7)](fabric.7.html)
diff --git a/man/fi_setup.7.md b/man/fi_setup.7.md
index a5afa99b33c..75f60a05642 100644
--- a/man/fi_setup.7.md
+++ b/man/fi_setup.7.md
@@ -135,11 +135,11 @@ requested, a provider must support a capability if it is asked for or fail
 the fi_getinfo request.
 A provider may optionally report non-requested secondary capabilities
 if doing so would not compromise performance or security. That is, a
 provider may grant an application a secondary capability,
-whether the application. The most commonly accessed secondary capability bits
-indicate if provider communication is restricted to the local node Ifor example,
-the shared memory provider only supports local communication) and/or remote
-nodes (which can be the case for NICs that lack loopback support). Other
-secondary capability bits mostly deal with features targeting highly-scalable
+regardless of whether the application requested it. The most commonly accessed
+secondary capability bits indicate if provider communication is restricted to the
+local node (for example, the shared memory provider only supports local communication)
+and/or remote nodes (which can be the case for NICs that lack loopback support).
+Other secondary capability bits mostly deal with features targeting highly-scalable
 applications, but may not be commonly supported across multiple providers.
 
 Because different providers support different sets of capabilities, applications
diff --git a/man/fi_tagged.3.md b/man/fi_tagged.3.md
index 901a2b648cc..ec8c8ab8eb7 100644
--- a/man/fi_tagged.3.md
+++ b/man/fi_tagged.3.md
@@ -264,6 +264,24 @@ and/or fi_tsendmsg.
   local buffer and transfer out of that buffer. This flag can only be used
   with messages smaller than inject_size.
 
+*FI_MULTI_RECV*
+: Applies to posted tagged receive operations when the FI_TAGGED_MULTI_RECV
+  capability is enabled. This flag allows the user to post a single
+  tagged receive buffer that will receive multiple incoming messages.
+  Received messages will be packed into the receive buffer until the
+  buffer has been consumed. Use of this flag may cause a single
+  posted receive operation to generate multiple events as messages are
+  placed into the buffer. The placement of received data into the
+  buffer may be subject to provider specific alignment restrictions
+  (see the sketch following this hunk).
+
+  The buffer will be released by the provider when the available buffer
+  space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV).
+  Note that an entry to the associated receive completion queue will
+  always be generated when the buffer has been consumed, even if other
+  receive completions have been suppressed (i.e. the Rx context has been
+  configured for FI_SELECTIVE_COMPLETION). See the FI_MULTI_RECV
+  completion flag [`fi_cq`(3)](fi_cq.3.html).
+
 *FI_INJECT_COMPLETE*
 : Applies to fi_tsendmsg. Indicates that a completion should be generated
   when the source buffer(s) may be reused.
@@ -292,9 +310,9 @@ and/or fi_tsendmsg.
 
 *FI_AUTH_KEY*
 : Only valid with domains configured with FI_AV_AUTH_KEY and connectionless
-  endpoints configured with FI_DIRECTED_RECV. When used with fi_trecvmsg, this
-  flag denotes that the src_addr is an authorization key fi_addr_t instead of
-  an endpoint fi_addr_t.
+  endpoints configured with FI_DIRECTED_RECV or FI_TAGGED_DIRECTED_RECV. When
+  used with fi_trecvmsg, this flag denotes that the src_addr is an authorization
+  key fi_addr_t instead of an endpoint fi_addr_t.
 
 The following flags may be used with fi_trecvmsg.
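A sketch of the tagged multi-receive usage above (editorial, not page content). The minimum-free threshold and buffer sizes are arbitrary assumptions, and the endpoint is assumed to have been opened with the FI_TAGGED_MULTI_RECV capability:

```c
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_tagged.h>

/* Sketch: post one tagged buffer that receives multiple messages.
 * 'desc' is the registered descriptor for 'buf'; errors trimmed. */
static int post_multi_recv(struct fid_ep *ep, void *buf, size_t len,
			   void *desc, uint64_t tag)
{
	size_t min_free = 1024;	/* release buffer below this threshold */
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct fi_msg_tagged msg;
	int ret;

	ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV,
			&min_free, sizeof(min_free));
	if (ret)
		return ret;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.desc = &desc;
	msg.iov_count = 1;
	msg.addr = FI_ADDR_UNSPEC;
	msg.tag = tag;

	/* One posted receive; a completion is generated per message
	 * landing in the buffer until the buffer is consumed. */
	return fi_trecvmsg(ep, &msg, FI_MULTI_RECV);
}
```

Setting FI_OPT_MIN_MULTI_RECV before posting controls when the provider retires the buffer and emits the final FI_MULTI_RECV completion.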
diff --git a/man/man1/fi_info.1 b/man/man1/fi_info.1 index 6f1dbc213e8..657f11afe97 100644 --- a/man/man1/fi_info.1 +++ b/man/man1/fi_info.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_info" "1" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_info" "1" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -56,7 +70,7 @@ For more information on address formats, see fi_getinfo(3). .TP \f[I]-p, \[en]provider=\f[R] Filter fabric interfaces by the provider implementation. -For a list of providers, see the \f[C]--list\f[R] option. +For a list of providers, see the \f[V]--list\f[R] option. .TP \f[I]-d, \[en]domain=\f[R] Filter interfaces to only those with the given domain name. @@ -130,7 +144,7 @@ provider: tcp \f[R] .fi .PP -To see the full fi_info structure, specify the \f[C]-v\f[R] option. +To see the full fi_info structure, specify the \f[V]-v\f[R] option. .IP .nf \f[C] @@ -223,7 +237,7 @@ fi_info: \f[R] .fi .PP -To see libfabric related environment variables \f[C]-e\f[R] option. +To see libfabric related environment variables \f[V]-e\f[R] option. .IP .nf \f[C] @@ -243,7 +257,7 @@ $ ./fi_info -e .fi .PP To see libfabric related environment variables with substring use -\f[C]-g\f[R] option. +\f[V]-g\f[R] option. .IP .nf \f[C] @@ -281,6 +295,6 @@ $ ./fi_info -g tcp .fi .SH SEE ALSO .PP -\f[C]fi_getinfo(3)\f[R], \f[C]fi_endpoint(3)\f[R] +\f[V]fi_getinfo(3)\f[R], \f[V]fi_endpoint(3)\f[R] .SH AUTHORS OpenFabrics. diff --git a/man/man1/fi_pingpong.1 b/man/man1/fi_pingpong.1 index eced1397e6c..843db111cd7 100644 --- a/man/man1/fi_pingpong.1 +++ b/man/man1/fi_pingpong.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_pingpong" "1" "2024\-04\-04" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_pingpong" "1" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -21,7 +35,7 @@ fi_pingpong also displays aggregated statistics after each test run, and can additionally verify data integrity upon receipt. .PP By default, the datagram (FI_EP_DGRAM) endpoint is used for the test, -unless otherwise specified via \f[C]-e\f[R]. +unless otherwise specified via \f[V]-e\f[R]. .SH HOW TO RUN TESTS .PP Two copies of the program must be launched: first, one copy must be @@ -47,15 +61,15 @@ client$ fi_pingpong .PP The server and client must be able to communicate properly for the fi_pingpong utility to function. -If any of the \f[C]-e\f[R], \f[C]-I\f[R], \f[C]-S\f[R], or \f[C]-p\f[R] +If any of the \f[V]-e\f[R], \f[V]-I\f[R], \f[V]-S\f[R], or \f[V]-p\f[R] options are used, then they must be specified on the invocation for both the server and the client process. -If the \f[C]-d\f[R] option is specified on the server, then the client +If the \f[V]-d\f[R] option is specified on the server, then the client will select the appropriate domain if no hint is provided on the client side. 
-If the \f[C]-d\f[R] option is specified on the client, then it must also +If the \f[V]-d\f[R] option is specified on the client, then it must also be specified on the server. -If both the server and client specify the \f[C]-d\f[R] option and the +If both the server and client specify the \f[V]-d\f[R] option and the given domains cannot communicate, then the application will fail. .SS Control Messaging .TP @@ -110,19 +124,19 @@ Activate output debugging (warning: highly verbose) Displays help output for the pingpong test. .SH USAGE EXAMPLES .SS A simple example -.SS Server: \f[C]fi_pingpong -p \f[R] +.SS Server: \f[V]fi_pingpong -p \f[R] .PP -\f[C]server$ fi_pingpong -p sockets\f[R] -.SS Client: \f[C]fi_pingpong -p \f[R] +\f[V]server$ fi_pingpong -p sockets\f[R] +.SS Client: \f[V]fi_pingpong -p \f[R] .PP -\f[C]client$ fi_pingpong -p sockets 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p sockets 192.168.0.123\f[R] .SS An example with various options .SS Server: .PP -\f[C]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] +\f[V]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] .SS Client: .PP -\f[C]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] .PP Specifically, this will run a pingpong test with: .IP \[bu] 2 @@ -136,14 +150,14 @@ server node as 192.168.0.123 .SS A longer test .SS Server: .PP -\f[C]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] +\f[V]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] .SS Client: .PP -\f[C]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] .SH DEFAULTS .PP There is no default provider; if a provider is not specified via the -\f[C]-p\f[R] switch, the test will pick one from the list of available +\f[V]-p\f[R] switch, the test will pick one from the list of available providers (as returned by fi_getinfo(3)). .PP If no endpoint type is specified, `dgram' is used. @@ -178,6 +192,6 @@ client per second .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3) \f[C]fabric\f[R](7), +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3) \f[V]fabric\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man1/fi_strerror.1 b/man/man1/fi_strerror.1 index d652db9229f..6293860cab7 100644 --- a/man/man1/fi_strerror.1 +++ b/man/man1/fi_strerror.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_strerror" "1" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_strerror" "1" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -14,16 +28,16 @@ fi_strerror FI_ERROR_CODE .fi .SH DESCRIPTION .PP -Display the error string for the given numeric \f[C]FI_ERROR_CODE\f[R]. -\f[C]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal +Display the error string for the given numeric \f[V]FI_ERROR_CODE\f[R]. +\f[V]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal constant. -Although the \f[C]fi_strerror\f[R](3) library function only accepts +Although the \f[V]fi_strerror\f[R](3) library function only accepts positive error values, for convenience this utility accepts both positive and negative error values. 
.PP This is primarily a convenience tool for developers. .SH SEE ALSO .PP -\f[C]fabric\f[R](7) \f[C]fi_errno\f[R](3) +\f[V]fabric\f[R](7) \f[V]fi_errno\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_atomic.3 b/man/man3/fi_atomic.3 index 9c629486cdf..71e63395a8d 100644 --- a/man/man3/fi_atomic.3 +++ b/man/man3/fi_atomic.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_atomic" "3" "2024\-08\-06" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_atomic" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -129,7 +143,7 @@ Local data buffer to store initial value of remote buffer \f[I]desc / compare_desc / result_desc\f[R] Data descriptor associated with the local data buffer, local compare buffer, and local result buffer, respectively. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]dest_addr\f[R] Destination address for connectionless atomic operations. @@ -693,11 +707,11 @@ parameter specifying the tag. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -741,11 +755,11 @@ assigned to the transmitting and receiving endpoints. Both message and data ordering are required if the results of two atomic operations to the same memory buffers are to reflect the second operation acting on the results of the first. -See \f[C]fi_endpoint\f[R](3) for further details and message size +See \f[V]fi_endpoint\f[R](3) for further details and message size restrictions. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_rma\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_rma\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_av.3 b/man/man3/fi_av.3 index 49ebc052318..baff5058852 100644 --- a/man/man3/fi_av.3 +++ b/man/man3/fi_av.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_av" "3" "2024\-08\-06" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_av" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -149,14 +163,14 @@ been deprecated, see below). See the NOTES section for AV restrictions on duplicate addresses. .PP \f[B]Deprecated\f[R]: AV operations may be set to operate asynchronously -by specifying the FI_EVENT flag to \f[C]fi_av_open\f[R]. +by specifying the FI_EVENT flag to \f[V]fi_av_open\f[R]. When requesting asynchronous operation, the application must first bind an event queue to the AV before inserting addresses. 
.SS fi_av_open .PP fi_av_open allocates or opens an address vector. The properties and behavior of the address vector are defined by -\f[C]struct fi_av_attr\f[R]. +\f[V]struct fi_av_attr\f[R]. .IP .nf \f[C] @@ -287,7 +301,7 @@ The context field in all completions will be the context specified to the insert call, and the data field in the final completion entry will report the number of addresses successfully inserted. If an error occurs during the asynchronous insertion, an error -completion entry is returned (see \f[C]fi_eq\f[R](3) for a discussion of +completion entry is returned (see \f[V]fi_eq\f[R](3) for a discussion of the fi_eq_err_entry error completion struct). The context field of the error completion will be the context that was specified in the insert call; the data field will contain the index of @@ -347,10 +361,10 @@ the call will return -FI_EBUSY. .SS fi_av_bind (deprecated) .PP Associates an event queue with the AV. -If an AV has been opened with \f[C]FI_EVENT\f[R], then an event queue +If an AV has been opened with \f[V]FI_EVENT\f[R], then an event queue must be bound to the AV before any insertion calls are attempted. Any calls to insert addresses before an event queue has been bound will -fail with \f[C]-FI_ENOEQ\f[R]. +fail with \f[V]-FI_ENOEQ\f[R]. Flags are reserved for future use and must be 0. .SS fi_av_insert .PP @@ -361,7 +375,7 @@ AV. Addresses inserted into an address vector must be in the same format as specified in the addr_format field of the fi_info struct provided when opening the corresponding domain. -When using the \f[C]FI_ADDR_STR\f[R] format, the \f[C]addr\f[R] +When using the \f[V]FI_ADDR_STR\f[R] format, the \f[V]addr\f[R] parameter should reference an array of strings (char **). .PP \f[B]Deprecated\f[R]: For AV\[cq]s of type FI_AV_MAP, once inserted @@ -395,14 +409,14 @@ buffer must remain valid until the insertion operation completes. Note that if fi_addr is NULL and synchronous operation is requested without using FI_SYNC_ERR flag, individual insertion failures cannot be reported and the application must use other calls, such as -\f[C]fi_av_lookup\f[R] to learn which specific addresses failed to +\f[V]fi_av_lookup\f[R] to learn which specific addresses failed to insert. .PP If the address vector is configured with authorization keys, the fi_addr parameter may be used as input to define the authorization keys associated with the endpoint addresses being inserted. This is done by setting the fi_addr to an authorization key fi_addr_t -generated from \f[C]fi_av_insert_auth_key\f[R] and setting the +generated from \f[V]fi_av_insert_auth_key\f[R] and setting the FI_AUTH_KEY flag. If the FI_AUTH_KEY flag is not set, addresses being inserted will not be associated with any authorization keys. @@ -416,9 +430,10 @@ authorization keys. These fi_addr_t\[cq]s can be used as the target for local data transfer operations. .PP -If the endpoint supports \f[C]FI_DIRECTED_RECV\f[R], these -fi_addr_t\[cq]s can be used to restrict receive buffers to a specific -endpoint address and authorization key. +If the endpoint supports \f[V]FI_DIRECTED_RECV\f[R] or +\f[V]FI_TAGGED_DIRECTED_RECV\f[R], these fi_addr_t\[cq]s can be used to +restrict receive buffers to a specific endpoint address and +authorization key. .PP For address vectors configured with FI_AV_USER_ID, all subsequent target events corresponding to the address being inserted will return @@ -479,10 +494,10 @@ Node should be a string that corresponds to a hostname or network address. 
The service string corresponds to a textual representation of a transport address. -Applications may also pass in an \f[C]FI_ADDR_STR\f[R] formatted address +Applications may also pass in an \f[V]FI_ADDR_STR\f[R] formatted address as the node parameter. In such cases, the service parameter must be NULL. -See fi_getinfo.3 for details on using \f[C]FI_ADDR_STR\f[R]. +See fi_getinfo.3 for details on using \f[V]FI_ADDR_STR\f[R]. Supported flags are the same as for fi_av_insert. .SS fi_av_insertsym .PP @@ -526,7 +541,7 @@ Note that removing an address may not disable receiving data from the peer endpoint. fi_av_close will automatically cleanup any associated resource. .PP -If the address being removed came from \f[C]fi_av_insert_auth_key\f[R], +If the address being removed came from \f[V]fi_av_insert_auth_key\f[R], the address will only be removed if all endpoints, which have been enabled against the corresponding authorization key, have been closed. If all endpoints are not closed, -FI_EBUSY will be returned. @@ -576,8 +591,8 @@ fi_av_straddr returns a pointer to buf. .SS fi_av_insert_auth_key .PP This function associates authorization keys with an address vector. -This requires the domain to be opened with \f[C]FI_AV_AUTH_KEY\f[R]. -\f[C]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be +This requires the domain to be opened with \f[V]FI_AV_AUTH_KEY\f[R]. +\f[V]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be associated with authorization keys from the address vector. This behavior enables a single endpoint or memory region to be associated with multiple authorization keys. @@ -587,38 +602,38 @@ address vector authorization keys at that point in time. Later authorization key insertions will not propagate to already enabled endpoints and memory regions. .PP -The \f[C]auth_key\f[R] and \f[C]auth_key_size\f[R] parameters are used +The \f[V]auth_key\f[R] and \f[V]auth_key_size\f[R] parameters are used to input the authorization key into the address vector. The structure of the authorization key is provider specific. -If the \f[C]auth_key_size\f[R] does not align with provider specific +If the \f[V]auth_key_size\f[R] does not align with provider specific structure, -FI_EINVAL will be returned. .PP -The output of \f[C]fi_av_insert_auth_key\f[R] is an authorization key +The output of \f[V]fi_av_insert_auth_key\f[R] is an authorization key fi_addr_t handle representing all endpoint addresses against this specific authorization key. For all operations, including address vector, memory registration, and data transfers, which may accept an authorization key fi_addr_t as input, the FI_AUTH_KEY flag must be specified. Otherwise, the fi_addr_t will be treated as an fi_addr_t returned from -the \f[C]fi_av_insert\f[R] and related functions. +the \f[V]fi_av_insert\f[R] and related functions. .PP For endpoints enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict incoming messages to only endpoint addresses within the authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_recvmsg\f[R] and \f[C]fi_trecvmsg\f[R]. +\f[V]fi_recvmsg\f[R] and \f[V]fi_trecvmsg\f[R]. .PP For domains enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict memory region access to only endpoint addresses within the authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_mr_regattr\f[R]. +\f[V]fi_mr_regattr\f[R]. 
.PP These authorization key fi_addr_t\[cq]s can later be used an input for endpoint address insertion functions to generate an fi_addr_t for a specific endpoint address and authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_av_insert\f[R] and related functions. +\f[V]fi_av_insert\f[R] and related functions. .PP For address vectors configured with FI_AV_USER_ID and endpoints with FI_SOURCE_ERR, all subsequent FI_EADDRNOTAVAIL error events will return @@ -636,7 +651,7 @@ Flags are reserved for future use and must be 0. This functions returns the authorization key associated with a fi_addr_t. Acceptable fi_addr_t\[cq]s input are the output of -\f[C]fi_av_insert_auth_key\f[R] and AV address insertion functions. +\f[V]fi_av_insert_auth_key\f[R] and AV address insertion functions. The returned authorization key is in a provider specific format. On input, the auth_key_size parameter should indicate the size of the auth_key buffer. @@ -745,14 +760,14 @@ function. This function is used to set the group ID portion of an fi_addr_t. .SH RETURN VALUES .PP -Insertion calls, excluding \f[C]fi_av_insert_auth_key\f[R], for an AV +Insertion calls, excluding \f[V]fi_av_insert_auth_key\f[R], for an AV opened for synchronous operation will return the number of addresses that were successfully inserted. In the case of failure, the return value will be less than the number of addresses that was specified. .PP \f[B]Deprecated\f[R]: Insertion calls, excluding -\f[C]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous +\f[V]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous operation (with FI_EVENT flag specified) will return FI_SUCCESS if the operation was successfully initiated. In the case of failure, a negative fabric errno will be returned. @@ -767,10 +782,10 @@ FI_ADDR_NOTAVAIL. .PP All other calls return FI_SUCCESS on success, or a negative value corresponding to fabric errno on error. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_av_set.3 b/man/man3/fi_av_set.3 index f64f51c7d9c..6e0b0030ee2 100644 --- a/man/man3/fi_av_set.3 +++ b/man/man3/fi_av_set.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_av_set" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_av_set" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -92,7 +106,7 @@ The creation and manipulation of an AV set is a local operation. No fabric traffic is exchanged between peers. As a result, each peer is responsible for creating matching AV sets as part of their collective membership definition. -See \f[C]fi_collective\f[R](3) for a discussion of membership models. +See \f[V]fi_collective\f[R](3) for a discussion of membership models. .SS fi_av_set .PP The fi_av_set call creates a new AV set. @@ -263,9 +277,9 @@ It is an error for a user to request an unsupported collective. 
.PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_av\f[R](3), \f[C]fi_collective\f[R](3) +\f[V]fi_av\f[R](3), \f[V]fi_collective\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cm.3 b/man/man3/fi_cm.3 index 85c8d2b5ea9..1c8c247d9a5 100644 --- a/man/man3/fi_cm.3 +++ b/man/man3/fi_cm.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cm" "3" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cm" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -261,7 +275,7 @@ or an error will occur. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .SH NOTES .PP @@ -279,7 +293,7 @@ events, or as additional err_data to fi_eq_err_entry, in the case of a rejected connection. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cntr.3 b/man/man3/fi_cntr.3 index b01c900efea..7f2e2f3b058 100644 --- a/man/man3/fi_cntr.3 +++ b/man/man3/fi_cntr.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cntr" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cntr" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -91,14 +105,14 @@ That is, a counter actually stores two distinct values, with error completions updating an error specific value. .PP Counters are updated following the completion event semantics defined in -\f[C]fi_cq\f[R](3). +\f[V]fi_cq\f[R](3). The timing of the update is based on the type of transfer and any specified operation flags. .SS fi_cntr_open .PP fi_cntr_open allocates a new fabric counter. The properties and behavior of the counter are defined by -\f[C]struct fi_cntr_attr\f[R]. +\f[V]struct fi_cntr_attr\f[R]. .IP .nf \f[C] @@ -278,7 +292,7 @@ On error, a negative value corresponding to fabric errno is returned. fi_cntr_read / fi_cntr_readerr Returns the current value of the counter. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH NOTES .PP In order to support a variety of counter implementations, updates made @@ -300,7 +314,7 @@ fi_cntr_set / fi_cntr_seterr and results of related operations are reflected in the observed value of the counter. 
.SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_collective.3 b/man/man3/fi_collective.3 index 1343d121c74..ead102d60c2 100644 --- a/man/man3/fi_collective.3 +++ b/man/man3/fi_collective.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_collective" "3" "2024\-01\-08" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_collective" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -152,7 +166,7 @@ be used for required input. .PP In general collective operations can be thought of as coordinated atomic operations between a set of peer endpoints. -Readers should refer to the \f[C]fi_atomic\f[R](3) man page for details +Readers should refer to the \f[V]fi_atomic\f[R](3) man page for details on the atomic operations and datatypes defined by libfabric. .PP A collective operation is a group communication exchange. @@ -199,7 +213,7 @@ provider by creating and configuring an address vector set (AV set). An AV set represents an ordered subset of addresses in an address vector (AV). Details on creating and configuring an AV set are available in -\f[C]fi_av_set\f[R](3). +\f[V]fi_av_set\f[R](3). .PP Once an AV set has been programmed with the collective membership information, an endpoint is joined to the set. @@ -258,7 +272,7 @@ Applications must call fi_close on the collective group to disconnect the endpoint from the group. After a join operation has completed, the fi_mc_addr call may be used to retrieve the address associated with the multicast group. -See \f[C]fi_cm\f[R](3) for additional details on fi_mc_addr(). +See \f[V]fi_cm\f[R](3) for additional details on fi_mc_addr(). .SS Barrier (fi_barrier) .PP The fi_barrier operation provides a mechanism to synchronize peers. @@ -509,7 +523,7 @@ struct fi_collective_attr { \f[R] .fi .PP -For a description of struct fi_atomic_attr, see \f[C]fi_atomic\f[R](3). +For a description of struct fi_atomic_attr, see \f[V]fi_atomic\f[R](3). .TP \f[I]op\f[R] On input, this specifies the atomic operation involved with the @@ -552,7 +566,7 @@ collective operation through the provider. .PP Collective operations map to underlying fi_atomic operations. For a discussion of atomic completion semantics, see -\f[C]fi_atomic\f[R](3). +\f[V]fi_atomic\f[R](3). The completion, ordering, and atomicity of collective operations match those defined for point to point atomic operations. .SH FLAGS @@ -567,11 +581,11 @@ collective operation. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. 
.TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -587,11 +601,11 @@ As such, they follow most of the conventions and restrictions as peer to peer atomic operations. This includes data atomicity, data alignment, and message ordering semantics. -See \f[C]fi_atomic\f[R](3) for additional information on the datatypes +See \f[V]fi_atomic\f[R](3) for additional information on the datatypes and operations defined for atomic and collective operations. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_av\f[R](3), \f[C]fi_atomic\f[R](3), -\f[C]fi_cm\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_av\f[R](3), \f[V]fi_atomic\f[R](3), +\f[V]fi_cm\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_control.3 b/man/man3/fi_control.3 index 1e853d74718..2a6eec2f644 100644 --- a/man/man3/fi_control.3 +++ b/man/man3/fi_control.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_control" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_control" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -50,7 +64,7 @@ header files (\[cq]rdma/fi_ext_*.h\[cq]). Please refer to the provider man pages for details. .SH SEE ALSO .PP -\f[C]fi_endpoint\f[R](3), \f[C]fi_cm\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_cq\f[R](3), \f[C]fi_eq\f[R](3), +\f[V]fi_endpoint\f[R](3), \f[V]fi_cm\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_cq\f[R](3), \f[V]fi_eq\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cq.3 b/man/man3/fi_cq.3 index da07f4fcb2f..232c9dfad0d 100644 --- a/man/man3/fi_cq.3 +++ b/man/man3/fi_cq.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cq" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cq" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -123,7 +137,7 @@ Unlike event queues, completion queues are associated with a resource domain and may be offloaded entirely in provider hardware. .PP The properties and behavior of a completion queue are defined by -\f[C]struct fi_cq_attr\f[R]. +\f[V]struct fi_cq_attr\f[R]. .IP .nf \f[C] @@ -354,8 +368,9 @@ Multiple completions may be retrieved from a CQ in a single call. The maximum number of entries to return is limited to the specified count parameter, with the number of entries successfully read from the CQ returned by the call. -(See return values section below.) A count value of 0 may be used to -drive progress on associated endpoints when manual progress is enabled. +(See return values section below.) +A count value of 0 may be used to drive progress on associated endpoints +when manual progress is enabled. .PP CQs are optimized to report operations which have completed successfully. @@ -429,7 +444,7 @@ fi_cq_readerr is a non-blocking call, returning immediately whether an error completion was found or not. .PP Error information is reported to the user through -\f[C]struct fi_cq_err_entry\f[R]. 
+\f[V]struct fi_cq_err_entry\f[R]. The format of this structure is defined below. .IP .nf @@ -522,8 +537,9 @@ Flags are set for all relevant completions. .TP \f[I]len\f[R] This len field applies to completed receive operations (e.g.\ fi_recv, -fi_trecv, etc.) and the completed write with remote cq data on the -responder side (e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). +fi_trecv, etc.) +and the completed write with remote cq data on the responder side +(e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). It indicates the size of transferred \f[I]message\f[R] data \[en] i.e.\ how many data bytes were placed into the associated receive/target buffer by a corresponding fi_send/fi_tsend/fi_write et al call. @@ -954,7 +970,7 @@ When heterogenous memory is involved, the concept of memory domains come into play. Memory domains identify the physical separation of memory, which may or may not be accessible through the same virtual address space. -See the \f[C]fi_mr\f[R](3) man page for further details on memory +See the \f[V]fi_mr\f[R](3) man page for further details on memory domains. .PP Completion ordering and data visibility are only well-defined for @@ -1014,7 +1030,7 @@ As a result, applications can request a lower completion semantic when posting receives. That indicates to the provider that the application will be responsible for handling any device specific flush operations that might be needed. -See \f[C]fi_msg\f[R](3) FLAGS. +See \f[V]fi_msg\f[R](3) FLAGS. .PP For data transfers that do not generate a completion at the target, such as RMA or atomics, it is the responsibility of the application to ensure @@ -1117,11 +1133,11 @@ returns -FI_EAGAIN. : Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_domain.3 b/man/man3/fi_domain.3 index 539d7e6919e..be3d6c8af5d 100644 --- a/man/man3/fi_domain.3 +++ b/man/man3/fi_domain.3 @@ -1,7 +1,21 @@ -.\"t -.\" Automatically generated by Pandoc 2.9.2.1 +'\" t +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_domain" "3" "2024\-08\-27" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_domain" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -76,7 +90,7 @@ parameter. .PP Similar to fi_domain, but accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening peer domain. -See \f[C]fi_peer\f[R](3). +See \f[V]fi_peer\f[R](3). .SS fi_open_ops .PP fi_open_ops is used to open provider specific interfaces. @@ -173,9 +187,9 @@ through the event queue. If an event queue is not bound to the domain with the FI_REG_MR flag, then memory registration requests complete synchronously. 
.PP -See \f[C]fi_av_bind\f[R](3), \f[C]fi_ep_bind\f[R](3), -\f[C]fi_mr_bind\f[R](3), \f[C]fi_pep_bind\f[R](3), and -\f[C]fi_scalable_ep_bind\f[R](3) for more information. +See \f[V]fi_av_bind\f[R](3), \f[V]fi_ep_bind\f[R](3), +\f[V]fi_mr_bind\f[R](3), \f[V]fi_pep_bind\f[R](3), and +\f[V]fi_scalable_ep_bind\f[R](3) for more information. .SS fi_close .PP The fi_close call is used to release all resources associated with a @@ -184,7 +198,7 @@ All objects associated with the opened domain must be released prior to calling fi_close, otherwise the call will return -FI_EBUSY. .SH DOMAIN ATTRIBUTES .PP -The \f[C]fi_domain_attr\f[R] structure defines the set of attributes +The \f[V]fi_domain_attr\f[R] structure defines the set of attributes associated with a domain. .IP .nf @@ -649,7 +663,7 @@ size as the endpoint queue(s) that are bound to it. .SS AV Type (av_type) .PP Specifies the type of address vectors that are usable with this domain. -For additional details on AV type, see \f[C]fi_av\f[R](3). +For additional details on AV type, see \f[V]fi_av\f[R](3). The following values may be specified. .TP \f[I]FI_AV_MAP\f[R] (deprecated) @@ -673,7 +687,7 @@ optimal AV type supported by this domain. .SS Memory Registration Mode (mr_mode) .PP Defines memory registration specific mode bits used with this domain. -Full details on MR mode options are available in \f[C]fi_mr\f[R](3). +Full details on MR mode options are available in \f[V]fi_mr\f[R](3). The following values may be specified. .TP \f[I]FI_MR_ALLOCATED\f[R] @@ -854,7 +868,7 @@ If this domain capability is not set, address vectors cannot be opened with FI_AV_USER_ID. Note that FI_AV_USER_ID can still be supported through the AV insert calls without this domain capability set. -See \f[C]fi_av\f[R](3). +See \f[V]fi_av\f[R](3). .TP \f[I]FI_PEER\f[R] Specifies that the domain must support importing resources to be used in @@ -885,7 +899,7 @@ provider, for example. Indicates that the domain supports the ability to share address vectors among multiple processes using the named address vector feature. .PP -See \f[C]fi_getinfo\f[R](3) for a discussion on primary versus secondary +See \f[V]fi_getinfo\f[R](3) for a discussion on primary versus secondary capabilities. .SS Default authorization key (auth_key) .PP @@ -932,7 +946,7 @@ cache or lookup tables. .PP This specifies the default traffic class that will be associated any endpoints created within the domain. -See \f[C]fi_endpoint\f[R](3) for additional information. +See \f[V]fi_endpoint\f[R](3) for additional information. .SS Max Authorization Keys per Endpoint (max_ep_auth_key) .PP The maximum number of authorization keys which can be supported per @@ -941,7 +955,7 @@ connectionless endpoint. .PP The maximum value that a peer group may be assigned, inclusive. Valid peer group id\[cq]s must be between 0 and max_group_id. -See \f[C]fi_av\f[R](3) for additional information on peer groups and +See \f[V]fi_av\f[R](3) for additional information on peer groups and their use. Users may request support for peer groups by setting this to a non-zero value. @@ -953,7 +967,7 @@ the application. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. 
.SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -972,7 +986,7 @@ lightly loaded systems, without an administrator configuring system resources appropriately for the installed provider(s). .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_av\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_mr\f[R](3) \f[C]fi_peer\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_av\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_mr\f[R](3) \f[V]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_endpoint.3 b/man/man3/fi_endpoint.3 index a9dad7e3d40..f03197d02b2 100644 --- a/man/man3/fi_endpoint.3 +++ b/man/man3/fi_endpoint.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_endpoint" "3" "2024\-08\-23" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_endpoint" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -263,7 +277,7 @@ been used. .PP Similar to fi_endpoint, buf accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening endpoints that use peer transfer feature. -See \f[C]fi_peer\f[R](3) +See \f[V]fi_peer\f[R](3) .SS fi_close .PP Closes an endpoint and release all resources associated with it. @@ -576,7 +590,7 @@ FI_HMEM_P2P_DISABLED: Peer to peer support should not be used. fi_setopt() will return -FI_EOPNOTSUPP if the mode requested cannot be supported by the provider. The FI_HMEM_DISABLE_P2P environment variable discussed in -\f[C]fi_mr\f[R](3) takes precedence over this setopt option. +\f[V]fi_mr\f[R](3) takes precedence over this setopt option. .RE \[bu] .RS 2 .TP @@ -609,10 +623,10 @@ Define the maximum message size that can be transferred by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -621,10 +635,10 @@ Define the maximum message size that can be transferred by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -633,10 +647,10 @@ Define the maximum message size that can be transferred by the endpoint via a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. 
.RE \[bu] .RS 2 .TP @@ -645,10 +659,10 @@ Define the maximum data size that can be transferred by the endpoint via a single atomic operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -657,10 +671,10 @@ Define the maximum message size that can be injected by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -669,10 +683,10 @@ Define the maximum message size that can be injected by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -681,10 +695,10 @@ Define the maximum data size that can be injected by the endpoint in a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -693,10 +707,10 @@ Define the maximum data size that can be injected by the endpoint in a single atomic operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE .SS fi_tc_dscp_set .PP @@ -1488,8 +1502,9 @@ capability bits from the fi_info structure will be used. .PP The following capabilities apply to the receive attributes: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, -FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_MULTI_RECV, -FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. +FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, +FI_TAGGED_DIRECTED_RECV, FI_TAGGED_MULTI_RECV, FI_MULTI_RECV, FI_SOURCE, +FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. .PP Many applications will be able to ignore this field and rely solely on the fi_info::caps field. @@ -1778,7 +1793,7 @@ value of transmit or receive context attributes of an endpoint. 
\f[I]FI_COMMIT_COMPLETE\f[R] Indicates that a completion should not be generated (locally or at the peer) until the result of an operation have been made persistent. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_COMPLETION\f[R] Indicates that a completion queue entry should be written for data @@ -1791,7 +1806,7 @@ See the fi_ep_bind section above for more detail. \f[I]FI_DELIVERY_COMPLETE\f[R] Indicates that a completion should be generated when the operation has been processed by the destination endpoint(s). -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_INJECT\f[R] Indicates that all outbound data buffers should be returned to the @@ -1806,7 +1821,7 @@ This limit is indicated using inject_size (see inject_size above). \f[I]FI_INJECT_COMPLETE\f[R] Indicates that a completion should be generated when the source buffer(s) may be reused. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_MULTICAST\f[R] Indicates that data transfers will target multicast addresses by @@ -1830,7 +1845,7 @@ space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). \f[I]FI_TRANSMIT_COMPLETE\f[R] Indicates that a completion should be generated when the transmit operation has completed relative to the local provider. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -1839,10 +1854,10 @@ fabric endpoint. Endpoints allocated with the FI_CONTEXT or FI_CONTEXT2 mode bits set must typically provide struct fi_context(2) as their per operation context parameter. -(See fi_getinfo.3 for details.) However, when FI_SELECTIVE_COMPLETION is -enabled to suppress CQ completion entries, and an operation is initiated -without the FI_COMPLETION flag set, then the context parameter is -ignored. +(See fi_getinfo.3 for details.) +However, when FI_SELECTIVE_COMPLETION is enabled to suppress CQ +completion entries, and an operation is initiated without the +FI_COMPLETION flag set, then the context parameter is ignored. An application does not need to pass in a valid struct fi_context(2) into such data transfers. .PP @@ -1881,7 +1896,7 @@ submitted for processing. For fi_setopt/fi_getopt, a return value of -FI_ENOPROTOOPT indicates the provider does not support the requested option. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EDOMAIN\f[R] @@ -1895,8 +1910,8 @@ The endpoint has not been configured with necessary completion queue. The endpoint\[cq]s state does not permit the requested operation. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) -\f[C]fi_msg\f[R](3), \f[C]fi_tagged\f[R](3), \f[C]fi_rma\f[R](3) -\f[C]fi_peer\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) +\f[V]fi_msg\f[R](3), \f[V]fi_tagged\f[R](3), \f[V]fi_rma\f[R](3) +\f[V]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. 
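.PP
The following is a minimal sketch of using the FI_OPT_MAX_MSG_SIZE
option described above, falling back to
\f[V]ep_attr->max_msg_size\f[R] when the provider returns
-FI_ENOPROTOOPT.
The \f[V]ep\f[R] and \f[V]info\f[R] variables are assumed to be a
previously opened endpoint and its associated fi_info structure, and
most error handling is omitted.
.IP
.nf
\f[C]
size_t max_msg;
size_t optlen = sizeof(max_msg);
int ret;

/* Query the current per-message limit for untagged sends. */
ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE,
        &max_msg, &optlen);
if (ret == -FI_ENOPROTOOPT) {
        /* Option not supported; use the endpoint attribute. */
        max_msg = info->ep_attr->max_msg_size;
}

/* Optionally lower the limit; the new value must not exceed
 * ep_attr->max_msg_size. */
size_t new_max = 4096;
ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE,
        &new_max, sizeof(new_max));
\f[R]
.fi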
diff --git a/man/man3/fi_eq.3 b/man/man3/fi_eq.3 index 249c086cc3b..7351e8f0cac 100644 --- a/man/man3/fi_eq.3 +++ b/man/man3/fi_eq.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_eq" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_eq" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -109,7 +123,7 @@ as listening for connection requests. fi_eq_open allocates a new event queue. .PP The properties and behavior of an event queue are defined by -\f[C]struct fi_eq_attr\f[R]. +\f[V]struct fi_eq_attr\f[R]. .IP .nf \f[C] @@ -259,7 +273,7 @@ These include the following types of events: memory registration, address vector resolution, and multicast joins. .PP Control requests report their completion by inserting a -\f[C]struct fi_eq_entry\f[R] into the EQ. +\f[V]struct fi_eq_entry\f[R] into the EQ. The format of this structure is: .IP .nf @@ -283,7 +297,7 @@ The context field will be set to the context specified as part of the operation, if available, otherwise the context will be associated with the fabric descriptor. The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). .TP @@ -293,7 +307,7 @@ setup or tear down connections between endpoints. There are three connection notification events: FI_CONNREQ, FI_CONNECTED, and FI_SHUTDOWN. Connection notifications are reported using -\f[C]struct fi_eq_cm_entry\f[R]: +\f[V]struct fi_eq_cm_entry\f[R]: .IP .nf \f[C] @@ -432,7 +446,7 @@ The context field will be set to the context specified as part of the operation. .PP The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). .PP @@ -558,10 +572,10 @@ fi_eq_strerror Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), \f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), \f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_errno.3 b/man/man3/fi_errno.3 index dcac687918e..6175403b1ed 100644 --- a/man/man3/fi_errno.3 +++ b/man/man3/fi_errno.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_errno" "3" "2024\-03\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_errno" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -184,6 +198,6 @@ Receiver not ready, no receive buffers available Memory registration limit exceeded .SH SEE ALSO .PP -\f[C]fabric\f[R](7) +\f[V]fabric\f[R](7) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_fabric.3 b/man/man3/fi_fabric.3 index 3049e798a3d..f36f961a10b 100644 --- a/man/man3/fi_fabric.3 +++ b/man/man3/fi_fabric.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_fabric" "3" "2023\-09\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_fabric" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -146,11 +160,11 @@ The data parameter is ignored. .TP \f[I]FI_TYPE_EQ_EVENT\f[R] uint32_t event parameter returned from fi_eq_read(). -See \f[C]fi_eq(3)\f[R] for a list of known values. +See \f[V]fi_eq(3)\f[R] for a list of known values. .TP \f[I]FI_TYPE_CQ_EVENT_FLAGS\f[R] uint64_t flags field in fi_cq_xxx_entry structures. -See \f[C]fi_cq(3)\f[R] for valid flags. +See \f[V]fi_cq(3)\f[R] for valid flags. .TP \f[I]FI_TYPE_MR_MODE\f[R] struct fi_domain_attr::mr_mode flags @@ -245,7 +259,7 @@ these environment variables in a production setting. Version information for the fabric provider, in a major.minor format. The use of the FI_MAJOR() and FI_MINOR() version macros may be used to extract the major and minor version data. -See \f[C]fi_version(3)\f[R]. +See \f[V]fi_version(3)\f[R]. .PP In case of an utility provider layered over a core provider, the version would always refer to that of the utility provider. @@ -253,16 +267,16 @@ would always refer to that of the utility provider. .PP The interface version requested by the application. This value corresponds to the version parameter passed into -\f[C]fi_getinfo(3)\f[R]. +\f[V]fi_getinfo(3)\f[R]. .SH RETURN VALUE .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_endpoint\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_endpoint\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_getinfo.3 b/man/man3/fi_getinfo.3 index 9d2c8496612..a0faf6d1121 100644 --- a/man/man3/fi_getinfo.3 +++ b/man/man3/fi_getinfo.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_getinfo" "3" "2024\-08\-27" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_getinfo" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -218,8 +232,8 @@ manner. 
The fi_info::handle field is also used by fi_endpoint() and fi_reject() calls when processing connection requests or to inherit another endpoint\[cq]s attributes. -See \f[C]fi_eq\f[R](3), \f[C]fi_reject\f[R](3), and -\f[C]fi_endpoint\f[R](3). +See \f[V]fi_eq\f[R](3), \f[V]fi_reject\f[R](3), and +\f[V]fi_endpoint\f[R](3). The info->handle field will be ignored by fi_dupinfo and fi_freeinfo. .TP \f[I]tx_attr - transmit context attributes\f[R] @@ -252,7 +266,7 @@ set. On output, the actual endpoint attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[C]fi_endpoint\f[R](3) for details. +See \f[V]fi_endpoint\f[R](3) for details. .TP \f[I]domain_attr - domain attributes\f[R] Optionally supplied domain attributes. @@ -262,7 +276,7 @@ be set. On output, the actual domain attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[C]fi_domain\f[R](3) for details. +See \f[V]fi_domain\f[R](3) for details. .TP \f[I]fabric_attr - fabric attributes\f[R] Optionally supplied fabric attributes. @@ -271,14 +285,14 @@ When provided as hints, requested values of struct fi_fabric_attr should be set. On output, the actual fabric attributes that can be provided will be returned. -See \f[C]fi_fabric\f[R](3) for details. +See \f[V]fi_fabric\f[R](3) for details. .TP \f[I]nic - network interface details\f[R] Optional attributes related to the hardware NIC associated with the specified fabric, domain, and endpoint data. This field is only valid for providers where the corresponding attributes are closely associated with a hardware NIC. -See \f[C]fi_nic\f[R](3) for details. +See \f[V]fi_nic\f[R](3) for details. .SH CAPABILITIES .PP Interface capabilities are obtained by OR-ing the following flags @@ -310,12 +324,12 @@ Requests that the provider support the association of a user specified identifier with each address vector (AV) address. User identifiers are returned with completion data in place of the AV address. -See \f[C]fi_domain\f[R](3) and \f[C]fi_av\f[R](3) for more details. +See \f[V]fi_domain\f[R](3) and \f[V]fi_av\f[R](3) for more details. .TP \f[I]FI_COLLECTIVE\f[R] Requests support for collective operations. Endpoints that support this capability support the collective operations -defined in \f[C]fi_collective\f[R](3). +defined in \f[V]fi_collective\f[R](3). .TP \f[I]FI_DIRECTED_RECV\f[R] Requests that the communication endpoint use the source address of an @@ -323,6 +337,17 @@ incoming message when matching it with a receive buffer. If this capability is not set, then the src_addr parameter for msg and tagged receive operations is ignored. .TP +\f[I]FI_TAGGED_DIRECTED_RECV\f[R] +Similar to FI_DIRECTED_RECV, but only applies to tagged receive +operations. +.TP +\f[I]FI_EXACT_DIRECTED_RECV\f[R] +Similar to FI_DIRECTED_RECV, but requires the source address to be +exact, i.e., FI_ADDR_UNSPEC is not allowed. +This capability can be used alone, or in conjunction with +FI_DIRECTED_RECV or FI_TAGGED_DIRECTED_RECV as a modifier to disallow +FI_ADDR_UNSPEC being used as the source address. +.TP \f[I]FI_FENCE\f[R] Indicates that the endpoint support the FI_FENCE flag on data transfer operations. @@ -372,6 +397,10 @@ send-only or receive-only. Specifies that the endpoint must support the FI_MULTI_RECV flag when posting receive buffers. .TP +\f[I]FI_TAGGED_MULTI_RECV\f[R] +Specifies that the endpoint must support the FI_MULTI_RECV flag when +posting tagged receive buffers. 
+.TP \f[I]FI_NAMED_RX_CTX\f[R] Requests that endpoints which support multiple receive contexts allow an initiator to target (or name) a specific receive context as part of a @@ -482,7 +511,7 @@ endpoint as send-only or receive-only. \f[I]FI_TRIGGER\f[R] Indicates that the endpoint should support triggered operations. Endpoints support this capability must meet the usage model as described -by \f[C]fi_trigger\f[R](3). +by \f[V]fi_trigger\f[R](3). .TP \f[I]FI_WRITE\f[R] Indicates that the user requires an endpoint capable of initiating @@ -493,7 +522,7 @@ This flag requires that FI_RMA and/or FI_ATOMIC be set. Specifies that the endpoint should support transfers that may be initiated from heterogenous computation devices, such as GPUs. This flag requires that FI_TRIGGER be set. -For additional details on XPU triggers see \f[C]fi_trigger\f[R](3). +For additional details on XPU triggers see \f[V]fi_trigger\f[R](3). .PP Capabilities may be grouped into three general categories: primary, secondary, and primary modifiers. @@ -513,15 +542,16 @@ A provider may optionally report non-selected secondary capabilities if doing so would not compromise performance or security. .PP Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, -FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, -FI_XPU, FI_AV_USER_ID, FI_PEER +FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, +FI_TAGGED_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, FI_XPU, FI_AV_USER_ID, +FI_PEER .PP Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE .PP -Secondary capabilities: FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, -FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, FI_REMOTE_COMM, -FI_SOURCE_ERR, FI_RMA_PMEM. +Secondary capabilities: FI_MULTI_RECV, FI_TAGGED_MULTI_RECV, FI_SOURCE, +FI_RMA_EVENT, FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, +FI_REMOTE_COMM, FI_SOURCE_ERR, FI_RMA_PMEM. .SH MODE .PP The operational mode bits are used to convey requirements that an @@ -596,8 +626,8 @@ application for access domains opened with this capability. This flag is defined for compatibility and is ignored if the application version is 1.5 or later and the domain mr_mode is set to anything other than FI_MR_BASIC or FI_MR_SCALABLE. -See the domain attribute mr_mode \f[C]fi_domain\f[R](3) and -\f[C]fi_mr\f[R](3). +See the domain attribute mr_mode \f[V]fi_domain\f[R](3) and +\f[V]fi_mr\f[R](3). .TP \f[I]FI_MSG_PREFIX\f[R] Message prefix mode indicates that an application will provide buffer @@ -657,7 +687,7 @@ these operations. A provider may support one or more of the following addressing formats. In some cases, a selected addressing format may need to be translated or mapped into an address which is native to the fabric. -See \f[C]fi_av\f[R](3). +See \f[V]fi_av\f[R](3). .TP \f[I]FI_ADDR_EFA\f[R] Address is an Amazon Elastic Fabric Adapter (EFA) proprietary format. @@ -745,7 +775,7 @@ This flag is often used with passive endpoints. fi_getinfo() returns 0 on success. On error, fi_getinfo() returns a negative value corresponding to fabric errno. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .PP fi_allocinfo() returns a pointer to a new fi_info structure on success, or NULL on error. @@ -800,11 +830,11 @@ by fi_getinfo. If neither node, service or hints are provided, then fi_getinfo simply returns the list all available communication interfaces. 
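.PP
As a minimal sketch, hints requesting the capabilities described above
might be built as follows; the API version shown is illustrative, and
error handling is abbreviated.
.IP
.nf
\f[C]
struct fi_info *hints, *info;
int ret;

hints = fi_allocinfo();

/* Request tagged messaging, with source-address matching applied
 * only to tagged receives (FI_TAGGED_DIRECTED_RECV). */
hints->caps = FI_TAGGED | FI_SEND | FI_RECV | FI_TAGGED_DIRECTED_RECV;
hints->ep_attr->type = FI_EP_RDM;

ret = fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info);
if (!ret) {
        /* ... open fabric, domain, and endpoint using info ... */
        fi_freeinfo(info);
}
fi_freeinfo(hints);
\f[R]
.fi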
.PP -Multiple threads may call \f[C]fi_getinfo\f[R] simultaneously, without +Multiple threads may call \f[V]fi_getinfo\f[R] simultaneously, without any requirement for serialization. .SH SEE ALSO .PP -\f[C]fi_open\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), -\f[C]fi_nic\f[R](3) \f[C]fi_trigger\f[R](3) +\f[V]fi_open\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), +\f[V]fi_nic\f[R](3) \f[V]fi_trigger\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3 index f9ffbc1e841..4d11d894ab9 100644 --- a/man/man3/fi_mr.3 +++ b/man/man3/fi_mr.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_mr" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_mr" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -138,6 +152,17 @@ mode bits, specified through the domain attributes (mr_mode field). Each mr_mode bit requires that an application take specific steps in order to use memory buffers with libfabric interfaces. .PP +As a special case, a new memory region can be created from an existing +memory region. +Such a new memory region is called a sub-MR, and the existing memory +region is called the base MR. +Sub-MRs may be used to share hardware resources, such as virtual to +physical address translations and page pinning. +This can improve performance when creating and destroying sub-regions +that need different access rights. +The base MR itself can also be a sub-MR, allowing for a hierarchy of +memory regions. +.PP The following apply to memory registration. .TP \f[I]Default Memory Registration\f[R] @@ -616,8 +641,8 @@ passed directly into calls as function parameters. \f[C] struct fi_mr_attr { union { - const struct iovec *mr_iov; - const struct fi_mr_dmabuf *dmabuf; + const struct iovec *mr_iov; + const struct fi_mr_dmabuf *dmabuf; }; size_t iov_count; uint64_t access; @@ -636,6 +661,8 @@ struct fi_mr_attr { } device; void *hmem_data; size_t page_size; + const struct fid_mr *base_mr; + size_t sub_mr_cnt; }; struct fi_mr_auth_key { @@ -690,7 +717,7 @@ specifying the FI_MR_DMABUF flag. The number of entries in the mr_iov array. The maximum number of memory buffers that may be associated with a single memory region is specified as the mr_iov_limit domain attribute. -See \f[C]fi_domain(3)\f[R]. +See \f[V]fi_domain(3)\f[R]. .SS access .PP Indicates the type of \f[I]operations\f[R] that the local or a peer @@ -772,7 +799,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key_size must equal -\f[C]sizeof(struct fi_mr_auth_key)\f[R]. +\f[V]sizeof(struct fi_mr_auth_key)\f[R]. .SS auth_key .PP Indicates the key to associate with this memory registration. @@ -785,7 +812,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key must point to a -user-defined \f[C]struct fi_mr_auth_key\f[R]. +user-defined \f[V]struct fi_mr_auth_key\f[R]. .SS iface .PP Indicates the software interfaces used by the application to allocate @@ -866,6 +893,32 @@ failed transfers to or from the region.
.PP Providers may choose to ignore page size. This will result in a provider selected page size always being used. +.SS base_mr +.PP +If non-NULL, create a sub-MR from an existing memory region specified by +the base_mr field. +.PP +The sub-MR must be fully contained within the base MR; however, the +sub-MR has its own authorization keys and access rights. +The following attributes are inherited from the base MR, and as a +result, are ignored when creating the sub-MR: +.PP +iface, device, hmem_data, page_size +.PP +The sub-MR should hold a reference to the base MR. +When fi_close is called on the base MR, the call will fail if there are +any outstanding sub-MRs. +.PP +The base_mr field must be NULL if the FI_MR_DMABUF flag is set. +.SS sub_mr_cnt +.PP +The number of sub-MRs expected to be created from the memory region. +This value is not a limit. +Instead, it is a hint to the provider to allow provider specific +optimization for sub-MR creation. +For example, the provider may reserve access keys or pre-allocate +fid_mr objects. +The provider may ignore this hint. .SS fi_hmem_ze_device .PP Returns an hmem device identifier for a level zero @@ -888,7 +941,7 @@ keys in the AV. .PP If the domain was opened with FI_DIRECTED_RECV, addr can be used to limit the memory region to a specific fi_addr_t, including -fi_addr_t\[cq]s return from \f[C]fi_av_insert_auth_key\f[R]. +fi_addr_t\[cq]s returned from \f[V]fi_av_insert_auth_key\f[R]. .SH NOTES .PP Direct access to an application\[cq]s memory by a remote peer requires @@ -965,6 +1018,13 @@ fi_mr_attr structure. This flag is only usable for domains opened with FI_HMEM capability support. .TP +\f[I]FI_MR_SINGLE_USE\f[R] +This flag indicates that the memory region is only used for a single +operation. +After the operation is complete, the key associated with the memory +region is automatically invalidated and can no longer be used for remote +access. +.TP \f[I]FI_AUTH_KEY\f[R] Only valid with domains configured with FI_AV_AUTH_KEY. When used with fi_mr_regattr, this flag denotes that the @@ -1011,7 +1071,7 @@ For example, the physical pages referenced by a virtual address range could migrate between host memory and GPU memory, depending on which computational unit is actively using it. .PP -See the \f[C]fi_endpoint\f[R](3) and \f[C]fi_cq\f[R](3) man pages for +See the \f[V]fi_endpoint\f[R](3) and \f[V]fi_cq\f[R](3) man pages for additional discussion on message, data, and completion ordering semantics, including the impact of memory domains. .SH RETURN VALUES @@ -1019,7 +1079,7 @@ including the impact of memory domains. Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_ENOKEY\f[R] @@ -1083,7 +1143,7 @@ Setting this to zero will disable registration caching. The cache monitor is responsible for detecting system memory (FI_HMEM_SYSTEM) changes made between the virtual addresses used by an application and the underlying physical pages. -Valid monitor options are: userfaultfd, memhooks, and disabled. +Valid monitor options are: userfaultfd, memhooks, kdreg2, and disabled. Selecting disabled will turn off the registration cache. Userfaultfd is a Linux kernel feature used to report virtual to physical address mapping changes to user space. @@ -1092,6 +1152,7 @@ deallocation calls which may result in the mappings changing, such as malloc, mmap, free, etc.
Note that memhooks operates at the elf linker layer, and does not use glibc memory hooks. +Kdreg2 is supplied as a loadable Linux kernel module. .TP \f[I]FI_MR_CUDA_CACHE_MONITOR_ENABLED\f[R] The CUDA cache monitor is responsible for detecting CUDA device memory @@ -1125,8 +1186,8 @@ Some level of control over the cache is possible through the above mentioned environment variables. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_rma\f[R](3), \f[C]fi_msg\f[R](3), -\f[C]fi_atomic\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_rma\f[R](3), \f[V]fi_msg\f[R](3), +\f[V]fi_atomic\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_msg.3 b/man/man3/fi_msg.3 index 0fe3a855391..708288ee5bc 100644 --- a/man/man3/fi_msg.3 +++ b/man/man3/fi_msg.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_msg" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_msg" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -65,7 +79,7 @@ Count of vectored data entries. .TP \f[I]desc\f[R] Descriptor associated with the data buffer. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent message. @@ -142,7 +156,7 @@ parameter to a remote endpoint as a single message. The fi_sendmsg call supports data transfers over both connected and connectionless endpoints, with the ability to control the send operation per call through the use of flags. -The fi_sendmsg function takes a \f[C]struct fi_msg\f[R] as input. +The fi_sendmsg function takes a \f[V]struct fi_msg\f[R] as input. .IP .nf \f[C] @@ -172,6 +186,12 @@ of the endpoint is to write CQ entries for all successful completions. See the flags discussion below for more details. The requested message size that can be used with fi_inject is limited by inject_size. +.PP +If FI_HMEM is enabled, the fi_inject call can only accept buffers with +iface equal to FI_HMEM_SYSTEM if the provider requires the FI_MR_HMEM +mr_mode. +This limitation applies to all fi_*inject* calls and does not affect +how inject_size is reported. .SS fi_senddata .PP The send data call is similar to fi_send, but allows for the sending of @@ -259,7 +279,7 @@ Note that an entry to the associated receive completion queue will always be generated when the buffer has been consumed, even if other receive completions have been suppressed (i.e.\ the Rx context has been configured for FI_SELECTIVE_COMPLETION). -See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[R](3). +See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). .TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_sendmsg. @@ -274,7 +294,7 @@ tracked by the provider. For receive operations, indicates that a completion may be generated as soon as the message has been processed by the local provider, even if the message data may not be visible to all processing elements. -See \f[C]fi_cq\f[R](3) for target side completion semantics. +See \f[V]fi_cq\f[R](3) for target side completion semantics. .TP \f[I]FI_DELIVERY_COMPLETE\f[R] Applies to fi_sendmsg. @@ -320,7 +340,7 @@ buffer length. .PP Returns 0 on success.
On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .PP See the discussion below for details handling FI_EAGAIN. .SH ERRORS @@ -353,7 +373,7 @@ acknowledgements or flow control messages may need to be processed in order to resume execution. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_nic.3 b/man/man3/fi_nic.3 index fdfeb9f9aa7..1a8eab67a09 100644 --- a/man/man3/fi_nic.3 +++ b/man/man3/fi_nic.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_nic" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_nic" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -9,7 +23,7 @@ fi_nic - Fabric network interface card attributes .PP The fid_nic structure defines attributes for a struct fi_info that is directly associated with underlying networking hardware and may be -returned directly from calling \f[C]fi_getinfo\f[R](3). +returned directly from calling \f[V]fi_getinfo\f[R](3). The format of fid_nic and the related substructures are defined below. .PP Note that not all fields of all structures may be available. @@ -135,7 +149,7 @@ Ethernet or InfiniBand. .PP Provider attributes reference provider specific details of the device. These attributes are both provider and device specific. -The attributes can be interpreted by \f[C]fi_tostr\f[R](3). +The attributes can be interpreted by \f[V]fi_tostr\f[R](3). Applications may also use the other attribute fields, such as related fi_fabric_attr: prov_name field, to determine an appropriate structure to cast the attributes. @@ -145,10 +159,10 @@ specific header file included with libfabric package. .SH NOTES .PP The fid_nic structure is returned as part of a call to -\f[C]fi_getinfo\f[R](3). -It is automatically freed as part of calling \f[C]fi_freeinfo\f[R](3) +\f[V]fi_getinfo\f[R](3). +It is automatically freed as part of calling \f[V]fi_freeinfo\f[R](3) .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3) +\f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_peer.3 b/man/man3/fi_peer.3 index 044390d40a7..8661ec75b72 100644 --- a/man/man3/fi_peer.3 +++ b/man/man3/fi_peer.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_peer" "3" "2024\-08\-27" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_peer" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -85,6 +99,24 @@ However, because the goal of using peer providers is to avoid overhead, providers must be explicitly written to support the peer provider mechanisms. 
.PP +When importing any shared fabric object into a peer, the owner will +create a separate fid_peer_* for each peer provider it intends to import +into. +The owner will pass this unique fid_peer_* into each peer through the +context parameter of the init call for the resource (i.e.\ fi_cq_open, +fi_srx_context, fi_cntr_open, etc.). +The fi_peer_*_context will indicate the owner-allocated fid_peer_* for +the peer to use, but it is temporary for the init call and may not be +accessed by the peer after initialization. +The peer will set just the peer_ops of the owner-allocated fid and save +a reference to the imported fid_peer_* for use in the peer API flow. +The peer will allocate its own fid for internal uses and return that fid +to the owner through the regular fid parameter of the init call (as if +it were just another opened resource). +The owner is responsible for saving the returned peer fid from the open +call in order to close it later (or to drive progress in the case of the +cq_fid). +.PP There are two peer provider models. In the example listed above, both peers are full providers in their own right and usable in a stand-alone fashion. @@ -255,8 +287,8 @@ If manual progress is needed on the peer CQ, the owner should drive progress by using the fi_cq_read() function with the buf parameter set to NULL and count equal 0. The peer provider should set other functions that attempt to read the -peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) to return --FI_ENOSYS. +peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) +to return -FI_ENOSYS. .SS fi_ops_cq_owner::write() .PP This call directs the owner to insert new completions into the CQ. @@ -347,8 +379,8 @@ Similar to the peer CQ, if manual progress is needed on the peer counter, the owner should drive progress by using the fi_cntr_read() and the fi_cntr_read() should do nothing but progress the peer cntr. The peer provider should set other functions that attempt to access the -peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) to return --FI_ENOSYS. +peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) +to return -FI_ENOSYS. .SS fi_ops_cntr_owner::inc() .PP This call directs the owner to increment the value of the cntr. @@ -783,9 +815,9 @@ callbacks.
.PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .TP fi_poll On success, if events are available, returns the number of entries @@ -392,7 +406,7 @@ The use of the fi_trywait() function is still required if accessing wait objects directly. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_profile.3 b/man/man3/fi_profile.3 index 3476bc77ddd..3eaa8532bb3 100644 --- a/man/man3/fi_profile.3 +++ b/man/man3/fi_profile.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_profile" "3" "2023\-10\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_profile" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -323,6 +337,6 @@ be returned. For fi_profile_query_vars and fi_profile_query_events, a positive return value indicates the number of variables or events returned in the list. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_provider.3 b/man/man3/fi_provider.3 index 50166da32ca..41e0289f423 100644 --- a/man/man3/fi_provider.3 +++ b/man/man3/fi_provider.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_provider" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_provider" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -183,7 +197,7 @@ the service or resource to which they correspond. The mr_cache object references the internal memory registration cache used by the different providers. Additional information on the cache is available in the -\f[C]fi_mr(3)\f[R] man page. +\f[V]fi_mr(3)\f[R] man page. .TP \f[I]logging\f[R] The logging object references the internal logging subsystem used by the @@ -193,8 +207,8 @@ Can be opened only once and only the last import is used if imported multiple times. .SS fi_import .PP -This helper function is a combination of \f[C]fi_open\f[R] and -\f[C]fi_import_fid\f[R]. +This helper function is a combination of \f[V]fi_open\f[R] and +\f[V]fi_import_fid\f[R]. It may be used to import a fabric object created and owned by the libfabric user. This allows the upper level libraries or the application to override or @@ -264,9 +278,9 @@ For integrated providers .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. 
.SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) \f[C]fi_mr\f[R](3), +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) \f[V]fi_mr\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_rma.3 b/man/man3/fi_rma.3 index f9abc8b5ba7..39f2d3a52ec 100644 --- a/man/man3/fi_rma.3 +++ b/man/man3/fi_rma.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rma" "3" "2023\-11\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rma" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -78,7 +92,7 @@ FI_MR_SCALABLE. Protection key associated with the remote memory. .TP \f[I]desc\f[R] -Descriptor associated with the local data buffer See \f[C]fi_mr\f[R](3). +Descriptor associated with the local data buffer See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the operation. @@ -175,7 +189,7 @@ struct fi_rma_iov { .PP The write inject call is an optimized version of fi_write. It provides similar completion semantics as fi_inject -\f[C]fi_msg\f[R](3). +\f[V]fi_msg\f[R](3). .SS fi_writedata .PP The write data call is similar to fi_write, but allows for the sending @@ -276,15 +290,15 @@ operation (inclusive) to the posting of a subsequent fenced operation .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_tagged.3 b/man/man3/fi_tagged.3 index 62d48a21298..32624c6fc5f 100644 --- a/man/man3/fi_tagged.3 +++ b/man/man3/fi_tagged.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_tagged" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_tagged" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -74,7 +88,7 @@ Mask of bits to ignore applied to the tag for receive operations. .TP \f[I]desc\f[R] Memory descriptor associated with the data buffer. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent data. @@ -199,7 +213,7 @@ struct fi_msg_tagged { .PP The tagged inject call is an optimized version of fi_tsend. It provides similar completion semantics as fi_inject -\f[C]fi_msg\f[R](3). +\f[V]fi_msg\f[R](3). 
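.PP
As a brief sketch of the tag matching described above, a sender might
encode a peer id in the upper tag bits while the receiver masks off the
lower bits; the endpoint, buffers, addresses, and context variables are
assumed to be set up elsewhere.
.IP
.nf
\f[C]
/* Sender: upper 32 bits carry the rank, lower 32 a message id. */
uint64_t tag = ((uint64_t) my_rank << 32) | msg_id;
ret = fi_tsend(ep, sbuf, len, NULL, dest_addr, tag, &tx_ctx);

/* Receiver: match on the rank bits only; bits set in the ignore
 * mask are not compared during matching. */
ret = fi_trecv(ep, rbuf, len, NULL, src_addr,
        (uint64_t) peer_rank << 32, 0xffffffffULL, &rx_ctx);
\f[R]
.fi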
.SS fi_tsenddata .PP The tagged send data call is similar to fi_tsend, but allows for the @@ -269,6 +283,26 @@ This may require that the underlying provider implementation copy the data into a local buffer and transfer out of that buffer. This flag can only be used with messages smaller than inject_size. .TP +\f[I]FI_MULTI_RECV\f[R] +Applies to posted tagged receive operations when the +FI_TAGGED_MULTI_RECV capability is enabled. +This flag allows the user to post a single tagged receive buffer that +will receive multiple incoming messages. +Received messages will be packed into the receive buffer until the +buffer has been consumed. +Use of this flag may cause a single posted receive operation to generate +multiple events as messages are placed into the buffer. +The placement of received data into the buffer may be subjected to +provider specific alignment restrictions. +.PP +The buffer will be released by the provider when the available buffer +space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). +Note that an entry to the associated receive completion queue will +always be generated when the buffer has been consumed, even if other +receive completions have been suppressed (i.e.\ the Rx context has been +configured for FI_SELECTIVE_COMPLETION). +See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). +.TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_tsendmsg. Indicates that a completion should be generated when the source @@ -301,7 +335,8 @@ operation (inclusive) to the posting of a subsequent fenced operation .TP \f[I]FI_AUTH_KEY\f[R] Only valid with domains configured with FI_AV_AUTH_KEY and -connectionless endpoints configured with FI_DIRECTED_RECV. +connectionless endpoints configured with FI_DIRECTED_RECV or +FI_TAGGED_DIRECTED_RECV. When used with fi_trecvmsg, this flag denotes that the src_addr is an authorization key fi_addr_t instead of an endpoint fi_addr_t. .PP @@ -360,11 +395,11 @@ ignored. The tagged send and receive calls return 0 on success. On error, a negative value corresponding to fabric \f[I]errno \f[R] is returned. -Fabric errno values are defined in \f[C]fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EINVAL\f[R] @@ -374,7 +409,7 @@ Indicates that an invalid argument was supplied by the user. Indicates that an unspecified error occurred. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_trigger.3 b/man/man3/fi_trigger.3 index 68586cd026a..0e18caa6399 100644 --- a/man/man3/fi_trigger.3 +++ b/man/man3/fi_trigger.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_trigger" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_trigger" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -199,7 +213,7 @@ If a specific request is not supported by the provider, it will fail the operation with -FI_ENOSYS. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_mr\f[R](3), -\f[C]fi_alias\f[R](3), \f[C]fi_cntr\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_mr\f[R](3), +\f[V]fi_alias\f[R](3), \f[V]fi_cntr\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_version.3 b/man/man3/fi_version.3 index c188f06b210..cb94827f08c 100644 --- a/man/man3/fi_version.3 +++ b/man/man3/fi_version.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_version" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_version" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -34,6 +48,6 @@ The upper 16-bits of the version correspond to the major number, and the lower 16-bits correspond with the minor number. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fabric.7 b/man/man7/fabric.7 index 0d8f5686769..49c45d804c5 100644 --- a/man/man7/fabric.7 +++ b/man/man7/fabric.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fabric" "7" "2024\-03\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fabric" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -16,7 +30,7 @@ fabric - Fabric Interface Library Libfabric is a high-performance fabric software library designed to provide low-latency interfaces to fabric hardware. For an in-depth discussion of the motivation and design see -\f[C]fi_guide\f[R](7). +\f[V]fi_guide\f[R](7). .SH OVERVIEW .PP Libfabric provides `process direct I/O' to application software @@ -37,7 +51,7 @@ All fabric hardware devices and their software drivers are required to support this framework. Devices and the drivers that plug into the libfabric framework are referred to as fabric providers, or simply providers. -Provider details may be found in \f[C]fi_provider\f[R](7). +Provider details may be found in \f[V]fi_provider\f[R](7). .TP \f[I]Fabric Interfaces\f[R] The second component is a set of communication operations. @@ -282,18 +296,18 @@ If the list begins with the `\[ha]' symbol, then the list will be negated. .PP Example: To enable the udp and tcp providers only, set: -\f[C]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] +\f[V]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] .PP When libfabric is installed, DL providers are put under the \f[I]default provider path\f[R], which is determined by how libfabric is built and installed. Usually the default provider path is -\f[C]/lib/libfabric\f[R] or -\f[C]/lib64/libfabric\f[R]. +\f[V]/lib/libfabric\f[R] or +\f[V]/lib64/libfabric\f[R]. 
By default, libfabric tries to find DL providers in the following order: .IP "1." 3 Use `dlopen' to load provider libraries named -\f[C]lib-fi.so\f[R] for all providers enabled at build time. +\f[V]lib-fi.so\f[R] for all providers enabled at build time. The search path of `ld.so' is used to locate the files. This step is skipped if libfabric is configured with the option `\[en]enable-restricted-dl'. @@ -363,7 +377,7 @@ can be used to retrieve information about which providers are available in the system. Additionally, it can retrieve a list of all environment variables that may be used to configure libfabric and each provider. -See \f[C]fi_info\f[R](1) for more details. +See \f[V]fi_info\f[R](1) for more details. .SH ENVIRONMENT VARIABLE CONTROLS .PP Core features of libfabric and its providers may be configured by an @@ -400,22 +414,22 @@ may not be available in a child process because of copy on write restrictions. .SS CUDA deadlock .PP -In some cases, calls to \f[C]cudaMemcpy()\f[R] within libfabric may +In some cases, calls to \f[V]cudaMemcpy()\f[R] within libfabric may result in a deadlock. This typically occurs when a CUDA kernel blocks until a -\f[C]cudaMemcpy\f[R] on the host completes. +\f[V]cudaMemcpy\f[R] on the host completes. Applications which can cause such behavior can restrict Libfabric\[cq]s ability to invoke CUDA API operations with the endpoint option -\f[C]FI_OPT_CUDA_API_PERMITTED\f[R]. -See \f[C]fi_endpoint\f[R](3) for more details. +\f[V]FI_OPT_CUDA_API_PERMITTED\f[R]. +See \f[V]fi_endpoint\f[R](3) for more details. .PP Another mechanism which can be used to avoid deadlock is Nvidia\[cq]s GDRCopy. Using GDRCopy requires an external library and kernel module available at https://github.com/NVIDIA/gdrcopy. Libfabric must be configured with GDRCopy support using the -\f[C]--with-gdrcopy\f[R] option, and be run with -\f[C]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. +\f[V]--with-gdrcopy\f[R] option, and be run with +\f[V]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. This may not be supported by all providers. .SH ABI CHANGES .PP @@ -499,11 +513,19 @@ Added new fields to the following attributes: .TP \f[I]fi_domain_attr\f[R] Added max_ep_auth_key +.SS ABI 1.8 +.PP +ABI version starting with libfabric 2.0. +Added new fi_fabric2 API call. +Added new fields to the following attributes: +.TP +\f[I]fi_domain_attr\f[R] +Added max_group_id .SH SEE ALSO .PP -\f[C]fi_info\f[R](1), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3), -\f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_av\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_mr\f[R](3) +\f[V]fi_info\f[R](1), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3), +\f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_av\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_mr\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_arch.7 b/man/man7/fi_arch.7 index 7a749d4fca1..21dc5ee4b21 100644 --- a/man/man7/fi_arch.7 +++ b/man/man7/fi_arch.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_arch" "7" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_arch" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .IP .nf diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index c0ad9d32a0d..e6b9c6717e3 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -1,7 +1,21 @@ -.\"t -.\" Automatically generated by Pandoc 2.9.2.1 +'\" t +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cxi" "7" "2024\-03\-21" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cxi" "7" "2025\-01\-08" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -83,6 +97,8 @@ types. FI_WAIT_UNSPEC will default to FI_WAIT_FD. However FI_WAIT_NONE should achieve the lowest latency and reduce interrupt overhead. +NOTE: A process may return from an epoll_wait/poll when provider +progress is required and a CQ event may not be available. .SS Additional Features .PP The CXI provider also supports the following capabilities and features: @@ -176,7 +192,7 @@ Classes. .PP While a libfabric user provided authorization key is optional, it is highly encouraged that libfabric users provide an authorization key -through the domain attribute hints during \f[C]fi_getinfo()\f[R]. +through the domain attribute hints during \f[V]fi_getinfo()\f[R]. How libfabric users acquire the authorization key may vary between the users and is outside the scope of this document. .PP @@ -192,18 +208,18 @@ authorization key using them. .IP \[bu] 2 \f[I]SLINGSHOT_VNIS\f[R]: Comma separated list of VNIs. The CXI provider will only use the first VNI if multiple are provided. -Example: \f[C]SLINGSHOT_VNIS=234\f[R]. +Example: \f[V]SLINGSHOT_VNIS=234\f[R]. .IP \[bu] 2 \f[I]SLINGSHOT_DEVICES\f[R]: Comma separated list of device names. Each device index will use the same index to lookup the service ID in \f[I]SLINGSHOT_SVC_IDS\f[R]. -Example: \f[C]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. +Example: \f[V]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. .IP \[bu] 2 \f[I]SLINGSHOT_SVC_IDS\f[R]: Comma separated list of pre-configured CXI service IDs. Each service ID index will use the same index to lookup the CXI device in \f[I]SLINGSHOT_DEVICES\f[R]. -Example: \f[C]SLINGSHOT_SVC_IDS=5,6\f[R]. +Example: \f[V]SLINGSHOT_SVC_IDS=5,6\f[R]. .PP \f[B]Note:\f[R] How valid VNIs and device services are configured is outside the responsibility of the CXI provider. @@ -442,53 +458,25 @@ hybrid RX match modes increase Request buffer space using the variables \f[I]FI_CXI_REQ_*\f[R]. .SS Message Ordering .PP -The CXI provider supports the following ordering rules: -.IP \[bu] 2 -All message Send operations are always ordered. -.IP \[bu] 2 -RMA Writes may be ordered by specifying \f[I]FI_ORDER_RMA_WAW\f[R]. -.IP \[bu] 2 -AMOs may be ordered by specifying -\f[I]FI_ORDER_AMO_{WAW|WAR|RAW|RAR}\f[R]. -.IP \[bu] 2 -RMA Writes may be ordered with respect to AMOs by specifying -\f[I]FI_ORDER_WAW\f[R]. -Fetching AMOs may be used to perform short reads that are ordered with -respect to RMA Writes. +Supported message ordering: FI_ORDER_SAS, FI_ORDER_WAW, +FI_ORDER_RMA_WAW, FI_ORDER_RMA_RAR, FI_ORDER_ATOMIC_WAW, and +FI_ORDER_ATOMIC_RAR. +.PP +Note: Any FI_ORDER_*_{WAR,RAW} are not supported. +.PP +Note: Relaxing the message ordering may result in improved performance.
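.PP
As a minimal sketch, an application targeting the CXI provider might
restrict its hints to this supported ordering set; the hints handling
shown is illustrative only.
.IP
.nf
\f[C]
struct fi_info *hints = fi_allocinfo();

hints->fabric_attr->prov_name = strdup("cxi");
/* Send-after-send plus ordered RMA and AMO writes; leave the
 * unsupported RAW/WAR bits clear. */
hints->tx_attr->msg_order = FI_ORDER_SAS | FI_ORDER_RMA_WAW |
        FI_ORDER_ATOMIC_WAW;
hints->rx_attr->msg_order = FI_ORDER_SAS;
\f[R]
.fi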
+.SS Target Ordering .PP Ordered RMA size limits are set as follows: .IP \[bu] 2 \f[I]max_order_waw_size\f[R] is -1. -RMA Writes and non-fetching AMOs of any size are ordered with respect to -each other. -.IP \[bu] 2 -\f[I]max_order_raw_size\f[R] is -1. -Fetching AMOs of any size are ordered with respect to RMA Writes and -non-fetching AMOs. -.IP \[bu] 2 -\f[I]max_order_war_size\f[R] is -1. -RMA Writes and non-fetching AMOs of any size are ordered with respect to -fetching AMOs. -.SS PCIe Ordering -.PP -Generally, PCIe writes are strictly ordered. -As an optimization, PCIe TLPs may have the Relaxed Order (RO) bit set to -allow writes to be reordered. -Cassini sets the RO bit in PCIe TLPs when possible. -Cassini sets PCIe RO as follows: -.IP \[bu] 2 -Ordering of messaging operations is established using completion events. -Therefore, all PCIe TLPs related to two-sided message payloads will have -RO set. -.IP \[bu] 2 -Every PCIe TLP associated with an unordered RMA or AMO operation will -have RO cleared. -.IP \[bu] 2 -PCIe TLPs associated with the last packet of an ordered RMA or AMO -operation will have RO cleared. -.IP \[bu] 2 -PCIe TLPs associated with the body packets (all except the last packet -of an operation) of an ordered RMA operation will have RO set. +RMA Writes and AMO writes of any size are ordered with respect to each +other. +.PP +Note: Due to FI_ORDER_*_{WAR,RAW} not being supported, +max_order_{raw,war}_size are forced to zero. +.PP +Note: Relaxing the target ordering may result in improved performance. .SS Translation .PP The CXI provider supports two translation mechanisms: Address @@ -529,14 +517,14 @@ Copy-on-write semantics are broken when using pinned memory. See the Fork section for more information. .PP The CXI provider supports DMABUF for device memory registration. -If the ROCR and CUDA libraries support it, the CXI provider will default -to use DMA-buf. +DMABUF is supported in ROCm 5.6+ and Cuda 11.7+ with nvidia open source +driver 525+. +Both \f[I]FI_HMEM_ROCR_USE_DMABUF\f[R] and \f[I]FI_HMEM_CUDA_USE_DMABUF\f[R] +are disabled by default in libfabric core, but the CXI provider enables +\f[I]FI_HMEM_ROCR_USE_DMABUF\f[R] by default if not specifically set. There may be situations with CUDA that double the BAR consumption. -Until this is fixed in the CUDA stack, the environment variable -\f[I]FI_CXI_DISABLE_DMABUF_CUDA\f[R] can be used to fall back to the -nvidia peer-memory interface. -Also, \f[I]FI_CXI_DISABLE_DMABUF_ROCR\f[R] can be used to fall back to -the amdgpu peer-memory interface. +Until this is fixed in the CUDA stack, CUDA DMABUF will be disabled by +default. .SS Translation Cache .PP Mapping a buffer for use by the NIC is an expensive operation. @@ -608,7 +596,7 @@ into the fi_control(FI_QUEUE_WORK) critical path. The following subsections outline the CXI provider fork support. .SS RDMA and Fork Overview .PP -Under Linux, \f[C]fork()\f[R] is implemented using copy-on-write (COW) +Under Linux, \f[V]fork()\f[R] is implemented using copy-on-write (COW) pages, so the only penalty that it incurs is the time and memory required to duplicate the parent\[cq]s page tables, mark all of the process\[cq]s page structs as read only and COW, and create a unique
-- Linux \f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC +- Linux \f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC support for on-demand paging (ODP) .SS Linux madvise() MADV_DONTFORK and MADV_DOFORK .PP The generic (i.e.\ non-vendor specific) RDMA NIC solution to the Linux COW fork policy and RDMA problem is to use the following -\f[C]madvise()\f[R] operations during memory registration and +\f[V]madvise()\f[R] operations during memory registration and deregistration: - MADV_DONTFORK: Do not make the pages in this range -available to the child after a \f[C]fork()\f[R]. +available to the child after a \f[V]fork()\f[R]. This is useful to prevent copy-on-write semantics from changing the physical location of a page if the parent writes to it after a -\f[C]fork()\f[R]. +\f[V]fork()\f[R]. (Such page relocations cause problems for hardware that DMAs into the -page.) - MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the -default behavior, whereby a mapping is inherited across -\f[C]fork()\f[R]. +page.) +- MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the default +behavior, whereby a mapping is inherited across \f[V]fork()\f[R]. .PP In the Linux kernel, MADV_DONTFORK will result in the virtual memory area struct (VMA) being marked with the VM_DONTCOPY flag. @@ -677,14 +665,14 @@ Should the child reference the virtual address corresponding to the VMA which was not duplicated, it will segfault. .PP In the previous example, if Process A issued -\f[C]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing +\f[V]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing RDMA memory registration, the physical address 0x1000 would have remained with Process A. This would prevent the Process A data corruption as well. If Process B were to reference virtual address 0xffff0000, it will segfault due to the hole in the virtual address space. .PP -Using \f[C]madvise()\f[R] with MADV_DONTFORK may be problematic for +Using \f[V]madvise()\f[R] with MADV_DONTFORK may be problematic for applications performing RDMA and page aliasing. Page aliasing is where the parent process uses part or all of a page to share information with the child process. @@ -738,7 +726,7 @@ The CXI provider is subjected to the Linux COW fork policy and RDMA issues described in section \f[I]RDMA and Fork Overview\f[R]. To prevent data corruption with fork, the CXI provider supports the following options: - CXI specific fork environment variables to enable -\f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* +\f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* .PP **Formal ODP support pending.* .SS CXI Specific Fork Environment Variables @@ -746,27 +734,27 @@ following options: - CXI specific fork environment variables to enable The CXI software stack has two environment variables related to fork: - CXI_FORK_SAFE: Enables base fork safe support. With this environment variable set, regardless of value, libcxi will -issue \f[C]madvise()\f[R] with MADV_DONTFORK on the virtual address +issue \f[V]madvise()\f[R] with MADV_DONTFORK on the virtual address range being registered for RDMA. -In addition, libcxi always align the \f[C]madvise()\f[R] to the system +In addition, libcxi always aligns the \f[V]madvise()\f[R] to the system default page size. On x86, this is 4 KiB.
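The madvise() mechanics described here are easy to see in a stand-alone sketch. This is generic Linux code for illustration only (libcxi performs the equivalent calls internally when CXI_FORK_SAFE is set); the buffer size and allocation are assumptions:

```c
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

/* Generic MADV_DONTFORK pattern: keep an RDMA buffer's physical pages
 * with the parent across fork(), at the cost of the child segfaulting
 * if it touches the range. */
int main(void)
{
    long page = sysconf(_SC_PAGESIZE);          /* 4 KiB on x86 */
    void *buf = aligned_alloc(page, 2 * page);  /* page-aligned range */
    if (!buf)
        return 1;

    /* Before registering buf for RDMA: child will not inherit it. */
    if (madvise(buf, 2 * page, MADV_DONTFORK))
        return 1;

    /* ... register buf, perform RDMA, deregister ... */

    /* After deregistration: restore inheritance across fork(). */
    madvise(buf, 2 * page, MADV_DOFORK);
    free(buf);
    return 0;
}
```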
-To prevent redundant \f[C]madvise()\f[R] calls with MADV_DONTFORK +To prevent redundant \f[V]madvise()\f[R] calls with MADV_DONTFORK against the same virtual address region, reference counting is used -against each tracked \f[C]madvise()\f[R] region. -In addition, libcxi will spilt and merge tracked \f[C]madvise()\f[R] +against each tracked \f[V]madvise()\f[R] region. +In addition, libcxi will split and merge tracked \f[V]madvise()\f[R] regions if needed. Once the reference count reaches zero, libcxi will call -\f[C]madvise()\f[R] with MADV_DOFORK, and no longer track the region. +\f[V]madvise()\f[R] with MADV_DOFORK, and no longer track the region. - CXI_FORK_SAFE_HP: With this environment variable set, in conjunction with CXI_FORK_SAFE, libcxi will not assume the page size is the system default page size. -Instead, libcxi will walk \f[C]/proc//smaps\f[R] to determine the -correct page size and align the \f[C]madvise()\f[R] calls accordingly. +Instead, libcxi will walk \f[V]/proc//smaps\f[R] to determine the +correct page size and align the \f[V]madvise()\f[R] calls accordingly. This environment variable should be set if huge pages are being used for RDMA. To amortize the per memory registration walk of -\f[C]/proc//smaps\f[R], the libfabric MR cache should be used. +\f[V]/proc//smaps\f[R], the libfabric MR cache should be used. .PP Setting these environment variables will prevent data corruption when the parent issues a fork. @@ -800,7 +788,7 @@ transfer. The following is the CXI provider fork support guidance: - Enable CXI_FORK_SAFE. If huge pages are also used, CXI_FORK_SAFE_HP should be enabled as well. -Since enabling this will result in \f[C]madvice()\f[R] with +Since enabling this will result in \f[V]madvise()\f[R] with MADV_DONTFORK, the following steps should be taken to prevent a child process segfault: - Avoid using stack memory for RDMA - Avoid child process having to access a virtual address range the parent process is @@ -1172,6 +1160,11 @@ offloading are met. .PP The CXI provider checks for the following environment variables: .TP +\f[I]FI_CXI_MR_TARGET_ORDERING\f[R] +MR target ordering (i.e.\ PCIe ordering). +Options: default, strict, or relaxed. +Recommendation is to leave at default behavior. +.TP \f[I]FI_CXI_ODP\f[R] Enables on-demand paging. If disabled, all DMA buffers are pinned. @@ -1551,6 +1544,18 @@ GPU direct RDMA may or may not work in this case. Enable enforcement of triggered operation limit. Doing this can prevent fi_control(FI_QUEUE_WORK) deadlocking at the cost of performance. +.TP +\f[I]FI_CXI_MR_CACHE_EVENTS_DISABLE_POLL_NSECS\f[R] +Maximum amount of time to poll when disabling an MR configured with MR match +events. +.TP +\f[I]FI_CXI_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS\f[R] +Maximum amount of time to poll when disabling, via LE invalidate, an MR +configured with MR match events. +.TP +\f[I]FI_CXI_FORCE_DEV_REG_COPY\f[R] +Force the CXI provider to use the HMEM device register copy routines. +If not supported, RDMA operations or memory registration will fail. .PP Note: Use the fi_info utility to query provider environment variables: fi_info -p cxi -e @@ -1616,7 +1621,7 @@ It can only be changed prior to any MR being created. .PP CXI domain extensions have been named \f[I]FI_CXI_DOM_OPS_6\f[R]. The flags parameter is ignored. -The fi_open_ops function takes a \f[C]struct fi_cxi_dom_ops\f[R]. +The fi_open_ops function takes a \f[V]struct fi_cxi_dom_ops\f[R].
See an example of usage below: .IP .nf @@ -1646,10 +1651,10 @@ struct fi_cxi_dom_ops { \f[R] .fi .PP -\f[I]cntr_read\f[R] extension is used to read hardware counter values. -Valid values of the cntr argument are found in the Cassini-specific -header file cassini_cntr_defs.h. -Note that Counter accesses by applications may be rate-limited to 1HZ. +\f[I]cntr_read\f[R] extension is used to read Cassini Telemetry items +that consist of counters and gauges. +The items available and their content are dependent upon the Cassini +ASIC version and Cassini Driver version. .PP \f[I]topology\f[R] extension is used to return CXI NIC address topology information for the domain. @@ -1709,7 +1714,7 @@ removed from the domain opts prior to software release 2.2. .PP CXI counter extensions have been named \f[I]FI_CXI_COUNTER_OPS\f[R]. The flags parameter is ignored. -The fi_open_ops function takes a \f[C]struct fi_cxi_cntr_ops\f[R]. +The fi_open_ops function takes a \f[V]struct fi_cxi_cntr_ops\f[R]. See an example of usage below. .IP .nf @@ -1824,12 +1829,6 @@ if (ret) \f[R] .fi .PP -When an endpoint does not support FI_FENCE (e.g.\ optimized MR), a -provider specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on -an alias EP to issue a FENCE operation to create a data ordering point -for the alias. -This is supported for one-sided operations only. -.PP Alias EP must be closed prior to closing the original EP. .SS PCIe Atomics .PP memory operation as a PCIe operation as compared to a NIC operation. The CXI provider extension flag FI_CXI_PCIE_AMO is used to signify this. .PP Since not all libfabric atomic memory operations can be executed as a -PCIe atomic memory operation, \f[C]fi_query_atomic()\f[R] could be used +PCIe atomic memory operation, \f[V]fi_query_atomic()\f[R] could be used to query if a given libfabric atomic memory operation could be executed as a PCIe atomic memory operation. .PP @@ -1888,7 +1887,7 @@ module parameter must be set to non-zero. .IP .nf \f[C] -/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +/sys/module/cxi_ss1/parameters/amo_remap_to_pcie_fadd \f[R] .fi .PP @@ -2156,6 +2155,6 @@ In this case, the target NIC is reachable. FI_EIO: Catch all errno. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_direct.7 b/man/man7/fi_direct.7 index 66415f928c5..a419ebf1931 100644 --- a/man/man7/fi_direct.7 +++ b/man/man7/fi_direct.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_direct" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_direct" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -71,7 +85,7 @@ The provider sets FI_LOCAL_MR for fi_info:mode. See fi_getinfo for additional details. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3) .SH AUTHORS OpenFabrics.
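Returning to the fi_cxi PCIe Atomics text above, which says fi_query_atomic() can report whether an atomic could execute as a PCIe AMO: the following is a hedged sketch of such a query. The header location for FI_CXI_PCIE_AMO and the already-open `domain` are assumptions:

```c
#include <rdma/fi_atomic.h>
#include <rdma/fi_cxi_ext.h>  /* assumed home of FI_CXI_PCIE_AMO */

/* Hedged sketch: nonzero if a 64-bit fetch-add could be executed as a
 * PCIe atomic memory operation on this domain. */
static int pcie_fadd_supported(struct fid_domain *domain)
{
    struct fi_atomic_attr attr;

    return fi_query_atomic(domain, FI_UINT64, FI_SUM, &attr,
                           FI_FETCH_ATOMIC | FI_CXI_PCIE_AMO) == 0;
}
```

A zero return means the datatype/op pair is valid under those flags; per the text, the amo_remap_to_pcie_fadd module parameter must also be set to non-zero for PCIe fetch-add to function.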
diff --git a/man/man7/fi_efa.7 b/man/man7/fi_efa.7 index 43214ca5ef3..6d6f780b3c7 100644 --- a/man/man7/fi_efa.7 +++ b/man/man7/fi_efa.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_efa" "7" "2024\-08\-01" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_efa" "7" "2025\-01\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -42,8 +56,12 @@ maximum message size of the MTU of the underlying hardware (approximately 8 KiB). .TP \f[I]Address vectors\f[R] -The provider supports \f[I]FI_AV_TABLE\f[R] and \f[I]FI_AV_MAP\f[R] -address vector types. +The provider supports \f[I]FI_AV_TABLE\f[R]. +\f[I]FI_AV_MAP\f[R] was deprecated in Libfabric 2.x. +Applications can still use \f[I]FI_AV_MAP\f[R] to create an address +vector, but the EFA provider implementation will print a warning and +switch to \f[I]FI_AV_TABLE\f[R]. \f[I]FI_EVENT\f[R] is unsupported. .TP \f[I]Completion events\f[R] @@ -95,7 +113,7 @@ No support for counters for the DGRAM endpoint. No support for inject. .SS zero-copy receive mode .IP \[bu] 2 -The receive operation cannot be cancelled via \f[C]fi_cancel()\f[R]. +The receive operation cannot be cancelled via \f[V]fi_cancel()\f[R]. .IP \[bu] 2 Zero-copy receive mode can be enabled only if SHM transfer is disabled. .IP \[bu] 2 @@ -166,12 +184,12 @@ If endpoint is not able to support this feature, it will return .PP The efa provider exports extensions for operations that are not provided by the standard libfabric interface. -These extensions are available via the \[lq]\f[C]fi_ext_efa.h\f[R]\[rq] +These extensions are available via the \[lq]\f[V]fi_ext_efa.h\f[R]\[rq] header file. .SS Domain Operation Extension .PP -Domain operation extension is obtained by calling \f[C]fi_open_ops\f[R] -(see \f[C]fi_domain(3)\f[R]) +Domain operation extension is obtained by calling \f[V]fi_open_ops\f[R] +(see \f[V]fi_domain(3)\f[R]) .IP .nf \f[C] int fi_open_ops(struct fid *domain, const char *name, uint64_t flags, \f[R] .fi .PP -and requesting \f[C]FI_EFA_DOMAIN_OPS\f[R] in \f[C]name\f[R]. -\f[C]fi_open_ops\f[R] returns \f[C]ops\f[R] as the pointer to the -function table \f[C]fi_efa_ops_domain\f[R] defined as follows: +and requesting \f[V]FI_EFA_DOMAIN_OPS\f[R] in \f[V]name\f[R]. +\f[V]fi_open_ops\f[R] returns \f[V]ops\f[R] as the pointer to the +function table \f[V]fi_efa_ops_domain\f[R] defined as follows: .IP .nf \f[C] @@ -224,24 +242,30 @@ FI_EFA_MR_ATTR_RDMA_RECV_IC_ID: rdma_recv_ic_id has a valid value. \f[I]recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for receive operation. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. .TP \f[I]rdma_read_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA read operation. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. .TP \f[I]rdma_recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA write receive.
-It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. .SS Return value .PP \f[B]query_mr()\f[R] returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH Traffic Class (tclass) in EFA +.PP +To prioritize the messages from a given endpoint, users can specify +\f[V]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the +fi_endpoint() call to set the service level in rdma-core. +All other tclass values will be ignored. .SH RUNTIME PARAMETERS .TP \f[I]FI_EFA_IFACE\f[R] @@ -322,7 +346,7 @@ to a peer after a receiver not ready error. Enable SHM provider to provide the communication across all intra-node processes. SHM transfer will be disabled in the case where -\f[C]ptrace protection\f[R] is turned on. +\f[V]ptrace protection\f[R] is turned on. You can turn it off to enable shm transfer. .PP FI_EFA_ENABLE_SHM_TRANSFER is parsed during the fi_domain call and is @@ -415,8 +439,14 @@ Use device\[cq]s unsolicited write recv functionality when it\[cq]s available. (Default: 1). Setting this environment variable to 0 can disable this feature. +.TP +\f[I]FI_EFA_INTERNAL_RX_REFILL_THRESHOLD\f[R] +The threshold at which the EFA provider refills the internal rx pkt pool. +(Default: 8). +When the number of internal rx pkts to post is lower than this +threshold, the refill will be skipped. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_guide.7 b/man/man7/fi_guide.7 index 9706838dfd4..2e9ac7ada01 100644 --- a/man/man7/fi_guide.7 +++ b/man/man7/fi_guide.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_guide" "7" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_guide" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -20,16 +34,16 @@ This guide describes the libfabric architecture and interfaces. Due to the length of the guide, it has been broken into multiple pages. These sections are: .TP -\f[I]Introduction \f[BI]\f[CBI]fi_intro\f[BI]\f[I](7)\f[R] +\f[I]Introduction \f[VI]fi_intro\f[I](7)\f[R] This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. .TP -\f[I]Architecture \f[BI]\f[CBI]fi_arch\f[BI]\f[I](7)\f[R] +\f[I]Architecture \f[VI]fi_arch\f[I](7)\f[R] This describes the exposed architecture of libfabric, including the object-model and their related operations .TP -\f[I]Setup \f[BI]\f[CBI]fi_setup\f[BI]\f[I](7)\f[R] +\f[I]Setup \f[VI]fi_setup\f[I](7)\f[R] This provides basic bootstrapping and setup for using the libfabric API. .SH AUTHORS OpenFabrics.
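As a small illustration of the fi_efa(7) Traffic Class note above, here is a hedged sketch of requesting FI_TC_LOW_LATENCY at endpoint creation. The opened `domain`, the chosen `info`, and error handling are assumed context:

```c
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Hedged sketch: EFA honors only FI_TC_LOW_LATENCY; any other tclass
 * value is ignored, per the text above. */
static int open_low_latency_ep(struct fid_domain *domain,
                               struct fi_info *info, struct fid_ep **ep)
{
    info->tx_attr->tclass = FI_TC_LOW_LATENCY;
    return fi_endpoint(domain, info, ep, NULL);
}
```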
diff --git a/man/man7/fi_hook.7 b/man/man7/fi_hook.7 index cd628f4e22e..18eb9a9dbce 100644 --- a/man/man7/fi_hook.7 +++ b/man/man7/fi_hook.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_hook" "7" "2023\-04\-26" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_hook" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -166,6 +180,6 @@ Application that use FI_TRIGGER operations that attempt to hook calls will likely crash. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_intro.7 b/man/man7/fi_intro.7 index 97bca65b254..3a6dd2507ba 100644 --- a/man/man7/fi_intro.7 +++ b/man/man7/fi_intro.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_intro" "7" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_intro" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -8,7 +22,7 @@ fi_intro - libfabric introduction .SH OVERVIEW .PP This introduction is part of the libfabric\[cq]s programmer\[cq]s guide. -See \f[C]fi_guide\f[R](7). +See \f[V]fi_guide\f[R](7). This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. @@ -1124,9 +1138,9 @@ If an application is using 1000 endpoints and posts 100 buffers, each 4 KB, that results in 400 MB of memory space being consumed to receive data. (We can start to realize that by eliminating memory copies, one of the -trade offs is increased memory consumption.) While 400 MB seems like a -lot of memory, there is less than half a megabyte allocated to a single -receive queue. +trade offs is increased memory consumption.) +While 400 MB seems like a lot of memory, there is less than half a +megabyte allocated to a single receive queue. At today\[cq]s networking speeds, that amount of space can be consumed within milliseconds. The result is that if only a few endpoints are in use, the application @@ -1415,6 +1429,6 @@ but it does allow for optimizing network utilization. Libfabric is well architected to support the previously discussed features. For further information on the libfabric architecture, see the next -programmer\[cq]s guide section: \f[C]fi_arch\f[R](7). +programmer\[cq]s guide section: \f[V]fi_arch\f[R](7). .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_lnx.7 b/man/man7/fi_lnx.7 new file mode 100644 index 00000000000..21ebfcbe09a --- /dev/null +++ b/man/man7/fi_lnx.7 @@ -0,0 +1,190 @@ +.\" Automatically generated by Pandoc 3.1.3 +.\" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_lnx" "7" "2024\-12\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.hy +.SH NAME +.PP +fi_lnx - The LINKx (LNX) Provider +.SH OVERVIEW +.PP +The LNX provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. +This provider uses the libfabric peer infrastructure to aid in the use +of the underlying providers. +This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. +Future releases of the provider will allow linking any number of +providers and provide the users with the ability to influence the way +the providers are utilized for traffic load. +.SH SUPPORTED FEATURES +.PP +This release contains an initial implementation of the LNX provider that +offers the following support: +.TP +\f[I]Endpoint types\f[R] +The provider supports only endpoint type \f[I]FI_EP_RDM\f[R]. +.TP +\f[I]Endpoint capabilities\f[R] +LNX is a passthrough layer on the send path. +On the receive path LNX utilizes the peer infrastructure to create +shared receive queues (SRQ). +Receive requests are placed on the SRQ instead of on the core provider +receive queue. +When the provider receives a message, it queries the SRQ for a match. +If one is found, the receive request is completed; otherwise the message +is placed on the LNX shared unexpected queue (SUQ). +Further receive requests query the SUQ for matches. +The first release of the provider only supports tagged and RMA +operations. +Other message types will be supported in future releases. +.TP +\f[I]Modes\f[R] +The provider does not require the use of any mode bits. +.TP +\f[I]Progress\f[R] +LNX utilizes the peer infrastructure to provide a shared completion +queue. +Each linked provider still needs to handle its own progress. +Completion events will, however, be placed on the shared completion queue, +which is passed to the application for access. +.TP +\f[I]Address Format\f[R] +LNX wraps the linked providers\[cq] addresses in one common binary blob. +It does not alter or change the linked providers\[cq] address format. +It wraps them into an LNX structure which is then flattened and returned +to the application. +This is passed between different nodes. +The LNX provider is able to parse the flattened format and operate on +the different links. +This assumes that nodes in the same group are all using the same version +of the provider with the exact same links. +I.e., you can\[cq]t have one node linking SHM+CXI while another links +SHM+RXM. +.TP +\f[I]Message Operations\f[R] +LNX is designed to intercept message operations such as fi_tsenddata and, +based on specific criteria, forward the operation to the appropriate +provider. +For the first release, LNX will only support linking the SHM provider for +intra-node traffic and another provider (e.g.\ CXI) for inter-node +traffic. +The LNX send operation looks at the destination and, based on whether the +destination is local or remote, selects the provider to forward +the operation to. +The receive case has been described earlier. +.TP +\f[I]Using the Provider\f[R] +In order to use the provider, the user needs to set the FI_LNX_PROV_LINKS +environment variable to the linked providers in the following format +shm+. +This will allow LNX to report back to the application in the +fi_getinfo() call the different links which can be selected. +Since there are multiple domains per provider, LNX reports a permutation +of all the possible links.
+For example, if there are two CXI interfaces on the machine, LNX will +report back shm+cxi0 and shm+cxi1. +The application can then select based on its own criteria the link it +wishes to use. +The application typically uses the PCI information in the fi_info +structure to select the interface to use. +A common selection criterion is the interface nearest the core the +process is bound to. +In order to make this determination, the application requires the PCI +information about the interface. +For this reason LNX forwards the PCI information for the inter-node +provider in the link to the application. +.SH LIMITATIONS AND FUTURE WORK +.TP +\f[I]Hardware Support\f[R] +LNX doesn\[cq]t support hardware offload, e.g.\ hardware tag matching. +This is an inherent limitation when using the peer infrastructure. +Due to the use of a shared receive queue which linked providers need to +query when a message is received, any hardware offload which requires +sending the receive buffers to the hardware directly will not work with +the shared receive queue. +The shared receive queue provides two advantages: 1) reduced memory +usage, 2) coordinated receive operations. +For #2 this is needed when receiving from FI_ADDR_UNSPEC. +In this case, both providers which are part of the link can race to gain +access to the receive buffer. +It is a future effort to determine a way to use hardware tag matching +and other hardware offload capabilities with LNX. +.TP +\f[I]Limited Linking\f[R] +This release of the provider supports linking the SHM provider for +intra-node operations and another provider which supports the FI_PEER +capability for inter-node operations. +It is a future effort to expand LNX to link arbitrary sets of providers. +.TP +\f[I]Memory Registration\f[R] +As part of the memory registration operation, varying hardware can +perform hardware-specific steps such as memory pinning. +Because memory registration APIs do not specify the source +or destination addresses, it is not possible for LNX to determine which +provider to forward the memory registration to. +LNX, therefore, registers the memory with all linked providers. +This might not be efficient and might have unforeseen side effects. +A better method is needed to support memory registration. +One option is to have a memory registration cache in LNX to avoid +expensive operations. +.TP +\f[I]Operation Types\f[R] +This release of LNX supports tagged and RMA operations only. +Future releases will expand the support to other operation types. +.TP +\f[I]Multi-Rail\f[R] +Future design effort is being planned to support utilizing multiple +interfaces for traffic simultaneously. +This can be over homogeneous interfaces or over heterogeneous +interfaces. +.SH RUNTIME PARAMETERS +.PP +The \f[I]LNX\f[R] provider checks for the following environment +variables: +.TP +\f[I]FI_LNX_PROV_LINKS\f[R] +This environment variable is used to specify which providers to link. +This must be set in order for the LNX provider to return a list of +fi_info blocks in the fi_getinfo() call. +The format which must be used is: ++\&... +As mentioned earlier, LNX currently supports linking only two providers, +the first of which is SHM, followed by one other provider for inter-node +operations. +.TP +\f[I]FI_LNX_DISABLE_SHM\f[R] +By default, this environment variable is set to 0. +However, the user can set it to 1, in which case the SHM provider will not +be used. +This can be useful for debugging and performance analysis.
+The SHM provider will naturally be used for all intra-node operations. +Therefore, to test SHM in isolation with LNX, the processes can be +limited to the same node only. +.TP +\f[I]FI_LNX_USE_SRQ\f[R] +Shared Receive Queues are an integral part of the peer infrastructure, but +they have the limitation of not using hardware offload, such as tag +matching. +SRQ is needed to support the FI_ADDR_UNSPEC case. +If the application is sure this will never be the case, then it can turn +off SRQ support by setting this environment variable to 0. +It is 1 by default. +.SH SEE ALSO +.PP +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +.SH AUTHORS +OpenFabrics. diff --git a/man/man7/fi_lpp.7 b/man/man7/fi_lpp.7 index 9a7007308d3..1d09963ca44 100644 --- a/man/man7/fi_lpp.7 +++ b/man/man7/fi_lpp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_lpp" "7" "2024\-08\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_lpp" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -74,6 +88,6 @@ Use the memcpy implementation in the system libc rather than provider-specific memcpy. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_mrail.7 b/man/man7/fi_mrail.7 index 97e3b44caee..8af1a0f16a7 100644 --- a/man/man7/fi_mrail.7 +++ b/man/man7/fi_mrail.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_mrail" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_mrail" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -83,18 +97,18 @@ Deprecated. Replaced by \f[I]FI_OFI_MRAIL_ADDR\f[R]. .TP \f[I]FI_OFI_MRAIL_CONFIG\f[R] -Comma separated list of \f[C]:\f[R] pairs, sorted in -ascending order of \f[C]\f[R]. +Comma separated list of \f[V]:\f[R] pairs, sorted in +ascending order of \f[V]\f[R]. Each pair indicates the rail sharing policy to be used for messages up -to the size \f[C]\f[R] and not covered by all previous pairs. -The value of \f[C]\f[R] can be \f[I]fixed\f[R] (a fixed rail is +to the size \f[V]\f[R] and not covered by all previous pairs. +The value of \f[V]\f[R] can be \f[I]fixed\f[R] (a fixed rail is used), \f[I]round-robin\f[R] (one rail per message, selected in round-robin fashion), or \f[I]striping\f[R] (striping across all the rails). -The default configuration is \f[C]16384:fixed,ULONG_MAX:striping\f[R]. +The default configuration is \f[V]16384:fixed,ULONG_MAX:striping\f[R]. The value ULONG_MAX can be input as -1. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics.
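Pulling the fi_lnx(7) pieces above together, here is a hedged sketch of selecting an LNX link at runtime. The "shm+cxi" link value, the "lnx" provider name, and the API version are illustrative assumptions:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>

int main(void)
{
    /* Equivalent to exporting FI_LNX_PROV_LINKS before launch. */
    setenv("FI_LNX_PROV_LINKS", "shm+cxi", 1);

    struct fi_info *hints = fi_allocinfo(), *info = NULL;
    if (!hints)
        return 1;
    hints->fabric_attr->prov_name = strdup("lnx");   /* assumed name */

    if (fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info) == 0) {
        /* One fi_info per link permutation, e.g. shm+cxi0, shm+cxi1;
         * a real application would also inspect the PCI info here. */
        for (struct fi_info *cur = info; cur; cur = cur->next)
            printf("link: %s\n", cur->domain_attr->name);
        fi_freeinfo(info);
    }
    fi_freeinfo(hints);
    return 0;
}
```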
diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index df7e5966147..e77be3efd17 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_opx" "7" "2024\-07\-23" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_opx" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .PP {%include JB/setup %} @@ -149,6 +163,22 @@ inclusive. .PP Default setting is 64. .TP +\f[I]FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS\f[R] +Integer. +This setting controls how many PING requests the reliability/replay +function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX +in situations with less contending outgoing traffic from the HFI. +Default setting is 128. +Range of valid values is 1-65535. +.TP +\f[I]FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS\f[R] +Integer. +This setting controls how many PING requests the reliability/replay +function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX +in situations with more contending, outgoing traffic from the HFI. +Default setting is 4. +Range of valid values is 1-65535. +.TP \f[I]FI_OPX_SELINUX\f[R] Boolean (0/1, on/off, true/false, yes/no). Set to true if you\[cq]re running a security-enhanced Linux. @@ -158,35 +188,35 @@ Defaults to \[lq]No\[rq] \f[I]FI_OPX_HFI_SELECT\f[R] String. Controls how OPX chooses which HFI to use when opening a context. -Has two forms: - \f[C]\f[R] Force OPX provider to use -\f[C]hfi-unit\f[R]. -- \f[C][,[,...,]]\f[R] Select HFI based -on first matching \f[C]selector\f[R] +Has two forms: - \f[V]\f[R] Force OPX provider to use +\f[V]hfi-unit\f[R]. +- \f[V][,[,...,]]\f[R] Select HFI based +on first matching \f[V]selector\f[R] .PP -Where \f[C]selector\f[R] is one of the following forms: - -\f[C]default\f[R] to use the default logic - \f[C]fixed:\f[R] -to fix to one \f[C]hfi-unit\f[R] - -\f[C]::\f[R] +Where \f[V]selector\f[R] is one of the following forms: - +\f[V]default\f[R] to use the default logic - \f[V]fixed:\f[R] +to fix to one \f[V]hfi-unit\f[R] - +\f[V]::\f[R] .PP -The above fields have the following meaning: - \f[C]selector-type\f[R] +The above fields have the following meaning: - \f[V]selector-type\f[R] The selector criteria the caller opening the context is evaluated against. -- \f[C]hfi-unit\f[R] The HFI to use if the caller matches the selector. -- \f[C]selector-data\f[R] Data the caller must match (e.g.\ NUMA node +- \f[V]hfi-unit\f[R] The HFI to use if the caller matches the selector. +- \f[V]selector-data\f[R] Data the caller must match (e.g.\ NUMA node ID). .PP -Where \f[C]selector-type\f[R] is one of the following: - \f[C]numa\f[R] +Where \f[V]selector-type\f[R] is one of the following: - \f[V]numa\f[R] True when caller is local to the NUMA node ID given by -\f[C]selector-data\f[R]. -- \f[C]core\f[R] True when caller is local to the CPU core given by -\f[C]selector-data\f[R]. +\f[V]selector-data\f[R]. +- \f[V]core\f[R] True when caller is local to the CPU core given by +\f[V]selector-data\f[R]. 
.PP -And \f[C]selector-data\f[R] is one of the following: - \f[C]value\f[R] -The specific value to match - \f[C]-\f[R] +And \f[V]selector-data\f[R] is one of the following: - \f[V]value\f[R] +The specific value to match - \f[V]-\f[R] Matches with any value in that range .PP In the second form, when opening a context, OPX uses the -\f[C]hfi-unit\f[R] of the first-matching selector. +\f[V]hfi-unit\f[R] of the first-matching selector. Selectors are evaluated left-to-right. OPX will return an error if the caller does not match any selector. .PP @@ -202,27 +232,27 @@ For the second form, as which HFI is selected depends on properties of the caller, deterministic HFI selection requires deterministic caller properties. E.g. -for the \f[C]numa\f[R] selector, if the caller can migrate between NUMA +for the \f[V]numa\f[R] selector, if the caller can migrate between NUMA domains, then HFI selection will not be deterministic. .PP The logic used will always be the first valid in a selector list. -For example, \f[C]default\f[R] and \f[C]fixed\f[R] will match all +For example, \f[V]default\f[R] and \f[V]fixed\f[R] will match all callers, so if either is at the beginning of a selector list, you will -only use \f[C]fixed\f[R] or \f[C]default\f[R] regardles of if there are +only use \f[V]fixed\f[R] or \f[V]default\f[R] regardless of whether there are any more selectors. .PP -Examples: - \f[C]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts +Examples: - \f[V]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts on HFI 0. -- \f[C]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. -- \f[C]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] +- \f[V]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. +- \f[V]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] callers local to NUMA nodes 0 and 2 will use HFI 0, callers local to NUMA domains 1 and 3 will use HFI 1. -- \f[C]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA +- \f[V]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA nodes 0 thru 3 (including 0 and 3) will use HFI 0, and all else will use default selection logic. -- \f[C]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core +- \f[V]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core 0 will use HFI 1, and all others will use HFI 0. -- \f[C]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use +- \f[V]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use default HFI selection logic. .TP \f[I]FI_OPX_DELIVERY_COMPLETION_THRESHOLD\f[R] @@ -252,6 +282,14 @@ using PIO. Value must be between 64 and 2147483646. Defaults to 16385. .TP +\f[I]FI_OPX_TID_MIN_PAYLOAD_BYTES\f[R] +Integer. +The minimum length in bytes where TID (Expected Receive) will be used. +For messages smaller than this threshold, the send will be completed +using Eager Receive. +Value must be between 4096 and 2147483646. +Defaults to 4096. +.TP \f[I]FI_OPX_RZV_MIN_PAYLOAD_BYTES\f[R] Integer. The minimum length in bytes where rendezvous will be used. @@ -274,9 +312,9 @@ This feature is not currently supported. \f[I]FI_OPX_PROG_AFFINITY\f[R] String. This sets the affinity to be used for any progress threads. -Set as a colon-separated triplet as \f[C]start:end:stride\f[R], where +Set as a colon-separated triplet as \f[V]start:end:stride\f[R], where stride controls the interval between selected cores.
-For example, \f[C]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores +For example, \f[V]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores for progress threads. By default no affinity is set. .TP @@ -312,8 +350,14 @@ registered copy when receiving data into GPU. The default threshold is 8192. This has no meaning if Libfabric was not configured with GDRCopy or ROCR support. +.TP +\f[I]FI_OPX_MIXED_NETWORK\f[R] +Integer. +Indicates that the network is a mix of OPA100 and CN5000. +Needs to be set to 1 in case of mixed network. +Default is 0. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_provider.7 b/man/man7/fi_provider.7 index fb5f2541d83..48547f0479d 100644 --- a/man/man7/fi_provider.7 +++ b/man/man7/fi_provider.7 @@ -1,13 +1,27 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_provider" "7" "2024\-03\-18" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_provider" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP fi_provider - Fabric Interface Providers .SH OVERVIEW .PP -See \f[C]fi_arch\f[R](7) for a brief description of how providers fit +See \f[V]fi_arch\f[R](7) for a brief description of how providers fit into the libfabric architecture. .PP Conceptually, a fabric provider implements and maps the libfabric API @@ -74,52 +88,56 @@ This list is not exhaustive. .TP \f[I]CXI\f[R] Provider for Cray\[cq]s Slingshot network. -See \f[C]fi_cxi\f[R](7) for more information. +See \f[V]fi_cxi\f[R](7) for more information. .TP \f[I]EFA\f[R] A provider for the Amazon EC2 Elastic Fabric Adapter (EFA) (https://aws.amazon.com/hpc/efa/), a custom-built OS bypass hardware interface for inter-instance communication on EC2. -See \f[C]fi_efa\f[R](7) for more information. +See \f[V]fi_efa\f[R](7) for more information. +.TP +\f[I]LPP\f[R] +A provider runs on FabreX PCIe networks. +See \f[V]fi_lpp\f[R](7) for more information. .TP \f[I]OPX\f[R] Supports Omni-Path networking from Cornelis Networks. -See \f[C]fi_opx\f[R](7) for more information. +See \f[V]fi_opx\f[R](7) for more information. .TP \f[I]PSM2\f[R] Older provider for Omni-Path networks. -See \f[C]fi_psm2\f[R](7) for more information. +See \f[V]fi_psm2\f[R](7) for more information. .TP \f[I]PSM3\f[R] Provider for Ethernet networking from Intel. -See \f[C]fi_psm3\f[R](7) for more information. +See \f[V]fi_psm3\f[R](7) for more information. .TP \f[I]SHM\f[R] A provider for intra-node communication using shared memory. -See \f[C]fi_shm\f[R](7) for more information. +See \f[V]fi_shm\f[R](7) for more information. .TP \f[I]TCP\f[R] A provider which runs over the TCP/IP protocol and is available on multiple operating systems. This provider enables develop of libfabric applications on most platforms. -See \f[C]fi_tcp\f[R](7) for more information. +See \f[V]fi_tcp\f[R](7) for more information. .TP \f[I]UCX\f[R] A provider which runs over the UCX library which is currently supported by Infiniband fabrics from NVIDIA. -See \f[C]fi_ucx\f[R](7) for more information. +See \f[V]fi_ucx\f[R](7) for more information. 
.TP \f[I]UDP\f[R] A provider which runs over the UDP/IP protocol and is available on multiple operating systems. This provider enables development of libfabric applications on most platforms. -See \f[C]fi_udp\f[R](7) for more information. +See \f[V]fi_udp\f[R](7) for more information. .TP \f[I]Verbs\f[R] This provider targets RDMA NICs for both Linux and Windows platforms. -See \f[C]fi_verbs\f[R](7) for more information. +See \f[V]fi_verbs\f[R](7) for more information. .SH Utility Providers .PP Utility providers are named with a starting prefix of \[lq]ofi_\[rq]. @@ -132,17 +150,17 @@ simpler endpoint type. .PP Utility providers show up as part of the returned provider\[cq]s name. -See \f[C]fi_fabric\f[R](3). +See \f[V]fi_fabric\f[R](3). Utility providers are enabled automatically for core providers that do not support the feature set requested by an application. .TP \f[I]RxM\f[R] Implements RDM endpoint semantics over MSG endpoints. -See \f[C]fi_rxm\f[R](7) for more information. +See \f[V]fi_rxm\f[R](7) for more information. .TP \f[I]RxD\f[R] Implements RDM endpoint semantics over DGRAM endpoints. -See \f[C]fi_rxd\f[R](7) for more information. +See \f[V]fi_rxd\f[R](7) for more information. .SH Hooking Providers .PP Hooking providers are mostly used for debugging purposes. @@ -153,7 +171,7 @@ Hooking providers can layer over all other providers and intercept, or hook, their calls in order to perform some dedicated task, such as gathering performance data on call paths or providing debug output. .PP -See \f[C]fi_hook\f[R](7) for more information. +See \f[V]fi_hook\f[R](7) for more information. .SH Offload Providers .PP Offload providers start with the naming prefix \[lq]off_\[rq]. @@ -163,8 +181,22 @@ An offload provider is intended to accelerate specific types of communication, generally by taking advantage of network services that have been offloaded into hardware, though actual hardware offload support is not a requirement. +.SH LINKx (LNX) provider (Technology Preview) +.PP +The LNX provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. +This provider uses the libfabric peer infrastructure to aid in the use +of the underlying providers. +This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. +Future releases of the provider will allow linking any number of +providers and provide the users with the ability to influence the way +the providers are utilized for traffic load. +.PP +See \f[V]fi_lnx\f[R](7) for more information. .SH SEE ALSO .PP -\f[C]fabric\f[R](7) \f[C]fi_provider\f[R](3) +\f[V]fabric\f[R](7) \f[V]fi_provider\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm2.7 b/man/man7/fi_psm2.7 index 38e6fef9084..95009032ff2 100644 --- a/man/man7/fi_psm2.7 +++ b/man/man7/fi_psm2.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_psm2" "7" "2023\-06\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_psm2" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -214,11 +228,11 @@ See \f[I]FI_PSM2_PROG_AFFINITY\f[R].
When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[C][:[:]][,[:[:]]]*\f[R], -where each triplet \f[C]::\f[R] defines a block of +\f[V][:[:]][,[:[:]]]*\f[R], +where each triplet \f[V]::\f[R] defines a block of core_ids. -Both \f[C]\f[R] and \f[C]\f[R] can be either the -\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). +Both \f[V]\f[R] and \f[V]\f[R] can be either the +\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. .TP @@ -324,6 +338,6 @@ Valid parameter names are defined in the header file \f[I]rdma/fi_ext_psm2.h\f[R]. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm3\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm3\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm3.7 b/man/man7/fi_psm3.7 index e66e84677a9..172eabb1aa8 100644 --- a/man/man7/fi_psm3.7 +++ b/man/man7/fi_psm3.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_psm3" "7" "2023\-06\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_psm3" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -11,7 +25,7 @@ The \f[I]psm3\f[R] provider implements a Performance Scaled Messaging capability which supports most verbs UD and sockets devices. Additional features and optimizations can be enabled when running over Intel\[cq]s E810 Ethernet NICs and/or using Intel\[cq]s rendezvous -kernel module (\f[C]rv\f[R]). +kernel module (\f[V]rv\f[R]). PSM 3.x fully integrates the OFI provider and the underlying PSM3 protocols/implementation and only exports the OFI APIs. .SH SUPPORTED FEATURES @@ -209,11 +223,11 @@ See \f[I]FI_PSM3_PROG_AFFINITY\f[R]. When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[C][:[:]][,[:[:]]]*\f[R], -where each triplet \f[C]::\f[R] defines a block of +\f[V][:[:]][,[:[:]]]*\f[R], +where each triplet \f[V]::\f[R] defines a block of core_ids. -Both \f[C]\f[R] and \f[C]\f[R] can be either the -\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). +Both \f[V]\f[R] and \f[V]\f[R] can be either the +\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. .TP @@ -304,6 +318,6 @@ Notice that if the provider is compiled with macro runtime option will be disabled. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm2\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm2\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxd.7 b/man/man7/fi_rxd.7 index 7590676f118..11a2ac85f18 100644 --- a/man/man7/fi_rxd.7 +++ b/man/man7/fi_rxd.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rxd" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_rxd" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -61,6 +75,6 @@ Maximum number of packets (per peer) to send at a time. Default: 128 .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxm.7 b/man/man7/fi_rxm.7 index 037fc74d926..294f43eba2c 100644 --- a/man/man7/fi_rxm.7 +++ b/man/man7/fi_rxm.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rxm" "7" "2024\-03\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rxm" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -212,7 +226,7 @@ to only required values. .PP The data transfer API may return -FI_EAGAIN during on-demand connection setup of the core provider FI_MSG_EP. -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH Troubleshooting / Known issues .PP @@ -229,6 +243,6 @@ The workaround is to use shared receive contexts for the MSG provider (FI_OFI_RXM_MSG_TX_SIZE / FI_OFI_RXM_MSG_RX_SIZE). .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_setup.7 b/man/man7/fi_setup.7 index ebb88bddee3..1c7e42dfd44 100644 --- a/man/man7/fi_setup.7 +++ b/man/man7/fi_setup.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_setup" "7" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_setup" "7" "2024\-12\-31" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -159,9 +173,9 @@ fail the fi_getinfo request. A provider may optionally report non-requested secondary capabilities if doing so would not compromise performance or security. That is, a provider may grant an application a secondary capability, -whether the application. +regardless of whether the application requested it. The most commonly accessed secondary capability bits indicate if -provider communication is restricted to the local node Ifor example, the +provider communication is restricted to the local node (for example, the shared memory provider only supports local communication) and/or remote nodes (which can be the case for NICs that lack loopback support). Other secondary capability bits mostly deal with features targeting @@ -459,8 +473,9 @@ libfabric defines a unique threading model. The libfabric design is heavily influenced by object-oriented programming concepts. A multi-threaded application must determine how libfabric objects -(domains, endpoints, completion queues, etc.) will be allocated among -its threads, or if any thread can access any object. 
+(domains, endpoints, completion queues, etc.) +will be allocated among its threads, or if any thread can access any +object. For example, an application may spawn a new thread to handle each new connected endpoint. The domain threading field provides a mechanism for an application to diff --git a/man/man7/fi_shm.7 b/man/man7/fi_shm.7 index 6353533c667..f28656601d4 100644 --- a/man/man7/fi_shm.7 +++ b/man/man7/fi_shm.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_shm" "7" "2023\-08\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_shm" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -89,7 +103,7 @@ was provided by the application), no supplemental information is required to make it unique and it will remain with only the application-defined address. Note that the actual endpoint name will not include the FI_ADDR_STR -\[dq]*://\[dq] prefix since it cannot be included in any shared memory +\[lq]*://\[rq] prefix since it cannot be included in any shared memory region names. The provider will strip off the prefix before setting the endpoint name. As a result, the addresses \[lq]fi_prefix1://my_node:my_service\[rq] and @@ -204,6 +218,6 @@ different systems. Default 262144 .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_sockets.7 b/man/man7/fi_sockets.7 index 4f48b3ea613..133fdaedb65 100644 --- a/man/man7/fi_sockets.7 +++ b/man/man7/fi_sockets.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_sockets" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_sockets" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -128,6 +142,6 @@ The recommended parameters for large scale runs are \f[I]FI_SOCKETS_DEF_CQ_SZ\f[R], \f[I]FI_SOCKETS_DEF_EQ_SZ\f[R]. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_tcp.7 b/man/man7/fi_tcp.7 index bf3bfe4d8fb..b25c0958206 100644 --- a/man/man7/fi_tcp.7 +++ b/man/man7/fi_tcp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_tcp" "7" "2023\-03\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_tcp" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -114,6 +128,6 @@ from the tcp provider. This will provide the best performance. 
.SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_ucx.7 b/man/man7/fi_ucx.7 index c3b3e48c868..440a76b429a 100644 --- a/man/man7/fi_ucx.7 +++ b/man/man7/fi_ucx.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_ucx" "7" "2023\-02\-24" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_ucx" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -52,6 +66,6 @@ any). Check request leak (default: disabled). .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_udp.7 b/man/man7/fi_udp.7 index ea65939d131..a020f7a54c3 100644 --- a/man/man7/fi_udp.7 +++ b/man/man7/fi_udp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_udp" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_udp" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -51,6 +65,6 @@ No support for counters. No runtime parameters are currently defined. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_usnic.7 b/man/man7/fi_usnic.7 index cf03f28a0f7..e9104cfd0bd 100644 --- a/man/man7/fi_usnic.7 +++ b/man/man7/fi_usnic.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_usnic" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_usnic" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -29,7 +43,7 @@ installing libnl from RPM or other packaging system, install the If you have libnl (either v1 or v3) installed in a non-standard location (e.g., not in /usr/lib or /usr/lib64), you may need to tell libfabric\[cq]s configure where to find libnl via the -\f[C]--with-libnl=DIR\f[R] command line option (where DIR is the +\f[V]--with-libnl=DIR\f[R] command line option (where DIR is the installation prefix of the libnl package). .RE .IP \[bu] 2 @@ -56,7 +70,7 @@ In particular, there are known bugs in RDM support in the presence of congestion or packet loss (issue 1621). RMA is not yet supported. .IP \[bu] 2 -\f[C]fi_provider\f[R](7) lists requirements for all providers. +\f[V]fi_provider\f[R](7) lists requirements for all providers. 
The following limitations exist in the \f[I]usnic\f[R] provider: .RS 2 .IP \[bu] 2 @@ -69,13 +83,13 @@ CM operations. Passive endpoints only support listen, setname, and getname CM operations. .IP \[bu] 2 -\f[I]FI_EP_DGRAM\f[R] endpoints support \f[C]fi_sendmsg()\f[R] and -\f[C]fi_recvmsg()\f[R], but some flags are ignored. -\f[C]fi_sendmsg()\f[R] supports \f[C]FI_INJECT\f[R] and -\f[C]FI_COMPLETION\f[R]. -\f[C]fi_recvmsg()\f[R] supports \f[C]FI_MORE\f[R]. +\f[I]FI_EP_DGRAM\f[R] endpoints support \f[V]fi_sendmsg()\f[R] and +\f[V]fi_recvmsg()\f[R], but some flags are ignored. +\f[V]fi_sendmsg()\f[R] supports \f[V]FI_INJECT\f[R] and +\f[V]FI_COMPLETION\f[R]. +\f[V]fi_recvmsg()\f[R] supports \f[V]FI_MORE\f[R]. .IP \[bu] 2 -Address vectors only support \f[C]FI_AV_MAP\f[R]. +Address vectors only support \f[V]FI_AV_MAP\f[R]. .IP \[bu] 2 No counters are supported. .IP \[bu] 2 @@ -119,19 +133,19 @@ file. Version 2 of the \[lq]fabric getinfo\[rq] extension was introduced in Libfabric release v1.3.0 and can be used to retrieve IP and SR-IOV information about a usNIC device obtained from the -\f[C]fi_getinfo\f[R](3) function. +\f[V]fi_getinfo\f[R](3) function. .PP The \[lq]fabric getinfo\[rq] extension is obtained by calling -\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_FABRIC_OPS_1\f[R] to +\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_FABRIC_OPS_1\f[R] to get the usNIC fabric extension operations. -The \f[C]getinfo\f[R] function accepts a version parameter that can be +The \f[V]getinfo\f[R] function accepts a version parameter that can be used to select different versions of the extension. The information returned by the \[lq]fabric getinfo\[rq] extension is -accessible through a \f[C]fi_usnic_info\f[R] struct that uses a version +accessible through a \f[V]fi_usnic_info\f[R] struct that uses a version tagged union. The accessed union member must correspond with the requested version. It is recommended that applications explicitly request a version rather -than using the header provided \f[C]FI_EXT_USNIC_INFO_VERSION\f[R]. +than using the header provided \f[V]FI_EXT_USNIC_INFO_VERSION\f[R]. Although there is a version 1 of the extension, its use is discouraged, and it may not be available in future releases. .SS Compatibility issues @@ -244,8 +258,8 @@ struct fi_usnic_info_v1 { .fi .PP Version 1 of the \[lq]fabric getinfo\[rq] extension can be used by -explicitly requesting it in the call to \f[C]getinfo\f[R] and accessing -the \f[C]v1\f[R] portion of the \f[C]fi_usnic_info.ui\f[R] union. +explicitly requesting it in the call to \f[V]getinfo\f[R] and accessing +the \f[V]v1\f[R] portion of the \f[V]fi_usnic_info.ui\f[R] union. Use of version 1 is not recommended and it may be removed from future releases. .PP @@ -327,7 +341,7 @@ Libfabric release v1.0.0 and can be used to retrieve the network distance of an address. .PP The \[lq]get_distance\[rq] extension is obtained by calling -\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_AV_OPS_1\f[R] to get +\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_AV_OPS_1\f[R] to get the usNIC address vector extension operations. .IP .nf @@ -343,9 +357,9 @@ Address vector Destination address .TP \f[I]metric\f[R] -On output this will contain \f[C]-1\f[R] if the destination host is -unreachable, \f[C]0\f[R] is the destination host is locally connected, -and \f[C]1\f[R] otherwise. +On output this will contain \f[V]-1\f[R] if the destination host is +unreachable, \f[V]0\f[R] if the destination host is locally connected, +and \f[V]1\f[R] otherwise.
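Tying the get_distance pieces above together, a sketch of the application-side call, assuming the fi_usnic_ops_av layout documented for fi_ext_usnic.h; the helper name and its sockaddr argument are illustrative:

#include <netinet/in.h>
#include <rdma/fabric.h>
#include <rdma/fi_ext_usnic.h>

/* Query the network distance to "dest" through the usNIC AV extension.
 * "av" must be an address vector opened on a usNIC domain. */
static int query_distance(struct fid_av *av, struct sockaddr_in *dest,
			  int *metric)
{
	struct fi_usnic_ops_av *av_ops;
	int ret;

	ret = fi_open_ops(&av->fid, FI_USNIC_AV_OPS_1, 0,
			  (void **)&av_ops, NULL);
	if (ret)
		return ret;

	/* On success *metric is -1 (unreachable), 0 (locally
	 * connected), or 1 (otherwise), as described above. */
	return av_ops->get_distance(av, dest, metric);
}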
.PP See fi_ext_usnic.h for more details. .SH VERSION DIFFERENCES @@ -355,28 +369,28 @@ The release of libfabric v1.4 introduced a new naming convention for fabric and domain. However the usNIC provider remains backward compatible with applications supporting the old scheme and decides which one to use based on the -version passed to \f[C]fi_getinfo\f[R]: +version passed to \f[V]fi_getinfo\f[R]: .IP \[bu] 2 -When \f[C]FI_VERSION(1,4)\f[R] or higher is used: +When \f[V]FI_VERSION(1,4)\f[R] or higher is used: .RS 2 .IP \[bu] 2 fabric name is the network address with the CIDR notation (i.e., -\f[C]a.b.c.d/e\f[R]) +\f[V]a.b.c.d/e\f[R]) .IP \[bu] 2 -domain name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) +domain name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) .RE .IP \[bu] 2 -When a lower version number is used, like \f[C]FI_VERSION(1, 3)\f[R], it +When a lower version number is used, like \f[V]FI_VERSION(1, 3)\f[R], it follows the same behavior the usNIC provider exhibited in libfabric <= v1.3: .RS 2 .IP \[bu] 2 -fabric name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) +fabric name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) .IP \[bu] 2 -domain name is \f[C]NULL\f[R] +domain name is \f[V]NULL\f[R] .RE .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_open_ops\f[R](3), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_verbs.7 b/man/man7/fi_verbs.7 index a4b8653ea0d..1a5e3f11794 100644 --- a/man/man7/fi_verbs.7 +++ b/man/man7/fi_verbs.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_verbs" "7" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_verbs" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -147,7 +161,7 @@ to be re-mapped when the process is forked (MADV_DONTFORK). .PP The XRC transport is intended to be used when layered with the RXM provider and requires the use of shared receive contexts. -See \f[C]fi_rxm\f[R](7). +See \f[V]fi_rxm\f[R](7). To enable XRC, the following environment variables must usually be set: FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX. .SH RUNTIME PARAMETERS @@ -280,6 +294,6 @@ post excess receives without draining the CQ. CQ overruns can make the MSG endpoints unusable. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. 
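The XRC note in the fi_verbs hunk above is driven entirely by environment variables. A hedged sketch of opting in from application code, assuming the two variables named in the text are honored as booleans and that this runs before the first call into libfabric:

#include <stdlib.h>

/* Prefer the verbs XRC transport and enable RXM shared receive
 * contexts; both must be set before libfabric reads its environment. */
static void enable_verbs_xrc(void)
{
	setenv("FI_VERBS_PREFER_XRC", "1", 1);
	setenv("FI_OFI_RXM_USE_SRX", "1", 1);
}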
diff --git a/prov/cxi/Makefile.include b/prov/cxi/Makefile.include index b529f942ce7..9faa7874d8f 100644 --- a/prov/cxi/Makefile.include +++ b/prov/cxi/Makefile.include @@ -129,7 +129,8 @@ nodist_prov_cxi_test_cxitest_SOURCES = \ prov/cxi/test/auth_key.c \ prov/cxi/test/fork.c \ prov/cxi/test/mem_reg.c \ - prov/cxi/test/nic.c + prov/cxi/test/nic.c \ + prov/cxi/test/mr_cache.c prov_cxi_test_cxitest_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) diff --git a/prov/cxi/configure.m4 b/prov/cxi/configure.m4 index b8b53d9fdb3..cab843f47a2 100644 --- a/prov/cxi/configure.m4 +++ b/prov/cxi/configure.m4 @@ -35,8 +35,10 @@ AC_DEFUN([FI_CXI_CONFIGURE],[ [CPPFLAGS="-I$with_cxi_uapi_headers/include $CPPFLAGS"]) # Support non-standard install path for curl. This is needed by CXI provider. + # Add #define of the path to the curl library for use in the code AC_ARG_WITH([curl], - [AS_HELP_STRING([--with-curl=DIR], [Install directory for curl])]) + [AS_HELP_STRING([--with-curl=DIR], [Install directory for curl])], + [AC_DEFINE_UNQUOTED([FI_CXI_CURL_LIB_PATH], ["$with_curl"], [Path to the curl install root])]) # Support non-standard install path for json-c. This is needed by CXI provider. AC_ARG_WITH([json-c], @@ -97,8 +99,8 @@ AC_DEFUN([FI_CXI_CONFIGURE],[ cxi_LIBS="$cxi_LIBS $libcurl_LIBS" # Add on json if installed in non-default location. - if test "$with_json" != "" && test "$with_json" != "no"; then - FI_CHECK_PREFIX_DIR([$with_json], [json]) + if test "$with_json_c" != "" && test "$with_json_c" != "no"; then + FI_CHECK_PREFIX_DIR([$with_json_c], [json]) else json_PREFIX="" json_LIBDIR="" diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index b959ff99eaa..70ef46a2a69 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -4,7 +4,7 @@ * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. - * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2018-2024 Hewlett Packard Enterprise Development LP */ #ifndef _CXIP_PROV_H_ @@ -120,8 +120,9 @@ #define CXIP_REQ_BUF_SIZE (2*1024*1024) #define CXIP_REQ_BUF_MIN_POSTED 4 #define CXIP_REQ_BUF_MAX_CACHED 0 -#define CXIP_UX_BUFFER_SIZE (CXIP_OFLOW_BUF_MIN_POSTED * \ - CXIP_OFLOW_BUF_SIZE) + +#define CXIP_MR_CACHE_EVENTS_DISABLE_POLL_NSECS 100000U +#define CXIP_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS 1000000000U /* When device memory is safe to access via load/store then the * CPU will be used to move data below this threshold. 
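The threshold comment closing this hunk is what the cxip_copy_to_md()/cxip_copy_from_md() changes later in this patch act on. Reduced to its core, the policy is the test below (an illustrative restatement with unpacked arguments, not patch code):

#include <stdbool.h>
#include <stddef.h>

/* A CPU load/store copy is used only when the buffer carries a valid
 * device-registration handle and the transfer is at or below the
 * configured threshold; larger transfers fall back to the generic
 * HMEM copy routines. */
static inline bool use_cpu_copy(bool handle_valid, size_t size,
				size_t safe_devmem_copy_threshold)
{
	return handle_valid && size <= safe_devmem_copy_threshold;
}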
@@ -131,25 +132,23 @@ #define CXIP_EP_PRI_CAPS \ (FI_RMA | FI_ATOMICS | FI_TAGGED | FI_RECV | FI_SEND | \ FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE | \ - FI_DIRECTED_RECV | FI_MSG | FI_NAMED_RX_CTX | \ - FI_COLLECTIVE | FI_HMEM) + FI_DIRECTED_RECV | FI_MSG | FI_NAMED_RX_CTX | FI_HMEM | \ + FI_COLLECTIVE) #define CXIP_EP_SEC_CAPS \ (FI_SOURCE | FI_SOURCE_ERR | FI_LOCAL_COMM | \ FI_REMOTE_COMM | FI_RMA_EVENT | FI_MULTI_RECV | FI_FENCE | FI_TRIGGER) #define CXIP_EP_CAPS (CXIP_EP_PRI_CAPS | CXIP_EP_SEC_CAPS) -#define CXIP_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) +#define CXIP_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID | FI_PEER) #define CXIP_CAPS (CXIP_DOM_CAPS | CXIP_EP_CAPS) #define CXIP_MSG_ORDER (FI_ORDER_SAS | \ FI_ORDER_WAW | \ FI_ORDER_RMA_WAW | \ + FI_ORDER_RMA_RAR | \ FI_ORDER_ATOMIC_WAW | \ - FI_ORDER_ATOMIC_WAR | \ - FI_ORDER_ATOMIC_RAW | \ FI_ORDER_ATOMIC_RAR) #define CXIP_EP_CQ_FLAGS \ - (FI_SEND | FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION | \ - FI_COLLECTIVE) + (FI_SEND | FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION) #define CXIP_EP_CNTR_FLAGS \ (FI_SEND | FI_RECV | FI_READ | FI_WRITE | FI_REMOTE_READ | \ FI_REMOTE_WRITE) @@ -177,7 +176,7 @@ #define CXIP_MINOR_VERSION 1 #define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \ CXIP_MINOR_VERSION) -#define CXIP_FI_VERSION FI_VERSION(1, 21) +#define CXIP_FI_VERSION FI_VERSION(2, 0) #define CXIP_WIRE_PROTO_VERSION 1 #define CXIP_COLL_MAX_CONCUR 8 @@ -185,19 +184,24 @@ #define CXIP_COLL_MIN_RX_SIZE 4096 #define CXIP_COLL_MIN_MULTI_RECV 64 #define CXIP_COLL_MAX_DATA_SIZE 32 -#define CXIP_COLL_MAX_SEQNO (1 << 10) +#define CXIP_COLL_MAX_SEQNO ((1 << 10) - 1) +#define CXIP_COLL_MOD_SEQNO (CXIP_COLL_MAX_SEQNO - 1) + // TODO adjust based on performance testing -#define CXIP_COLL_MIN_RETRY_USEC 1 -#define CXIP_COLL_MAX_RETRY_USEC 32000 -#define CXIP_COLL_MIN_TIMEOUT_USEC 1 -#define CXIP_COLL_MAX_TIMEOUT_USEC 32000 +#define CXIP_COLL_MIN_RETRY_USEC 1 +#define CXIP_COLL_MAX_RETRY_USEC 32000 +#define CXIP_COLL_MIN_TIMEOUT_USEC 1 +#define CXIP_COLL_MAX_TIMEOUT_USEC 32000 +#define CXIP_COLL_MIN_FM_TIMEOUT_MSEC 1 +#define CXIP_COLL_DFL_FM_TIMEOUT_MSEC 100 +#define CXIP_COLL_MAX_FM_TIMEOUT_MSEC 1000000 #define CXIP_REQ_BUF_HEADER_MAX_SIZE (sizeof(struct c_port_fab_hdr) + \ sizeof(struct c_port_unrestricted_hdr)) #define CXIP_REQ_BUF_HEADER_MIN_SIZE (sizeof(struct c_port_fab_hdr) + \ sizeof(struct c_port_small_msg_hdr)) -extern int s_page_size; +extern int sc_page_size; extern char cxip_prov_name[]; extern struct fi_provider cxip_prov; extern struct util_prov cxip_util_prov; @@ -240,6 +244,19 @@ enum cxip_rdzv_proto { const char *cxip_rdzv_proto_to_str(enum cxip_rdzv_proto proto); +enum cxip_mr_target_ordering { + /* Sets MR target ordering based on message and target RMA ordering + * options. + */ + MR_ORDER_DEFAULT, + + /* Force ordering to always be strict. */ + MR_ORDER_STRICT, + + /* Force ordering to always be relaxed. 
*/ + MR_ORDER_RELAXED, +}; + struct cxip_environment { /* Translation */ int odp; @@ -293,7 +310,6 @@ struct cxip_environment { size_t ctrl_rx_eq_max_size; char *device_name; size_t cq_fill_percent; - int enable_unrestricted_end_ro; int rget_tc; int cacheline_size; @@ -301,6 +317,7 @@ struct cxip_environment { char *coll_job_step_id; size_t coll_retry_usec; size_t coll_timeout_usec; + size_t coll_fm_timeout_msec; char *coll_fabric_mgr_url; char *coll_mcast_token; size_t hwcoll_addrs_per_job; @@ -316,6 +333,10 @@ struct cxip_environment { int enable_trig_op_limit; int hybrid_posted_recv_preemptive; int hybrid_unexpected_msg_preemptive; + size_t mr_cache_events_disable_poll_nsecs; + size_t mr_cache_events_disable_le_poll_nsecs; + int force_dev_reg_copy; + enum cxip_mr_target_ordering mr_target_ordering; }; extern struct cxip_environment cxip_env; @@ -713,7 +734,7 @@ struct cxip_lni { /* Software remapped communication profiles. */ struct dlist_entry remap_cps; - ofi_spin_t lock; + pthread_rwlock_t cp_lock; }; /* A portals table define a network endpoint address. The endpoint address is @@ -800,8 +821,10 @@ struct cxip_md { struct cxi_md *md; struct ofi_mr_info info; uint64_t handle; + int dmabuf_fd; bool handle_valid; bool cached; + bool dmabuf_fd_valid; }; #define CXIP_MR_DOMAIN_HT_BUCKETS 16 @@ -853,6 +876,9 @@ struct cxip_domain { ofi_spin_t lock; ofi_atomic32_t ref; + struct fid_ep rx_ep; + struct fid_peer_srx *owner_srx; + uint32_t tclass; struct cxip_eq *eq; //unused @@ -1052,7 +1078,7 @@ struct cxip_eq { }; #define CXIP_EQ_MAP_FLAGS \ - (CXI_MAP_WRITE | CXI_MAP_PIN | CXI_MAP_IOVA_ALLOC) + (CXI_MAP_WRITE | CXI_MAP_PIN) /* * RMA request @@ -1144,7 +1170,7 @@ struct cxip_req_recv { uint32_t rdzv_initiator; // Rendezvous initiator used for mrecvs uint32_t rget_nic; uint32_t rget_pid; - bool software_list; // Appended to HW or SW + int multirecv_inflight; // SW EP Multi-receives in progress bool canceled; // Request canceled? bool unlinked; bool multi_recv; @@ -1263,6 +1289,8 @@ struct cxip_req { uint64_t trig_thresh; struct cxip_cntr *trig_cntr; + struct fi_peer_rx_entry *rx_entry; + /* CQ event fields, set according to fi_cq.3 * - set by provider * - returned to user in completion event @@ -1394,8 +1422,8 @@ struct cxip_cq { */ struct ofi_genlock ep_list_lock; - /* Internal CXI wait object allocated only if required. */ - struct cxil_wait_obj *priv_wait; + /* CXI CQ wait object EPs are maintained in epoll FD */ + int ep_fd; /* CXI specific fields. */ struct cxip_domain *domain; @@ -1436,6 +1464,8 @@ struct cxip_cntr { struct cxip_ux_send { struct dlist_entry rxc_entry; struct cxip_req *req; + struct cxip_rxc *rxc; + struct fi_peer_rx_entry *rx_entry; union c_event put_ev; bool claimed; /* Reserved with FI_PEEK | FI_CLAIM */ }; @@ -1850,7 +1880,7 @@ struct cxip_rxc { struct cxip_evtq rx_evtq; struct cxip_pte *rx_pte; struct cxip_cmdq *rx_cmdq; - ofi_atomic32_t orx_reqs; + int orx_reqs; /* If FI_MULTI_RECV is supported, minimum receive size required * for buffers posted. @@ -1948,13 +1978,16 @@ struct cxip_rxc_rnr { }; static inline void cxip_copy_to_md(struct cxip_md *md, void *dest, - const void *src, size_t size) + const void *src, size_t size, + bool require_dev_reg_copy) { ssize_t ret __attribute__((unused)); struct iovec iov; + bool dev_reg_copy = require_dev_reg_copy || + (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold); - /* Favor CPU store access instead of relying on HMEM copy functions. 
*/ - if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + /* Favor dev reg access instead of relying on HMEM copy functions. */ + if (dev_reg_copy) { ret = ofi_hmem_dev_reg_copy_to_hmem(md->info.iface, md->handle, dest, src, size); assert(ret == FI_SUCCESS); @@ -1970,13 +2003,16 @@ static inline void cxip_copy_to_md(struct cxip_md *md, void *dest, } static inline void cxip_copy_from_md(struct cxip_md *md, void *dest, - const void *src, size_t size) + const void *src, size_t size, + bool require_dev_reg_copy) { ssize_t ret __attribute__((unused)); struct iovec iov; + bool dev_reg_copy = require_dev_reg_copy || + (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold); - /* Favor CPU store access instead of relying on HMEM copy functions. */ - if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + /* Favor dev reg access instead of relying on HMEM copy functions. */ + if (dev_reg_copy) { ret = ofi_hmem_dev_reg_copy_from_hmem(md->info.iface, md->handle, dest, src, size); @@ -2212,7 +2248,7 @@ struct cxip_txc { struct ofi_bufpool *ibuf_pool; struct cxip_cmdq *tx_cmdq; // added during cxip_txc_enable() - ofi_atomic32_t otx_reqs; // outstanding transmit requests + int otx_reqs; // outstanding transmit requests /* Queue of TX messages in flight for the context */ struct dlist_entry msg_queue; @@ -2370,6 +2406,8 @@ struct cxip_ep_obj { struct cxip_domain *domain; struct cxip_av *av; + struct fid_peer_srx *owner_srx; + /* Domain has been configured with FI_AV_AUTH_KEY. */ bool av_auth_key; @@ -2392,6 +2430,10 @@ struct cxip_ep_obj { struct cxip_txc *txc; struct cxip_rxc *rxc; + /* Internal support for CQ wait object */ + struct cxil_wait_obj *priv_wait; + int wait_fd; + /* ASIC version associated with EP/Domain */ enum cassini_version asic_ver; @@ -2421,6 +2463,9 @@ struct cxip_ep_obj { struct fi_tx_attr tx_attr; struct fi_rx_attr rx_attr; + /* Require memcpy's via the dev reg APIs. 
*/ + bool require_dev_reg_copy[OFI_HMEM_MAX]; + /* Collectives support */ struct cxip_ep_coll_obj coll; struct cxip_ep_zbcoll_obj zbcoll; @@ -2431,6 +2476,92 @@ struct cxip_ep_obj { struct cxip_portals_table *ptable; }; +int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len, + uint64_t flags, struct cxip_md **md); + +static inline void +cxip_ep_obj_copy_to_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + cxip_copy_to_md(md, dest, src, size, + ep->require_dev_reg_copy[md->info.iface]); +} + +static inline void +cxip_ep_obj_copy_from_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + cxip_copy_from_md(md, dest, src, size, + ep->require_dev_reg_copy[md->info.iface]); +} + +static inline bool cxip_ep_obj_mr_relaxed_order(struct cxip_ep_obj *ep) +{ + if (cxip_env.mr_target_ordering == MR_ORDER_STRICT) + return false; + + if (cxip_env.mr_target_ordering == MR_ORDER_RELAXED) + return true; + + if ((ep->rx_attr.msg_order & FI_ORDER_RMA_WAW) && + ep->ep_attr.max_order_waw_size != 0) + return false; + + if ((ep->rx_attr.msg_order & FI_ORDER_WAW) && + ep->ep_attr.max_order_waw_size != 0) + return false; + + return true; +} + +static inline void cxip_txc_otx_reqs_inc(struct cxip_txc *txc) +{ + assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); + txc->otx_reqs++; +} + +static inline void cxip_txc_otx_reqs_dec(struct cxip_txc *txc) +{ + assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); + txc->otx_reqs--; + assert(txc->otx_reqs >= 0); +} + +static inline int cxip_txc_otx_reqs_get(struct cxip_txc *txc) +{ + assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); + return txc->otx_reqs; +} + +static inline void cxip_txc_otx_reqs_init(struct cxip_txc *txc) +{ + txc->otx_reqs = 0; +} + +static inline void cxip_rxc_orx_reqs_inc(struct cxip_rxc *rxc) +{ + assert(ofi_genlock_held(&rxc->ep_obj->lock) == 1); + rxc->orx_reqs++; +} + +static inline void cxip_rxc_orx_reqs_dec(struct cxip_rxc *rxc) +{ + assert(ofi_genlock_held(&rxc->ep_obj->lock) == 1); + rxc->orx_reqs--; + assert(rxc->orx_reqs >= 0); +} + +static inline int cxip_rxc_orx_reqs_get(struct cxip_rxc *rxc) +{ + assert(ofi_genlock_held(&rxc->ep_obj->lock) == 1); + return rxc->orx_reqs; +} + +static inline void cxip_rxc_orx_reqs_init(struct cxip_rxc *rxc) +{ + rxc->orx_reqs = 0; +} + /* * CXI endpoint implementations to support FI_CLASS_EP. 
*/ @@ -2711,18 +2842,7 @@ enum cxip_coll_state { CXIP_COLL_STATE_FAULT, }; -/* Similar to C_RC_* provider errors, but pure libfabric */ -/* These should be in priority order, from lowest to highest */ -enum cxip_coll_prov_errno { - CXIP_PROV_ERRNO_OK = -1, // good - CXIP_PROV_ERRNO_PTE = -2, // PTE setup failure - CXIP_PROV_ERRNO_MCAST_INUSE = -3, // multicast in-use - CXIP_PROV_ERRNO_HWROOT_INUSE = -4, // hwroot in-use - CXIP_PROV_ERRNO_MCAST_INVALID = -5, // multicast invalid - CXIP_PROV_ERRNO_HWROOT_INVALID = -6, // hwroot invalid - CXIP_PROV_ERRNO_CURL = -7, // CURL failure - CXIP_PROV_ERRNO_LAST = -8, // last error code (unused) -}; +const char *cxip_strerror(int prov_errno); /* Rosetta reduction engine error codes */ typedef enum cxip_coll_rc { @@ -2778,6 +2898,33 @@ struct cxip_coll_data { bool initialized; }; +struct coll_counters { + int32_t coll_recv_cnt; + int32_t send_cnt; + int32_t recv_cnt; + int32_t pkt_cnt; + int32_t seq_err_cnt; + int32_t tmout_cnt; +}; + +struct cxip_coll_metrics_ep { + int myrank; + bool isroot; +}; +struct cxip_coll_metrics { + long red_count_bad; + long red_count_full; + long red_count_partial; + long red_count_unreduced; + struct cxip_coll_metrics_ep ep_data; +}; + +void cxip_coll_reset_mc_ctrs(struct fid_mc *mc); +void cxip_coll_get_mc_ctrs(struct fid_mc *mc, struct coll_counters *counters); + +void cxip_coll_init_metrics(void); +void cxip_coll_get_metrics(struct cxip_coll_metrics *metrics); + struct cxip_coll_reduction { struct cxip_coll_mc *mc_obj; // parent mc_obj uint32_t red_id; // reduction id @@ -2807,6 +2954,7 @@ struct cxip_coll_mc { struct cxip_zbcoll_obj *zb; // zb object for zbcol struct cxip_coll_pte *coll_pte; // collective PTE struct timespec timeout; // state machine timeout + struct timespec curlexpires; // CURL delete expiration timeout fi_addr_t mynode_fiaddr; // fi_addr of this node int mynode_idx; // av_set index of this node uint32_t hwroot_idx; // av_set index of hwroot node @@ -2815,6 +2963,9 @@ struct cxip_coll_mc { int next_red_id; // next available red_id int max_red_id; // limit total concurrency int seqno; // rolling seqno for packets + int close_state; // the state of the close operation + bool has_closed; // true after a mc close call + bool has_error; // true if any error bool is_multicast; // true if multicast address bool arm_disable; // arm-disable for testing bool retry_disable; // retry-disable for testing @@ -3003,7 +3154,8 @@ static inline bool cxip_cmdq_match(struct cxip_cmdq *cmdq, uint16_t vni, } int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, - size_t num_events, size_t num_fc_events); + size_t num_events, size_t num_fc_events, + struct cxil_wait_obj *priv_wait); void cxip_evtq_fini(struct cxip_evtq *eq); int cxip_domain(struct fid_fabric *fabric, struct fi_info *info, @@ -3083,6 +3235,9 @@ int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src); int cxip_cq_req_error(struct cxip_req *req, size_t olen, int err, int prov_errno, void *err_data, size_t err_data_size, fi_addr_t src_addr); +int cxip_cq_add_wait_fd(struct cxip_cq *cq, int wait_fd, int events); +void cxip_cq_del_wait_fd(struct cxip_cq *cq, int wait_fd); + int proverr2errno(int err); struct cxip_req *cxip_evtq_req_alloc(struct cxip_evtq *evtq, int remap, void *req_ctx); @@ -3090,9 +3245,9 @@ void cxip_evtq_req_free(struct cxip_req *req); void cxip_evtq_progress(struct cxip_evtq *evtq); void cxip_ep_progress(struct fid *fid); -int cxip_ep_peek(struct fid *fid); void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj); +int 
cxip_cq_trywait(struct cxip_cq *cq); void cxip_cq_progress(struct cxip_cq *cq); void cxip_util_cq_progress(struct util_cq *util_cq); int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, @@ -3121,8 +3276,7 @@ void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj); void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj); void cxip_ep_ctrl_fini(struct cxip_ep_obj *ep_obj); -void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj); -int cxip_ep_ctrl_trywait(void *arg); +int cxip_ep_trywait(struct cxip_ep_obj *ep_obj, struct cxip_cq *cq); int cxip_av_set(struct fid_av *av, struct fi_av_set_attr *attr, struct fid_av_set **av_set_fid, void * context); @@ -3165,8 +3319,6 @@ void cxip_coll_limit_red_id(struct fid_mc *mc, int max_red_id); void cxip_coll_drop_send(struct cxip_coll_reduction *reduction); void cxip_coll_drop_recv(struct cxip_coll_reduction *reduction); -void cxip_coll_reset_mc_ctrs(struct fid_mc *mc); - void cxip_dbl_to_rep(struct cxip_repsum *x, double d); void cxip_rep_to_dbl(double *d, const struct cxip_repsum *x); void cxip_rep_add(struct cxip_repsum *x, const struct cxip_repsum *y); @@ -3176,6 +3328,11 @@ double cxip_rep_sum(size_t count, double *values); int cxip_check_auth_key_info(struct fi_info *info); int cxip_gen_auth_key(struct fi_info *info, struct cxi_auth_key *key); +static inline struct fid_peer_srx *cxip_get_owner_srx(struct cxip_rxc *rxc) +{ + return rxc->ep_obj->owner_srx; +} + #define CXIP_FC_SOFTWARE_INITIATED -1 /* cxip_fc_reason() - Returns the event reason for portal state @@ -3220,6 +3377,13 @@ ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr); +static inline int cxip_no_discard(struct fi_peer_rx_entry *rx_entry) +{ + return -FI_ENOSYS; +} + +int cxip_unexp_start(struct fi_peer_rx_entry *entry); + /* * Request variants: * CXIP_RQ_AMO @@ -3549,17 +3713,19 @@ cxip_txc_copy_from_hmem(struct cxip_txc *txc, struct cxip_md *hmem_md, */ if (!cxip_env.fork_safe_requested) { if (!hmem_md) { - ret = cxip_map(domain, hmem_src, size, 0, &hmem_md); + ret = cxip_ep_obj_map(txc->ep_obj, hmem_src, size, 0, + &hmem_md); if (ret) { - TXC_WARN(txc, "cxip_map failed: %d:%s\n", ret, - fi_strerror(-ret)); + TXC_WARN(txc, "cxip_ep_obj_map failed: %d:%s\n", + ret, fi_strerror(-ret)); return ret; } unmap_hmem_md = true; } - cxip_copy_from_md(hmem_md, dest, hmem_src, size); + cxip_ep_obj_copy_from_md(txc->ep_obj, hmem_md, dest, hmem_src, + size); if (unmap_hmem_md) cxip_unmap(hmem_md); @@ -3631,7 +3797,9 @@ int cxip_set_recv_match_id(struct cxip_rxc *rxc, fi_addr_t src_addr, return FI_SUCCESS; } -fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req); +fi_addr_t cxip_recv_req_src_addr(struct cxip_rxc *rxc, + uint32_t init, uint16_t vni, + bool force); int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, struct cxip_md *md, struct cxip_req **cxip_req, int (*recv_cb)(struct cxip_req *req, @@ -3683,4 +3851,74 @@ int cxip_domain_dwq_emit_amo(struct cxip_domain *dom, uint16_t vni, struct c_dma_amo_cmd *amo, uint64_t flags, bool fetching, bool flush); +static inline void cxip_set_env_rx_match_mode(void) +{ + char *param_str = NULL; + + fi_param_get_str(&cxip_prov, "rx_match_mode", ¶m_str); + /* Parameters to tailor hybrid hardware to software transitions + * that are initiated by software. 
+ */ + fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive UX transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_preemptive", + &cxip_env.hybrid_preemptive); + fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive recv transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", + &cxip_env.hybrid_recv_preemptive); + fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", + &cxip_env.hybrid_unexpected_msg_preemptive); + fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", + &cxip_env.hybrid_posted_recv_preemptive); + + if (param_str) { + if (!strcasecmp(param_str, "hardware")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } else if (!strcmp(param_str, "software")) { + cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; + cxip_env.msg_offload = false; + } else if (!strcmp(param_str, "hybrid")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; + cxip_env.msg_offload = true; + } else { + _CXIP_WARN(FI_LOG_FABRIC, "Unrecognized rx_match_mode: %s\n", + param_str); + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_preemptive) { + cxip_env.hybrid_preemptive = false; + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignoring preemptive\n"); + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_recv_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore LE recv preemptive\n"); + cxip_env.hybrid_recv_preemptive = 0; + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_posted_recv_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore hybrid_posted_recv_preemptive\n"); + cxip_env.hybrid_posted_recv_preemptive = 0; + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); + cxip_env.hybrid_unexpected_msg_preemptive = 0; + } +} + #endif diff --git a/prov/cxi/include/cxip_faults.h b/prov/cxi/include/cxip_faults.h index e9b28f17fe9..503a178e5dd 100644 --- a/prov/cxi/include/cxip_faults.h +++ b/prov/cxi/include/cxip_faults.h @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + * Copyright (c) 2019-2024 Hewlett Packard Enterprise Development LP */ /* Fault injection. 
*/ @@ -60,89 +60,14 @@ enum { CXIP_TRAP_GETGRP, CXIP_TRAP_BCAST, CXIP_TRAP_REDUCE, + CXIP_TRAP_CURL_FM_URL, + CXIP_TRAP_CURL_TOKEN, + CXIP_TRAP_HWROOT_INVAL, + CXIP_TRAP_HWROOT_INUSE, + CXIP_TRAP_MCAST_INUSE, CXIP_TRAP_INITPTE, - CXIP_TRAP_CURLSND, - CXIP_TRAP_CURLRCV, }; -#if ENABLE_DEBUG -/* structure used to simulate failures */ -struct _cxip_trap { - struct dlist_entry link; - int index; - int trap; - int err; -}; - -struct dlist_entry _trap_list; -bool _trap_initialized; - -static void _cxip_trap_close(void) -{ - struct _cxip_trap *trap_obj; - - if (!_trap_initialized) - return; - while (!dlist_empty(&_trap_list)) { - dlist_pop_front(&_trap_list, struct _cxip_trap, trap_obj, link); - free(trap_obj); - } -} - -static void _cxip_trap_set(int index, int trap, int err) -{ - struct _cxip_trap *trap_obj; - - if (!_trap_initialized) { - dlist_init(&_trap_list); - _trap_initialized = true; - } - trap_obj = calloc(1, sizeof(*trap_obj)); - if (!trap_obj) - return; - dlist_init(&trap_obj->link); - trap_obj->index = index; - trap_obj->trap = trap; - trap_obj->err = err; - dlist_insert_tail(&_trap_list, &trap_obj->link); -} - -static bool _cxip_trap_search(int index, int trap, int *err) -{ - struct _cxip_trap *trap_obj; - struct dlist_entry *item; - - if (!_trap_initialized) - return false; - - dlist_foreach(&_trap_list, item) { - trap_obj = container_of(item, struct _cxip_trap, link); - if (trap_obj->index != index) - continue; - if (trap_obj->trap != trap) - continue; - dlist_remove(item); - *err = trap_obj->err; - free(trap_obj); - return true; - } - return false; -} - -static inline void cxip_trap_close(void) -{ - _cxip_trap_close(); -} -static inline void cxip_trap_set(int index, int trap, int err) -{ - _cxip_trap_set(index, trap, err); -} -static inline bool cxip_trap_search(int index, int trap, int *err) -{ - return _cxip_trap_search(index, trap, err); -} -#else -static inline void cxip_trap_close(void) {} -static inline void cxip_trap_set(int a, int b, int c) {} -static inline bool cxip_trap_search(int a, int b, int *c) {return false;} -#endif +void cxip_trap_close(void); +void cxip_trap_set(int index, int trap, int err, int prov_errno); +bool cxip_trap_search(int index, int trap, int *err, int *prov_errno); diff --git a/prov/cxi/include/fi_cxi_ext.h b/prov/cxi/include/fi_cxi_ext.h index a2775cbc253..c4629d9ef7f 100644 --- a/prov/cxi/include/fi_cxi_ext.h +++ b/prov/cxi/include/fi_cxi_ext.h @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2020-2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP */ #ifndef _FI_CXI_EXT_H_ @@ -67,6 +67,15 @@ enum { */ #define FI_CXI_CNTR_EVENTS_BYTES 1 /* FI_CNTR_EVENTS_BYTES */ +/* + * CXI provider specific counter flag to return current/cached counter value + * in host memory. A request to update the count is requested, but the routine + * does not wait for the update to complete. Subsequent reads will pick up + * the updated counter value. The normal behavior is to wait for a memory update + * to complete (or to use the domain ops counter routines). + */ +#define FI_CXI_CNTR_CACHED (1ULL << 32) + /* * TODO: Set this to the upstream value prior to releasing software. * This flag returned in a completion and indicates that the message was @@ -109,11 +118,10 @@ enum { */ #define FI_CXI_UNRELIABLE (1ULL << 61) -/* - * Request a provider specific weak FENCE operation to facilitate an - * EP alias ordering point, when the original EP utilizes PCIe RO=1. 
- */ -#define FI_CXI_WEAK_FENCE (1ULL << 63) +/* Deprecated. */ +#define FI_CXI_WEAK_FENCE \ + _Pragma ("GCC warning \"'FI_CXI_WEAK_FENCE' macro is deprecated\"") \ + (1ULL << 63) /* * Used in conjunction with the deferred work queue API. If a deferred work @@ -396,6 +404,60 @@ enum cxip_comm_key_type { COMM_KEY_MAX }; +/* Extends C_RC_* driver errors for libfabric */ +/* Translated to strings by cxip_strerror() -- keep synchronized */ +enum cxip_coll_prov_errno { + /* C_RC_* from cxi-driver overlaps first 6 bits of space [0,63] */ + + /* collectives CQ reduction error codes + * highest number error predominates + */ + FI_CXI_ERRNO_RED_FIRST = 1024, + FI_CXI_ERRNO_RED_FLT_OVERFLOW = 1024, + /* double precision value overflow */ + FI_CXI_ERRNO_RED_FLT_INVALID = 1025, + /* double precision sNAN/inf value */ + FI_CXI_ERRNO_RED_INT_OVERFLOW = 1026, + /* reproducible sum overflow */ + FI_CXI_ERRNO_RED_CONTR_OVERFLOW = 1027, + /* reduction contribution overflow */ + FI_CXI_ERRNO_RED_OP_MISMATCH = 1028, + /* reduction opcode mismatch */ + FI_CXI_ERRNO_RED_MC_FAILURE = 1029, + /* unused */ + FI_CXI_ERRNO_RED_OTHER = 1030, + /* non-specific reduction error, fatal */ + FI_CXI_ERRNO_RED_LAST = 1031, + + /* collectives EQ join error codes + * highest number error predominates + */ + FI_CXI_ERRNO_JOIN_FIRST = 2048, + FI_CXI_ERRNO_JOIN_MCAST_INUSE = 2048, + /* endpoint already using mcast address */ + FI_CXI_ERRNO_JOIN_HWROOT_INUSE = 2049, + /* endpoint already serving as HWRoot */ + FI_CXI_ERRNO_JOIN_MCAST_INVALID = 2050, + /* mcast address from FM is invalid */ + FI_CXI_ERRNO_JOIN_HWROOT_INVALID = 2051, + /* HWRoot address from FM is invalid */ + FI_CXI_ERRNO_JOIN_CURL_FAILED = 2052, + /* libcurl initiation failed */ + FI_CXI_ERRNO_JOIN_CURL_TIMEOUT = 2053, + /* libcurl timed out */ + FI_CXI_ERRNO_JOIN_SERVER_ERR = 2054, + /* unhandled CURL response code */ + FI_CXI_ERRNO_JOIN_FAIL_PTE = 2055, + /* libfabric PTE allocation failed */ + FI_CXI_ERRNO_JOIN_OTHER = 2056, + /* non-specific JOIN error, fatal */ + FI_CXI_ERRNO_JOIN_LAST = FI_CXI_ERRNO_JOIN_FIRST + 43, + /* LAST is determined by the 43-bit error mask. + * Result is the OR of all bits set by different endpoints. + * This reserves space for all 43 bits for new errors. + */ +}; + typedef unsigned int cxip_coll_op_t; // CXI collective opcode struct cxip_coll_mcast_key { diff --git a/prov/cxi/src/cxip_atomic.c b/prov/cxi/src/cxip_atomic.c index be5446e517c..0b8f0d4867b 100644 --- a/prov/cxi/src/cxip_atomic.c +++ b/prov/cxi/src/cxip_atomic.c @@ -35,7 +35,7 @@ _Static_assert(CXIP_AMO_MAX_IOV == 1, "Unexpected max IOV #"); /** * Data type codes for all of the supported fi_datatype values. */ -static enum c_atomic_type _cxip_amo_type_code[OFI_DATATYPE_LAST] = { [FI_INT8] = C_AMO_TYPE_INT8_T, [FI_UINT8] = C_AMO_TYPE_UINT8_T, [FI_INT16] = C_AMO_TYPE_INT16_T, @@ -48,13 +48,15 @@ static enum c_atomic_type _cxip_amo_type_code[] = { [FI_DOUBLE] = C_AMO_TYPE_DOUBLE_T, [FI_FLOAT_COMPLEX] = C_AMO_TYPE_FLOAT_COMPLEX_T, [FI_DOUBLE_COMPLEX] = C_AMO_TYPE_DOUBLE_COMPLEX_T, + /* The only 128-bit op supported is FI_CSWAP, so FI_INT128 should work. */ + [FI_INT128] = C_AMO_TYPE_UINT128_T, + [FI_UINT128] = C_AMO_TYPE_UINT128_T, }; -//TODO: C_AMO_TYPE_UINT128_T /** * AMO operation codes for all of the fi_op values.
*/ -static enum c_atomic_op _cxip_amo_op_code[OFI_ATOMIC_OP_LAST] = { +static enum c_atomic_op _cxip_amo_op_code[FI_ATOMIC_OP_LAST] = { [FI_MIN] = C_AMO_OP_MIN, [FI_MAX] = C_AMO_OP_MAX, [FI_SUM] = C_AMO_OP_SUM, @@ -82,7 +84,7 @@ static enum c_atomic_op _cxip_amo_op_code[OFI_ATOMIC_OP_LAST] = { /** * AMO swap operation codes for the CSWAP comparison conditions. */ -static enum c_cswap_op _cxip_amo_swpcode[OFI_ATOMIC_OP_LAST] = { +static enum c_cswap_op _cxip_amo_swpcode[FI_ATOMIC_OP_LAST] = { [FI_CSWAP] = C_AMO_OP_CSWAP_EQ, [FI_CSWAP_NE] = C_AMO_OP_CSWAP_NE, [FI_CSWAP_LE] = C_AMO_OP_CSWAP_LE, @@ -96,7 +98,7 @@ static enum c_cswap_op _cxip_amo_swpcode[OFI_ATOMIC_OP_LAST] = { * correspond to the 14 possible fi_datatype values. The OP_VALID() macro will * return a 1 if the (request,op,dt) triple is supported by Cassini. */ -static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][OFI_ATOMIC_OP_LAST] = { +static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][FI_ATOMIC_OP_LAST] = { [CXIP_RQ_AMO] = { [FI_MIN] = 0x03ff, @@ -126,7 +128,7 @@ static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][OFI_ATOMIC_OP_LAST] = { }, [CXIP_RQ_AMO_SWAP] = { - [FI_CSWAP] = 0x0fff, + [FI_CSWAP] = 0xcfff, [FI_CSWAP_NE] = 0x0fff, [FI_CSWAP_LE] = 0x03ff, [FI_CSWAP_LT] = 0x03ff, @@ -175,8 +177,8 @@ int _cxip_atomic_opcode(enum cxip_amo_req_type req_type, enum fi_datatype dt, int opcode; int dtcode; - if (dt < 0 || dt >= OFI_DATATYPE_LAST || - op < 0 || op >= OFI_ATOMIC_OP_LAST) + if (dt < 0 || dt >= ARRAY_SIZE(_cxip_amo_type_code) || + op < 0 || op >= FI_ATOMIC_OP_LAST) return -FI_EINVAL; if (!OP_VALID(req_type, op, dt)) @@ -448,7 +450,7 @@ static int _cxip_amo_cb(struct cxip_req *req, const union c_event *event) TXC_WARN_RET(txc, ret, "Failed to report error\n"); } - ofi_atomic_dec32(&req->amo.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->amo.txc); cxip_evtq_req_free(req); return FI_SUCCESS; @@ -610,8 +612,9 @@ static int cxip_amo_emit_idc(struct cxip_txc *txc, if (result_mr) { result_md = result_mr->md; } else { - ret = cxip_map(dom, result, atomic_type_len, 0, - &req->amo.result_md); + ret = cxip_ep_obj_map(txc->ep_obj, result, + atomic_type_len, 0, + &req->amo.result_md); if (ret) { TXC_WARN_RET(txc, ret, "Failed to map result buffer\n"); @@ -928,8 +931,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc, /* Optionally register result MR. */ if (result) { if (!result_mr) { - ret = cxip_map(dom, result, atomic_type_len, 0, - &req->amo.result_md); + ret = cxip_ep_obj_map(txc->ep_obj, result, + atomic_type_len, 0, + &req->amo.result_md); if (ret) { TXC_WARN(txc, "Failed to map result buffer: %d:%s\n", @@ -1015,8 +1019,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc, buf_md = buf_mr->md; } else { /* Map user operand buffer for DMA command. 
*/ - ret = cxip_map(dom, buf, atomic_type_len, 0, - &req->amo.oper1_md); + ret = cxip_ep_obj_map(txc->ep_obj, buf, + atomic_type_len, 0, + &req->amo.oper1_md); if (ret) { TXC_WARN(txc, "Failed to map operand buffer: %d:%s\n", diff --git a/prov/cxi/src/cxip_av.c b/prov/cxi/src/cxip_av.c index 6dd4aa4e415..031bf0fb22e 100644 --- a/prov/cxi/src/cxip_av.c +++ b/prov/cxi/src/cxip_av.c @@ -229,6 +229,18 @@ struct cxip_addr *(*cxip_av_addr_in)(const void *addr) = insert_in; void (*cxip_av_addr_out)(struct cxip_addr *addr_out, struct cxip_addr *addr) = insert_out; +static fi_addr_t cxip_get_addr(struct fi_peer_rx_entry *entry) +{ + uint32_t ux_init; + uint16_t vni; + struct cxip_ux_send *ux = entry->peer_context; + + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + vni = ux->put_ev.tgt_long.vni; + + return cxip_recv_req_src_addr(ux->rxc, ux_init, vni, true); +} + static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { @@ -236,6 +248,7 @@ static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, size_t i; size_t success_cnt = 0; int ret; + struct fid_peer_srx *owner_srx; ret = cxip_av_insert_validate_args(fid, addr_in, count, fi_addr, flags, context); @@ -253,6 +266,10 @@ static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, cxip_av_unlock(av); + owner_srx = av->domain->owner_srx; + if (owner_srx) + owner_srx->owner_ops->foreach_unspec_addr(owner_srx, &cxip_get_addr); + return success_cnt; } diff --git a/prov/cxi/src/cxip_cmdq.c b/prov/cxi/src/cxip_cmdq.c index d2fae71c92b..b60eb06231c 100644 --- a/prov/cxi/src/cxip_cmdq.c +++ b/prov/cxi/src/cxip_cmdq.c @@ -25,19 +25,13 @@ enum cxi_traffic_class cxip_ofi_to_cxi_tc(uint32_t ofi_tclass) } } -static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, +static int cxip_cp_find(struct cxip_lni *lni, uint16_t vni, enum cxi_traffic_class tc, enum cxi_traffic_class_type tc_type, struct cxi_cp **cp) { - int ret; - int i; struct cxip_remap_cp *sw_cp; - static const enum cxi_traffic_class remap_tc = CXI_TC_BEST_EFFORT; - ofi_spin_lock(&lni->lock); - - /* Always prefer SW remapped CPs over allocating HW CP. */ dlist_foreach_container(&lni->remap_cps, struct cxip_remap_cp, sw_cp, remap_entry) { if (sw_cp->remap_cp.vni == vni && sw_cp->remap_cp.tc == tc && @@ -47,10 +41,40 @@ static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, cxi_tc_to_str(sw_cp->remap_cp.tc), cxi_tc_type_to_str(sw_cp->remap_cp.tc_type)); *cp = &sw_cp->remap_cp; - goto success_unlock; + return FI_SUCCESS; } } + return -FI_ENOENT; +} + +static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxi_cp **cp) +{ + int ret; + int i; + struct cxip_remap_cp *sw_cp; + static const enum cxi_traffic_class remap_tc = CXI_TC_BEST_EFFORT; + + /* Always prefer SW remapped CPs over allocating HW CP. */ + pthread_rwlock_rdlock(&lni->cp_lock); + ret = cxip_cp_find(lni, vni, tc, tc_type, cp); + pthread_rwlock_unlock(&lni->cp_lock); + + if (ret == FI_SUCCESS) + return FI_SUCCESS; + + /* Need to repeat search with write lock held to ensure no CPs have + * been added in threaded env. + */ + pthread_rwlock_wrlock(&lni->cp_lock); + ret = cxip_cp_find(lni, vni, tc, tc_type, cp); + + if (ret == FI_SUCCESS) + goto success_unlock; + /* Allocate a new SW remapped CP entry and attempt to allocate the * user requested HW CP. 
*/ @@ -113,14 +137,14 @@ static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, *cp = &sw_cp->remap_cp; success_unlock: - ofi_spin_unlock(&lni->lock); + pthread_rwlock_unlock(&lni->cp_lock); return FI_SUCCESS; err_free_sw_cp: free(sw_cp); err_unlock: - ofi_spin_unlock(&lni->lock); + pthread_rwlock_unlock(&lni->cp_lock); return ret; } @@ -144,6 +168,7 @@ int cxip_cmdq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, ret = cxi_cq_emit_cq_lcid(cmdq->dev_cmdq, cp->lcid); if (ret) { CXIP_DBG("Failed to update CMDQ(%p) CP: %d\n", cmdq, ret); + cxi_cq_ring(cmdq->dev_cmdq); ret = -FI_EAGAIN; } else { ret = FI_SUCCESS; @@ -241,6 +266,7 @@ int cxip_cmdq_emit_c_state(struct cxip_cmdq *cmdq, ret = cxi_cq_emit_c_state(cmdq->dev_cmdq, c_state); if (ret) { CXIP_DBG("Failed to issue C_STATE command: %d\n", ret); + cxi_cq_ring(cmdq->dev_cmdq); return -FI_EAGAIN; } @@ -262,7 +288,8 @@ int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -270,17 +297,26 @@ int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, fi_strerror(-ret)); - return ret; + goto err; } ret = cxi_cq_emit_idc_put(cmdq->dev_cmdq, put, buf, len); if (ret) { CXIP_WARN("Failed to emit idc_put command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. + */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, @@ -293,7 +329,8 @@ int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -301,10 +338,19 @@ int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, if (ret) { CXIP_WARN("Failed to emit dma command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. + */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, @@ -333,7 +379,8 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -341,7 +388,7 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, fi_strerror(-ret)); - return ret; + goto err; } /* Fetching AMO with flush requires two commands. Ensure there is enough @@ -349,13 +396,15 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, */ if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } ret = cxi_cq_emit_idc_amo(cmdq->dev_cmdq, amo, fetching); if (ret) { CXIP_WARN("Failed to emit IDC amo\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } if (fetching_flush) { @@ -367,6 +416,14 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. 
+ */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, @@ -394,7 +451,8 @@ int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -403,13 +461,15 @@ int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, */ if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } ret = cxi_cq_emit_dma_amo(cmdq->dev_cmdq, amo, fetching); if (ret) { CXIP_WARN("Failed to emit DMA amo\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } if (fetching_flush) { @@ -421,6 +481,14 @@ int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. + */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, @@ -435,7 +503,8 @@ int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -443,15 +512,24 @@ int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, fi_strerror(-ret)); - return ret; + goto err; } ret = cxi_cq_emit_idc_msg(cmdq->dev_cmdq, msg, buf, len); if (ret) { CXIP_WARN("Failed to emit idc_msg command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. + */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } diff --git a/prov/cxi/src/cxip_cntr.c b/prov/cxi/src/cxip_cntr.c index 8a0989b479e..c94933fbd97 100644 --- a/prov/cxi/src/cxip_cntr.c +++ b/prov/cxi/src/cxip_cntr.c @@ -56,20 +56,33 @@ static int cxip_cntr_get_ct_error(struct cxip_cntr *cntr, uint64_t *error) struct c_ct_writeback wb_copy; int ret; - /* Only can reference the ct_failure field directly if dealing with - * system memory. Device memory requires a memcpy of the contents into - * system memory. - */ if (cntr->wb_iface == FI_HMEM_SYSTEM) { - *error = cntr->wb->ct_failure; - return FI_SUCCESS; + do { + if (cntr->wb->ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *error = cntr->wb->ct_failure; + return -FI_SUCCESS; + } + sched_yield(); + } while (true); } - ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); - if (ret) - return ret; + /* Device memory requires a memcpy of the contents into + * system memory. + */ + do { + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + if (wb_copy.ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *error = wb_copy.ct_failure; + return -FI_SUCCESS; + } + sched_yield(); + } while (true); - *error = wb_copy.ct_failure; return FI_SUCCESS; } @@ -78,20 +91,33 @@ static int cxip_cntr_get_ct_success(struct cxip_cntr *cntr, uint64_t *success) struct c_ct_writeback wb_copy; int ret; - /* Only can reference the ct_success field directly if dealing with - * system memory. Device memory requires a memcpy of the contents into - * system memory. 
- */ if (cntr->wb_iface == FI_HMEM_SYSTEM) { - *success = cntr->wb->ct_success; - return FI_SUCCESS; + do { + if (cntr->wb->ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *success = cntr->wb->ct_success; + return FI_SUCCESS; + } + sched_yield(); + } while (true); } - ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); - if (ret) - return ret; + /* Device memory requires a memcpy of the contents into + * system memory. + */ + do { + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + if (wb_copy.ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *success = wb_copy.ct_success; + return FI_SUCCESS; + } + sched_yield(); + } while (true); - *success = wb_copy.ct_success; return FI_SUCCESS; } @@ -306,6 +332,7 @@ int cxip_cntr_mod(struct cxip_cntr *cxi_cntr, uint64_t value, bool set, return FI_SUCCESS; } +/* Caller must hold cntr->lock */ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) { int ret; @@ -313,8 +340,6 @@ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) /* The calling thread which changes CT writeback bit from 1 to 0 must * issue a CT get command. */ - ofi_mutex_lock(&cntr->lock); - ret = cxip_cntr_get_ct_writeback(cntr); if (ret < 0) { CXIP_WARN("Failed to read counter writeback: rc=%d\n", ret); @@ -334,8 +359,6 @@ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) *issue_ct_get = false; } - ofi_mutex_unlock(&cntr->lock); - return FI_SUCCESS; err_unlock: @@ -351,6 +374,8 @@ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) * Schedule hardware to write the value of a counter to memory. Avoid * scheduling multiple write-backs at once. The counter value will appear in * memory a small amount of time later. + * + * Caller must hold cntr->lock */ static int cxip_cntr_get(struct cxip_cntr *cxi_cntr, bool force) { @@ -367,7 +392,7 @@ static int cxip_cntr_get(struct cxip_cntr *cxi_cntr, bool force) return ret; } - if (!issue_ct_get) + if (!issue_ct_get && cxi_cntr->attr.flags & FI_CXI_CNTR_CACHED) return FI_SUCCESS; } @@ -422,10 +447,13 @@ static uint64_t cxip_cntr_read(struct fid_cntr *fid_cntr) cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); cxip_cntr_progress(cxi_cntr); + + ofi_mutex_lock(&cxi_cntr->lock); cxip_cntr_get(cxi_cntr, false); - /* TODO: Fall back to reading register on error? */ ret = cxip_cntr_get_ct_success(cxi_cntr, &success); + ofi_mutex_unlock(&cxi_cntr->lock); + if (ret != FI_SUCCESS) CXIP_WARN("Failed to read counter success: rc=%d\n", ret); @@ -444,10 +472,13 @@ static uint64_t cxip_cntr_readerr(struct fid_cntr *fid_cntr) cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); cxip_cntr_progress(cxi_cntr); + + ofi_mutex_lock(&cxi_cntr->lock); cxip_cntr_get(cxi_cntr, false); - /* TODO: Fall back to reading register on error? */ ret = cxip_cntr_get_ct_error(cxi_cntr, &error); + ofi_mutex_unlock(&cxi_cntr->lock); + if (ret != FI_SUCCESS) CXIP_WARN("Failed to read counter error: rc=%d\n", ret); @@ -746,9 +777,11 @@ int cxip_set_wb_buffer(struct fid *fid, void *buf, size_t len) } /* Force a counter writeback into the user's provider buffer. 
*/ + ofi_mutex_lock(&cntr->lock); do { ret = cxip_cntr_get(cntr, true); } while (ret == -FI_EAGAIN); + ofi_mutex_unlock(&cntr->lock); return ret; } @@ -825,7 +858,7 @@ static int cxip_cntr_verify_attr(struct fi_cntr_attr *attr) return -FI_ENOSYS; } - if (attr->flags) + if (attr->flags & ~FI_CXI_CNTR_CACHED) return -FI_ENOSYS; return FI_SUCCESS; diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c index 40ef8a60f59..2bff682369f 100644 --- a/prov/cxi/src/cxip_coll.c +++ b/prov/cxi/src/cxip_coll.c @@ -3,7 +3,7 @@ * * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2025 Hewlett Packard Enterprise Development LP * Support for accelerated collective reductions. */ @@ -31,8 +31,6 @@ #define TRACE_PKT(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_PKT, fmt, \ ##__VA_ARGS__) -#define TRACE_CURL(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_CURL, fmt, \ - ##__VA_ARGS__) #define TRACE_JOIN(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_JOIN, fmt, \ ##__VA_ARGS__) #define TRACE_DEBUG(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_DEBUG, fmt, \ @@ -50,6 +48,56 @@ #define MAGIC 0x677d +/**************************************************************************** + * Metrics for evaluating collectives + */ + +struct cxip_coll_metrics_loc { + ofi_atomic64_t red_count_bad; + ofi_atomic64_t red_count_full; + ofi_atomic64_t red_count_partial; + ofi_atomic64_t red_count_unreduced; + struct cxip_coll_metrics_ep ep_data; +}; +static struct cxip_coll_metrics_loc _coll_metrics; + +void cxip_coll_init_metrics(void) +{ + ofi_atomic_initialize64(&_coll_metrics.red_count_bad, 0); + ofi_atomic_initialize64(&_coll_metrics.red_count_full, 0); + ofi_atomic_initialize64(&_coll_metrics.red_count_partial, 0); + ofi_atomic_initialize64(&_coll_metrics.red_count_unreduced, 0); + memset(&_coll_metrics.ep_data, 0, sizeof(_coll_metrics.ep_data)); +} + +void cxip_coll_get_metrics(struct cxip_coll_metrics *metrics) +{ + metrics->red_count_bad = + ofi_atomic_get64(&_coll_metrics.red_count_bad); + metrics->red_count_full = + ofi_atomic_get64(&_coll_metrics.red_count_full); + metrics->red_count_partial = + ofi_atomic_get64(&_coll_metrics.red_count_partial); + metrics->red_count_unreduced = + ofi_atomic_get64(&_coll_metrics.red_count_unreduced); + memcpy(&metrics->ep_data, &_coll_metrics.ep_data, + sizeof(struct cxip_coll_metrics_ep)); +} + +static inline void _measure_completions(int red_cnt, size_t total) +{ + if (red_cnt >= total) + ofi_atomic_inc64(&_coll_metrics.red_count_bad); + else if (red_cnt == total-1) + ofi_atomic_inc64(&_coll_metrics.red_count_full); + else if (red_cnt > 1) + ofi_atomic_inc64(&_coll_metrics.red_count_partial); + else if (red_cnt > 0) + ofi_atomic_inc64(&_coll_metrics.red_count_unreduced); + else + ofi_atomic_inc64(&_coll_metrics.red_count_bad); +} + /**************************************************************************** * Reduction packet for hardware accelerated collectives: * @@ -97,7 +145,22 @@ * * retry is a control bit that can be invoked by the hw root node to initiate a * retransmission of the data from the leaves, if packets are lost. + * + * A re-arm of an armed switch port may not clear the data in the port, + * resulting in incorrect results. Arming twice will guarantee that the + * old data is cleared. 
+ * + * To disambiguate these two arming packets, it is recommended that the first + * arm use a reserved sequence number, allowing the software to receive the + * first arm packet (and data), identify it as a pre-emptive arm, and discard + * it. + * + * The sequence numbers occupy 10 bits of the packet header. The sequence + * numbers are monotonically incremented modulo ((1 << 10)-1), meaning that + * the largest sequence number will be ((1 << 10)-2). The unreachable value + * of ((1 << 10)-1) is designated the reserved value for pre-emptive arming. */ + struct cxip_coll_cookie { uint32_t mcast_id:13; uint32_t red_id:3; @@ -314,7 +377,6 @@ static cxip_coll_op_t _uint8_16_32_op_to_opcode[FI_CXI_OP_LAST]; static cxip_coll_op_t _int64_op_to_opcode[FI_CXI_OP_LAST]; static cxip_coll_op_t _uint64_op_to_opcode[FI_CXI_OP_LAST]; static cxip_coll_op_t _flt_op_to_opcode[FI_CXI_OP_LAST]; -static enum c_return_code _cxip_rc_to_cxi_rc[16]; static enum cxip_coll_redtype _cxi_op_to_redtype[COLL_OPCODE_MAX]; /* One-time dynamic initialization of FI to CXI opcode. @@ -323,7 +385,7 @@ void cxip_coll_populate_opcodes(void) { int i; - if ((int)FI_CXI_MINMAXLOC < (int)OFI_ATOMIC_OP_LAST) { + if ((int)FI_CXI_MINMAXLOC < (int)FI_ATOMIC_OP_LAST) { CXIP_FATAL("Invalid CXI_FMINMAXLOC value\n"); } for (i = 0; i < FI_CXI_OP_LAST; i++) { @@ -384,17 +446,6 @@ void cxip_coll_populate_opcodes(void) _cxi_op_to_redtype[COLL_OPCODE_INT_MINMAXLOC] = REDTYPE_IMINMAX; _cxi_op_to_redtype[COLL_OPCODE_FLT_MINMAXNUMLOC] = REDTYPE_FMINMAX; _cxi_op_to_redtype[COLL_OPCODE_FLT_REPSUM] = REDTYPE_REPSUM; - - for (i = 0; i < 16; i++) - _cxip_rc_to_cxi_rc[i] = C_RC_AMO_ALIGN_ERROR; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_SUCCESS] = C_RC_OK; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INEXACT] = C_RC_AMO_FP_INEXACT; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INVALID] = C_RC_AMO_FP_INVALID; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_REP_INEXACT] = C_RC_AMO_FP_INEXACT; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_INT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_CONTR_OVERFLOW] = C_RC_AMO_LENGTH_ERROR; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_OP_MISMATCH] = C_RC_AMO_INVAL_OP_ERROR; } static inline int int8_16_32_op_to_opcode(int op) @@ -830,7 +881,7 @@ static void _coll_rx_req_report(struct cxip_req *req) } else { /* non-reduction packet */ err = FI_ENOMSG; - CXIP_INFO("Not reduction pkt: %p (err: %d, %s)\n", + CXIP_WARN("Not reduction pkt: %p (err: %d, %s)\n", req, err, cxi_rc_to_str(err)); } @@ -930,6 +981,11 @@ static void _coll_rx_progress(struct cxip_req *req, return; } #endif + // On re-arm of an armed switch port, drop the pre-arm packet + if (pkt->hdr.seqno == CXIP_COLL_MOD_SEQNO) { + CXIP_INFO("pre-rearm pkt dropped\n"); + return; + } /* Progress the reduction */ _dump_red_pkt(pkt, "recv"); @@ -1190,8 +1246,8 @@ static int _coll_add_buffers(struct cxip_coll_pte *coll_pte, size_t size, ret = -FI_ENOMEM; goto out; } - ret = cxip_map(coll_pte->ep_obj->domain, (void *)buf->buffer, - size, 0, &buf->cxi_md); + ret = cxip_ep_obj_map(coll_pte->ep_obj, (void *)buf->buffer, + size, 0, &buf->cxi_md); if (ret) goto del_msg; buf->bufsiz = size; @@ -1247,7 +1303,7 @@ bool _quiesce_nan(double *d) } /** - * Implement NaN comparison in RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM + * Implement NaN comparisons FLT_MINNUM and FLT_MAXNUM * * Only associative mode is supported. The old IEEE mode is incorrect, and has * been deprecated.
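The associative NaN handling referenced above (implemented by the swpnan2() helper) follows the minNum/maxNum rule: one NaN loses to any number, two NaNs stay NaN, and signaling NaNs are quiesced by arithmetic. An illustrative sketch for the min case, not the provider code:

#include <math.h>

/* Associative FLT_MINNUM: a lone NaN is dominated by any number. */
static double minnum_assoc(double a, double b)
{
	if (isnan(a) && isnan(b))
		return a + b;	/* NaN; addition quiets an sNaN */
	if (isnan(a))
		return b;
	if (isnan(b))
		return a;
	return a < b ? a : b;
}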
@@ -1324,35 +1380,6 @@ void swpidx(uint64_t *i1, uint64_t i2, int swp)
 		*i1 = i2;
 }
 
-/* Determine if double precision sum is exact. This shifts the value with the
- * lower exponent toward the MSBit by the amount of the bitwise overlap between
- * the final sum and the value that resulted in that sum. If any non-zero bits
- * remain in that smaller value, they were discarded during the summation, and
- * the result is inexact.
- */
-static inline
-bool exact(double rslt, double d)
-{
-	// TODO verify sign and shift
-	unsigned long m1, m2;
-	int s1, e1, s2, e2;
-	int shft, dlte;
-	bool ret;
-
-	_decompose_dbl(rslt, &s1, &e1, &m1);
-	_decompose_dbl(d, &s2, &e2, &m2);
-	dlte = e1 - e2;
-
-	if (dlte < 0) {
-		shft = MIN(52 + dlte, 0);
-		ret = !(m1 << shft);
-	} else {
-		shft= MIN(52 - dlte, 0);
-		ret = !(m2 << shft);
-	}
-	return ret;
-}
-
 static inline
 void _dump_coll_data(const char *tag, const struct cxip_coll_data *coll_data)
 {
@@ -1488,7 +1515,6 @@ static void _reduce(struct cxip_coll_data *accum,
 		/* overflow not possible */
 		break;
 	case COLL_OPCODE_INT_MINMAXLOC:
-		/* RSDG 4.5.9.2.2 MINMAXLOC */
 		/* return smallest value and its index */
 		if (accum->intminmax.iminval > coll_data->intminmax.iminval) {
 			accum->intminmax.iminval = coll_data->intminmax.iminval;
@@ -1526,21 +1552,18 @@
 		}
 		break;
 	case COLL_OPCODE_FLT_MINNUM:
-		/* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */
 		for (i = 0; i < 4; i++) {
 			swpnan2(&accum->fltval.fval[i],
 				coll_data->fltval.fval[i], 1, &accum->red_rc);
 		}
 		break;
 	case COLL_OPCODE_FLT_MAXNUM:
-		/* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */
 		for (i = 0; i < 4; i++) {
 			swpnan2(&accum->fltval.fval[i],
 				coll_data->fltval.fval[i], 0, &accum->red_rc);
 		}
 		break;
 	case COLL_OPCODE_FLT_MINMAXNUMLOC:
-		/* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */
 		swp = swpnan2(&accum->fltminmax.fminval,
			      coll_data->fltminmax.fminval, 1, &accum->red_rc);
 		swpidx(&accum->fltminmax.fminidx,
		       coll_data->fltminmax.fminidx, swp);
@@ -1560,10 +1583,6 @@
 			/* NOTE: arithmetic operations will quiesce snan */
 			accum->fltval.fval[i] +=
 				coll_data->fltval.fval[i];
-			if (!exact(accum->fltval.fval[i],
-				   coll_data->fltval.fval[i]))
-				SET_RED_RC(accum->red_rc,
-					   CXIP_COLL_RC_FLT_INEXACT);
 			if (isinf(accum->fltval.fval[i]))
 				SET_RED_RC(accum->red_rc,
 					   CXIP_COLL_RC_FLT_OVERFLOW);
@@ -1581,10 +1600,6 @@
 			/* NOTE: arithmetic operations will quiesce snan */
 			accum->fltval.fval[i] +=
 				coll_data->fltval.fval[i];
-			if (!exact(accum->fltval.fval[i],
-				   coll_data->fltval.fval[i]))
-				SET_RED_RC(accum->red_rc,
-					   CXIP_COLL_RC_FLT_INEXACT);
 			if (isinf(accum->fltval.fval[i]))
 				SET_RED_RC(accum->red_rc,
 					   CXIP_COLL_RC_FLT_OVERFLOW);
@@ -1727,7 +1742,7 @@ int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction,
			   bool arm, bool retry)
 {
 	struct red_pkt *pkt;
-	int ret;
+	int ret = FI_SUCCESS;
 
 	pkt = (struct red_pkt *)reduction->tx_msg;
 
@@ -1758,13 +1773,33 @@
 		pkt->hdr.repsum_ovflid = 0;
 		memset(pkt->data, 0, CXIP_COLL_MAX_DATA_SIZE);
 	}
-	_dump_red_pkt(pkt, "send");
-	_swappkt(pkt);
-
-	/* -FI_EAGAIN means HW queue is full, should self-clear */
-	do {
-		ret = _send_pkt(reduction);
-	} while (ret == -FI_EAGAIN);
+	// When re-arming an armed switch port, first send a clearing packet
+	if (arm && retry) {
+		int save_seqno = pkt->hdr.seqno;
+
+		// Use the reserved seqno so receivers can identify and drop it
+		pkt->hdr.seqno = CXIP_COLL_MOD_SEQNO;
+		_dump_red_pkt(pkt, 
"retry"); + _swappkt(pkt); + do { + /* -FI_EAGAIN means HW queue is full, self-clears */ + ret = _send_pkt(reduction); + } while (ret == -FI_EAGAIN); + _swappkt(pkt); + pkt->hdr.seqno = save_seqno; + } + + if (ret == FI_SUCCESS) { + _dump_red_pkt(pkt, "send"); + _swappkt(pkt); + do { + /* -FI_EAGAIN means HW queue is full, self-clears */ + ret = _send_pkt(reduction); + } while (ret == -FI_EAGAIN); + _swappkt(pkt); + } + /* any other error is a serious config/hardware issue */ if (ret) CXIP_WARN("Fatal send error = %d\n", ret); @@ -1776,28 +1811,53 @@ int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, static void _post_coll_complete(struct cxip_coll_reduction *reduction) { struct cxip_req *req; - int ret; + int ret, prov; /* Indicates collective completion by writing to the endpoint TX CQ */ req = reduction->op_inject_req; if (!req) return; - if (reduction->accum.red_rc == CXIP_COLL_RC_SUCCESS) { + /* convert Rosetta return codes to CXIP return codes */ + if (reduction->accum.red_rc == CXIP_COLL_RC_SUCCESS || + reduction->accum.red_rc == CXIP_COLL_RC_FLT_INEXACT || + reduction->accum.red_rc == CXIP_COLL_RC_FLT_INVALID || + reduction->accum.red_rc == CXIP_COLL_RC_FLT_OVERFLOW) { + switch (reduction->accum.red_rc) { + case CXIP_COLL_RC_FLT_INEXACT: + CXIP_WARN("coll reduce FLT result was rounded\n"); + break; + case CXIP_COLL_RC_FLT_INVALID: + CXIP_WARN("coll reduce FLT invalid\n"); + break; + case CXIP_COLL_RC_FLT_OVERFLOW: + CXIP_WARN("coll reduce FLT overflow\n"); + break; + default: + break; + } ret = cxip_cq_req_complete(req); } else { - ret = cxip_cq_req_error(req, 0, - _cxip_rc_to_cxi_rc[reduction->accum.red_rc], - reduction->accum.red_rc, NULL, 0, FI_ADDR_UNSPEC); + switch (reduction->accum.red_rc) { + case CXIP_COLL_RC_INT_OVERFLOW: + prov = FI_CXI_ERRNO_RED_INT_OVERFLOW; + break; + case CXIP_COLL_RC_CONTR_OVERFLOW: + prov = FI_CXI_ERRNO_RED_CONTR_OVERFLOW; + break; + case CXIP_COLL_RC_OP_MISMATCH: + prov = FI_CXI_ERRNO_RED_OP_MISMATCH; + break; + default: + prov = FI_CXI_ERRNO_RED_OTHER; + break; + } + ret = cxip_cq_req_error(req, 0, -FI_EOTHER, prov, + NULL, 0, FI_ADDR_UNSPEC); } - if (ret) { - /* Is this possible? The only error is -FI_ENOMEM. It looks like - * send is blocked with -FI_EAGAIN until we are guaranteed EQ - * space in the queue. Display and ignore. 
- */ - CXIP_WARN("Attempt to post completion failed %s\n", + if (ret) + CXIP_FATAL("Attempt to post completion failed %s\n", fi_strerror(-ret)); - } /* req structure no longer needed */ cxip_evtq_req_free(req); @@ -1880,12 +1940,19 @@ static void _unpack_red_data(struct cxip_coll_data *coll_data, #define DECMOD(val, mod) do {(val)=((val)+(mod)-1)%(mod);} while (0) /* MONOTONIC timestamp operations for timeouts/retries */ + +/* get current time */ static inline void _tsget(struct timespec *ts) { - clock_gettime(CLOCK_MONOTONIC, ts); + uint64_t ns; + + ns = ofi_gettime_ns(); + ts->tv_sec = ns / 1000000000; + ts->tv_nsec = ns % 1000000000; } +/* advance time by delta */ static inline void _tsadd(struct timespec *ts, const struct timespec *dt) { @@ -1897,42 +1964,66 @@ void _tsadd(struct timespec *ts, const struct timespec *dt) } } -/* Set a timespec at expiration time (future) */ +/* set current time plus increment */ +static inline +void _tsset(struct timespec *ts, const struct timespec *dt) +{ + _tsget(ts); + _tsadd(ts, dt); +} + +/* test for expiration of time */ +static inline +bool _tsexp(struct timespec *ts) +{ + struct timespec tsnow; + + _tsget(&tsnow); + TRACE_JOIN("now=%ld.%ld exp=%ld.%ld\n", + tsnow.tv_sec, tsnow.tv_nsec, + ts->tv_sec, ts->tv_nsec); + if (tsnow.tv_sec < ts->tv_sec) + return false; + if (tsnow.tv_sec > ts->tv_sec) + return true; + return (tsnow.tv_nsec >= ts->tv_nsec); +} + +/* test for {0,0} timestamp */ static inline -void _tsset(struct cxip_coll_reduction *reduction) +bool _tsnul(struct timespec *ts) { - _tsget(&reduction->tv_expires); - _tsadd(&reduction->tv_expires, &reduction->mc_obj->timeout); + return !(ts->tv_sec | ts->tv_nsec); +} + +/* Set reduction expiration time (future) */ +static inline +void _ts_red_set(struct cxip_coll_reduction *reduction) +{ + _tsset(&reduction->tv_expires, &reduction->mc_obj->timeout); } /* Used to prevent first-use incast */ static inline bool _is_red_first_time(struct cxip_coll_reduction *reduction) { - return (reduction->tv_expires.tv_sec == 0L && - reduction->tv_expires.tv_nsec == 0L); + return _tsnul(&reduction->tv_expires); } /* Used to reduce incast congestion during run */ static inline bool _is_red_timed_out(struct cxip_coll_reduction *reduction) { - struct timespec tsnow; - if (reduction->mc_obj->retry_disable) return false; if (_is_red_first_time(reduction)) { - TRACE_DEBUG("=== root first time, retry\n"); + TRACE_DEBUG("=== root redid=%d first time, retry\n", + reduction->red_id); return true; } - _tsget(&tsnow); - if (tsnow.tv_sec < reduction->tv_expires.tv_sec) - return false; - if (tsnow.tv_sec == reduction->tv_expires.tv_sec && - tsnow.tv_nsec < reduction->tv_expires.tv_nsec) - return false; - TRACE_DEBUG("=== root timeout, retry\n"); - return true; + + /* disable timeout logic for now */ + return false; } /* Root node state machine progress. 
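/*
 * Editor's sketch (standalone, using clock_gettime() directly instead of the
 * provider's ofi_gettime_ns()) of the set-deadline/test-expiry pattern that
 * _tsset()/_tsexp() implement above:
 */
#include <stdbool.h>
#include <time.h>

static void deadline_set(struct timespec *exp, long timeout_msec)
{
	clock_gettime(CLOCK_MONOTONIC, exp);
	exp->tv_sec += timeout_msec / 1000;
	exp->tv_nsec += (timeout_msec % 1000) * 1000000L;
	if (exp->tv_nsec >= 1000000000L) {	/* normalize the carry */
		exp->tv_sec++;
		exp->tv_nsec -= 1000000000L;
	}
}

static bool deadline_expired(const struct timespec *exp)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	if (now.tv_sec != exp->tv_sec)
		return now.tv_sec > exp->tv_sec;
	return now.tv_nsec >= exp->tv_nsec;
}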
@@ -1954,12 +2045,14 @@ static void _progress_root(struct cxip_coll_reduction *reduction, if (_is_red_timed_out(reduction)) { /* reset reduction for retry send */ reduction->seqno = mc_obj->seqno; - INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + TRACE_PKT("root T/O reduction seqno = %d\n", reduction->seqno); + INCMOD(mc_obj->seqno, CXIP_COLL_MOD_SEQNO); + TRACE_PKT("root T/O mc_obj seqno = %d\n", mc_obj->seqno); ofi_atomic_inc32(&mc_obj->tmout_cnt); ret = cxip_coll_send_red_pkt(reduction, NULL, !mc_obj->arm_disable, true); - _tsset(reduction); + _ts_red_set(reduction); if (ret) { SET_RED_RC(reduction->accum.red_rc, CXIP_COLL_RC_TX_FAILURE); @@ -1971,9 +2064,6 @@ static void _progress_root(struct cxip_coll_reduction *reduction, /* Process received packet */ if (pkt) { - /* Root has received a leaf packet */ - _dump_red_pkt(pkt, "Rrcv"); - /* Drop out-of-date packets */ if (pkt->hdr.resno != reduction->seqno) { TRACE_DEBUG("bad seqno, exp=%d saw=%d\n", @@ -1982,8 +2072,14 @@ static void _progress_root(struct cxip_coll_reduction *reduction, return; } - /* capture and reduce packet information */ + /* capture packet information */ _unpack_red_data(&coll_data, pkt); +#if ENABLE_DEBUG + /* capture completion metrics */ + _measure_completions(coll_data.red_cnt, + mc_obj->av_set_obj->fi_addr_cnt); +#endif + /* perform the reduction */ _reduce(&reduction->accum, &coll_data, false); _dump_coll_data("after leaf contrib to root", &reduction->accum); } @@ -1999,12 +2095,13 @@ static void _progress_root(struct cxip_coll_reduction *reduction, /* send reduction result to leaves, arm new seqno */ reduction->seqno = mc_obj->seqno; - INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + INCMOD(mc_obj->seqno, CXIP_COLL_MOD_SEQNO); reduction->completed = true; + TRACE_DEBUG("root send seqno = %d\n", reduction->seqno); ret = cxip_coll_send_red_pkt(reduction, &reduction->accum, !mc_obj->arm_disable, false); - _tsset(reduction); + _ts_red_set(reduction); if (ret) SET_RED_RC(reduction->accum.red_rc, CXIP_COLL_RC_TX_FAILURE); @@ -2040,19 +2137,21 @@ static void _progress_leaf(struct cxip_coll_reduction *reduction, /* if reduction packet, reset timer, seqno, honor retry */ if (pkt) { - _dump_red_pkt(pkt, "Lrcv"); - _tsset(reduction); + TRACE_DEBUG("%s: packet seen\n", __func__); + _ts_red_set(reduction); reduction->seqno = pkt->hdr.seqno; reduction->resno = pkt->hdr.seqno; if (pkt->hdr.retry) reduction->pktsent = false; + TRACE_PKT("leaf rcv seqno = %d\n", reduction->seqno); } /* leaves lead with sending a packet */ if (!reduction->pktsent) { /* Avoid first-use incast, retry guaranteed */ if (_is_red_first_time(reduction)) { - TRACE_DEBUG("=== leaf first time, wait\n"); + TRACE_DEBUG("=== leaf redid=%d first time, wait\n", + reduction->red_id); return; } @@ -2418,15 +2517,19 @@ union pack_mcast { uint64_t mcast_addr: 16;// maximum anticipated multicast uint64_t hwroot_idx: 27;// 128M endpoints in tree uint64_t valid: 1; // success flag - uint64_t pad: 20; // needed by zbcoll + uint64_t pad: 20; // used by zbcoll } __attribute__((__packed__)); +} __attribute__((__packed__)); + +union pack_errbits { + uint64_t uint64; struct { uint64_t error_bits: 43;// up to 43 independent errors - uint64_t valid1: 1; // unused/reserved - uint64_t pad1: 20; // unused/reserved + uint64_t valid: 1; // success flag + uint64_t pad1: 20; // needed by zbcoll } __attribute__((__packed__)); -}; +} __attribute__((__packed__)); /* State structure for carrying data through the join sequence */ struct cxip_join_state { @@ -2434,10 +2537,12 @@ struct 
cxip_join_state { struct cxip_av_set *av_set_obj; // av set for this collective struct cxip_coll_mc *mc_obj; // mc object for this collective struct cxip_zbcoll_obj *zb; // zb object associated with state + struct timespec curlexpires; // multicast creation expiration timeout struct fid_mc **mc; // user pointer to return mc_obj void *context; // user context for concurrent joins uint64_t join_flags; // user-supplied libfabric join flags union pack_mcast bcast_data; // packed multicast data + union pack_errbits reduce_err; // packed join error bits bool rx_discard; // set if RX events should be discarded bool is_rank; // set if using COLL_RANK simulation model bool is_mcast; // set if using Rosetta multicast tree @@ -2455,49 +2560,65 @@ struct cxip_join_state { }; /* State structure for recovering data from CURL response */ -struct cxip_curl_mcast_usrptr { +struct cxip_curl_mcast_create_usrptr { struct cxip_join_state *jstate; // join state int mcast_id; // multicast address int hwroot_rank; // hardware root index }; +struct cxip_curl_mcast_delete_usrptr { + struct cxip_coll_mc *mc_obj; // multicast object +}; + /* pack provider errors into AND bitmask - address data */ void _proverr_to_bits(struct cxip_join_state *jstate) { int bitno; /* record error as a bit for this endpoint */ - jstate->bcast_data.error_bits = 0L; - if (!jstate->bcast_data.valid) { - bitno = -jstate->prov_errno; - jstate->bcast_data.error_bits |= (1L << bitno); + TRACE_JOIN("%s: prov_errno=%d\n", __func__, jstate->prov_errno); + jstate->reduce_err.error_bits = 0L; + if (jstate->prov_errno) { + if (jstate->prov_errno >= FI_CXI_ERRNO_JOIN_LAST) + jstate->prov_errno = FI_CXI_ERRNO_JOIN_OTHER; + bitno = jstate->prov_errno - FI_CXI_ERRNO_JOIN_FIRST; + jstate->reduce_err.error_bits |= (1L << bitno); } /* invert bits, zbcoll reduce does AND */ - jstate->bcast_data.error_bits ^= -1L; + TRACE_JOIN("%s: error bitmask=%016lx\n", __func__, + (uint64_t)jstate->reduce_err.error_bits); + jstate->reduce_err.error_bits ^= -1L; } -/* unpack AND bitmask into dominant provider error */ +/* unpack bitmask and return largest error */ void _bits_to_proverr(struct cxip_join_state *jstate) { - int bitno; + int prov_errno; + uint64_t bitmask; /* zbcoll reduce does AND, invert bits */ - jstate->bcast_data.error_bits ^= -1L; - - /* if data is valid, bits do not represent errors */ - if (jstate->bcast_data.valid) { - jstate->prov_errno = CXIP_PROV_ERRNO_OK; + jstate->reduce_err.error_bits ^= -1L; + TRACE_JOIN("%s: error bitmask=%016lx\n", __func__, + (uint64_t)jstate->reduce_err.error_bits); + + /* display all errors, capture the highest value error */ + jstate->prov_errno = 0L; + if (!jstate->reduce_err.error_bits) { + TRACE_JOIN("%s: no error seen\n", __func__); return; } - /* bits set represent multiple errors from endpoints */ - for (bitno = -CXIP_PROV_ERRNO_OK; bitno < -CXIP_PROV_ERRNO_LAST; bitno++) { - if (jstate->bcast_data.error_bits & (1 << bitno)) { - jstate->prov_errno = -bitno; - CXIP_WARN("join error %d seen\n", jstate->prov_errno); + bitmask = 1L; + for (prov_errno = FI_CXI_ERRNO_JOIN_FIRST; + prov_errno < FI_CXI_ERRNO_JOIN_LAST; + prov_errno++) { + if (jstate->reduce_err.error_bits & bitmask) { + jstate->prov_errno = prov_errno; + CXIP_WARN("%s\n", cxip_strerror(jstate->prov_errno)); + TRACE_JOIN("%s\n", cxip_strerror(jstate->prov_errno)); } + bitmask <<= 1; } - /* returns most significant of multiple errors as jstate->prov_errno */ } /* Close collective pte object - ep_obj->lock must be held */ @@ -2532,8 +2653,10 @@ static int 
_acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx,
 	*coll_pte_ret = NULL;
 	coll_pte = calloc(1, sizeof(*coll_pte));
-	if (!coll_pte)
+	if (!coll_pte) {
+		TRACE_JOIN("out of memory\n");
 		return -FI_ENOMEM;
+	}
 
 	/* initialize coll_pte */
 	coll_pte->ep_obj = ep_obj;
@@ -2546,20 +2669,27 @@ static int _acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx,
 	ret = cxip_pte_alloc(ep_obj->ptable, ep_obj->coll.rx_evtq->eq,
			     pid_idx, is_mcast, &pt_opts, _coll_pte_cb,
			     coll_pte, &coll_pte->pte);
-	if (ret)
-		goto fail;
+	if (ret) {
+		TRACE_JOIN("cxip_pte_alloc failed=%d\n", ret);
+		free(coll_pte);
+		return ret;
+	}
 
 	/* enable the PTE */
 	ret = _coll_pte_enable(coll_pte, CXIP_PTE_IGNORE_DROPS);
-	if (ret)
+	if (ret) {
+		TRACE_JOIN("_coll_pte_enable failed=%d\n", ret);
 		goto fail;
+	}
 
 	/* add buffers to the PTE */
 	ret = _coll_add_buffers(coll_pte, ep_obj->coll.buffer_size,
				ep_obj->coll.buffer_count);
-	if (ret)
+	if (ret) {
+		TRACE_JOIN("_coll_add_buffers failed=%d\n", ret);
 		goto fail;
+	}
 
 	*coll_pte_ret = coll_pte;
 	return FI_SUCCESS;
@@ -2569,13 +2699,24 @@ static int _acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx,
 	return ret;
 }
 
+/* forward references for CURL operations */
+static void _create_mcast_addr(struct cxip_join_state *jstate);
+static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle);
+static void _curl_delete_mc_obj(struct cxip_coll_mc *mc_obj);
+static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle);
+
 /* Close multicast collective object */
-static void _close_mc(struct cxip_coll_mc *mc_obj)
+static void _close_mc(struct cxip_coll_mc *mc_obj, bool delete, bool has_error)
 {
 	int count;
 
 	if (!mc_obj)
 		return;
+	TRACE_JOIN("%s starting MC cleanup\n", __func__);
+
+	mc_obj->has_closed = true;
+	mc_obj->has_error = has_error;
+
 	/* clear the mcast_addr -> mc_obj reference*/
 	ofi_idm_clear(&mc_obj->ep_obj->coll.mcast_map, mc_obj->mcast_addr);
 	mc_obj->ep_obj->coll.is_hwroot = false;
@@ -2598,19 +2739,67 @@ static void _close_mc(struct cxip_coll_mc *mc_obj)
 		_close_pte(mc_obj->ep_obj->coll.coll_pte);
 		mc_obj->ep_obj->coll.coll_pte = NULL;
 	}
-	free(mc_obj);
+	/* node index zero (rank zero) deletes the multicast address */
+	if (delete && mc_obj->is_multicast && !mc_obj->mynode_idx) {
+		struct timespec expires = {
+			cxip_env.coll_fm_timeout_msec/1000,
+			(cxip_env.coll_fm_timeout_msec%1000)*1000000};
+
+		if (!mc_obj->has_error)
+			mc_obj->close_state = -FI_EAGAIN;
+
+		_tsset(&mc_obj->curlexpires, &expires);
+		_curl_delete_mc_obj(mc_obj);
+	} else {
+		if (mc_obj->has_error) {
+			free(mc_obj);
+		} else {
+			mc_obj->close_state = FI_SUCCESS;
+		}
+	}
 }
 
+/* The user can close an individual collective MC address, and must do so on
+ * all endpoints in the collective group, just as fi_join_collective() must
+ * be called on all endpoints in the group.
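+ *
+ * A minimal usage sketch (the ep, av_set, and EQ wait loop are assumed to
+ * exist), run identically on every endpoint in the group:
+ *
+ *   struct fid_mc *mc;
+ *
+ *   fi_join_collective(ep, FI_ADDR_NOTAVAIL, av_set, 0, &mc, NULL);
+ *   // ... wait for FI_JOIN_COMPLETE on the EQ, run reductions ...
+ *   fi_close(&mc->fid);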
+ */
 static int _fi_close_mc(struct fid *fid)
 {
 	struct cxip_coll_mc *mc_obj;
+	int ret = FI_SUCCESS;
 
+	TRACE_JOIN("%s: closing MC\n", __func__);
 	mc_obj = container_of(fid, struct cxip_coll_mc, mc_fid.fid);
-	_close_mc(mc_obj);
-	return FI_SUCCESS;
+	if (!mc_obj) {
+		TRACE_JOIN("%s: MC object is null\n", __func__);
+		return ret;
+	} else if (mc_obj->has_closed) {
+		TRACE_JOIN("%s: close already called before\n", __func__);
+		return ret;
+	} else if (mc_obj->has_error) {
+		TRACE_JOIN("%s: encountered an error earlier\n", __func__);
+		return ret;
+	}
+
+	_close_mc(mc_obj, true, false);
+	while (mc_obj && (ret = mc_obj->close_state) == -FI_EAGAIN) {
+		ret = cxip_curl_progress(NULL);
+		if (ret == -FI_EAGAIN) {
+			usleep(10);
+			continue;
+		}
+		if (ret < 0 && ret != -FI_ENODATA) {
+			TRACE_JOIN("%s: Curl progress failed, error=%d\n", __func__, ret);
+			break;
+		}
+		usleep(10);
+	}
+	free(mc_obj);
+
+	return ret;
 }
 
-/* multicast object operational functions */
+/* multicast object libfabric functions */
 static struct fi_ops mc_ops = {
 	.size = sizeof(struct fi_ops),
 	.close = _fi_close_mc,
@@ -2678,9 +2867,10 @@ static int _initialize_mc(void *ptr)
 	if (!mc_obj)
 		return -FI_ENOMEM;
 
-	TRACE_DEBUG("acquiring PTE\n");
+	TRACE_JOIN("acquiring PTE\n");
 	if (jstate->is_rank) {
 		// NETSIM
+		TRACE_JOIN("acquiring PTE NETSIM\n");
 		// pid_idx = simulated collective rank
 		pid_idx = CXIP_PTL_IDX_COLL + jstate->simrank;
 		ret = _acquire_pte(ep_obj, pid_idx, false, &coll_pte);
@@ -2689,11 +2879,13 @@
 	} else if (!jstate->is_mcast) {
 		// UNICAST
 		// pid_idx = simulated collective tree
+		TRACE_JOIN("acquiring PTE UNICAST\n");
 		pid_idx = CXIP_PTL_IDX_COLL;
 		ret = _acquire_pte(ep_obj, pid_idx, false, &coll_pte);
 	} else {
 		// MULTICAST
 		// pid_idx = bit-shifted multicast address
+		TRACE_JOIN("acquiring PTE MULTICAST\n");
 		memset(&pid_mcast, 0, sizeof(pid_mcast));
 		pid_mcast.mcast_id = jstate->bcast_data.mcast_addr;
 		pid_mcast.mcast_pte_index = 0;
@@ -2829,6 +3021,16 @@ static int _initialize_mc(void *ptr)
 	/* Last field to set */
 	mc_obj->is_joined = true;
 
+	/* Prepare static metrics for this endpoint */
+	_coll_metrics.ep_data.myrank = mc_obj->mynode_idx;
+	_coll_metrics.ep_data.isroot =
+		mc_obj->hwroot_idx == mc_obj->mynode_idx;
+
+	/* Initially set close states to success */
+	mc_obj->close_state = FI_SUCCESS;
+	mc_obj->has_closed = false;
+	mc_obj->has_error = false;
+
 	/* Return information to the caller */
 	jstate->mc_obj = mc_obj;
 	*jstate->mc = &mc_obj->mc_fid;
@@ -2838,146 +3040,223 @@
 	return FI_SUCCESS;
 
 fail:
-	_close_mc(mc_obj);
+	jstate->prov_errno = FI_CXI_ERRNO_JOIN_FAIL_PTE;
+	_close_mc(mc_obj, true, true);
 	return ret;
 }
 
 /**
- * CURL callback function upon completion of a request.
+ * CURL MODEL
+ *
+ * void _cxip_action(void *object);
+ * void _cxip_action_cb(struct cxip_curl_handle *handle);
+ *
+ * The action object must persist until the action has reached a conclusion,
+ * which may involve multiple CURL requests, particularly retries on busy
+ * responses. It must retain state for multiple retries of the action if the
+ * CURL response indicates a retry is needed. This is the cxip_join_state
+ * object for multicast creation, and the mc_obj object for multicast deletion.
+ *
+ * The curl_usrptr object is allocated for each CURL request, and deleted after
+ * the response has been evaluated. The response may be a retry of the same
+ * CURL request, or it may be some other recovery or completion operation.
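+ *
+ * A sketch of the callback shape this model implies (hypothetical names):
+ *
+ *   static void _cxip_action_cb(struct cxip_curl_handle *handle)
+ *   {
+ *       struct action_usrptr *u = handle->usrptr;
+ *
+ *       if (handle->status == 409 && !expired(&u->object->deadline))
+ *           _cxip_action(u->object);   // re-issue the same request
+ *       free(u);                       // per-request state, always freed
+ *   }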
+ *
+ * This simplifies retries and adaptive responses to the CURL result. The
+ * callback function runs as an agent of the CURL processing, using the
+ * curl_usrptr object, and can assume that the CURL implementation (cxip_curl.c)
+ * will do all CURL memory cleanup, regardless of success or failure. This means
+ * that the callback can simply re-issue the same command as if for the first
+ * time to perform a retry on any kind of busy error.
+ *
+ * To prevent endless retries, the elapsed time must be recorded in the
+ * action object (so that it will persist across multiple CURL operations).
+ */
+
+/**
+ * Perform a CURL request to delete a multicast address.
  *
- * This sets jstate->finished_mcast, even if the operation fails.
- * This sets jstate->bcast_data.valid if the address is valid.
+ * This is the last thing done after closing down the mc_object in libfabric, so
+ * all that remains is to remove the actual multicast in the FM and delete
+ * allocated memory for mc_obj. If the CURL operation cannot complete
+ * successfully, the multicast delete will occur at the end of the job.
  */
-static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle)
+static void _curl_delete_mc_obj(struct cxip_coll_mc *mc_obj)
 {
-	struct cxip_curl_mcast_usrptr *curl_usrptr = handle->usrptr;
-	struct cxip_join_state *jstate = curl_usrptr->jstate;
+	struct cxip_curl_mcast_delete_usrptr *curl_usrptr;
+	char *url;
+	int ret;
+
+	/* early exit will attempt to free these */
+	curl_usrptr = NULL;
+	url = NULL;
+
+	TRACE_JOIN("deleting multicast address via REST\n");
+	ret = asprintf(&url, "%s/%d", cxip_env.coll_fabric_mgr_url,
		       mc_obj->mcast_addr);
+	if (ret < 0) {
+		TRACE_JOIN("Failed to construct CURL address\n");
+		goto quit;
+	}
+	/* create the return pointer */
+	curl_usrptr = calloc(1, sizeof(*curl_usrptr));
+	if (!curl_usrptr) {
+		TRACE_JOIN("curl_usrptr calloc() error\n");
+		ret = -FI_ENOMEM;
+		goto quit;
+	}
+	curl_usrptr->mc_obj = mc_obj;
+	ret = cxip_curl_perform(url, NULL, cxip_env.coll_mcast_token, 0,
				CURL_DELETE, false, _cxip_delete_mcast_cb,
				curl_usrptr);
+	if (ret < 0) {
+		TRACE_JOIN("CURL delete mcast %d dispatch failed %d\n",
			   mc_obj->mcast_addr, ret);
+		goto quit;
+	}
+	TRACE_JOIN("CURL delete mcast %d dispatch successful\n",
		   mc_obj->mcast_addr);
+quit:
+	free(url);
+	if (ret < 0) {
+		TRACE_JOIN("CURL delete mcast %d failed\n",
			   mc_obj->mcast_addr);
+		free(curl_usrptr);
+		if (mc_obj->has_error) {
+			free(mc_obj);
+		} else {
+			mc_obj->close_state = ret;
+		}
+	}
+}
+
+static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle)
+{
+	struct cxip_curl_mcast_delete_usrptr *curl_usrptr = handle->usrptr;
+	struct cxip_coll_mc *mc_obj = curl_usrptr->mc_obj;
 	struct json_object *json_obj;
-	struct cxip_addr caddr;
-	const char *hwrootstr;
-	int mcaddr, hwroot;
-	uint32_t octet[6], n;
-	int i, ret;
+	const char *errmsg = "";
 
-	/* Creation process is done */
-	TRACE_CURL("CURL COMPLETED!\n");
-	jstate->finished_mcast = true;
+	/* note: allocates space for strings, free at end */
+	json_obj = json_tokener_parse(handle->response);
+	if (json_obj) {
+		if (cxip_json_string("message", json_obj, &errmsg))
+			errmsg = "";
+	} else {
+		TRACE_JOIN("callback: malformed server response: '%s'\n",
			   handle->response);
+	}
 
 	switch (handle->status) {
 	case 200:
 	case 201:
-		/* CURL succeeded, parse response */
-		TRACE_CURL("CURL PARSE RESPONSE:\n%s\n", handle->response);
-		if (!(json_obj = json_tokener_parse(handle->response)))
-			break;
-		if (cxip_json_int("mcastID", json_obj, &mcaddr))
-			break;
-
if (cxip_json_string("hwRoot", json_obj, &hwrootstr)) - break; - - memset(octet, 0, sizeof(octet)); - hwroot = 0; - n = sscanf(hwrootstr, "%x:%x:%x:%x:%x:%x", - &octet[5], &octet[4], &octet[3], - &octet[2], &octet[1], &octet[0]); - if (n < 3) { - TRACE_CURL("bad hwroot address = %s\n", hwrootstr); - break; - } - for (i = 0; i < n; i++) - hwroot |= octet[i] << (8*i); - - TRACE_CURL("mcastID=%d hwRoot='%s'=%x\n", mcaddr, hwrootstr, - hwroot); - for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) { - ret = cxip_av_lookup_addr( - jstate->av_set_obj->cxi_av, - jstate->av_set_obj->fi_addr_ary[i], - &caddr); - if (ret < 0) - continue; - TRACE_JOIN("test %d == %d\n", hwroot, caddr.nic); - if (hwroot == caddr.nic) - break; + TRACE_JOIN("callback: %ld SUCCESS MCAST DELETED\n", + handle->status); + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = FI_SUCCESS; } - TRACE_CURL("final index=%d\n", i); - if (i >= jstate->av_set_obj->fi_addr_cnt) { - TRACE_CURL("multicast HWroot not found in av_set\n"); - jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID; + break; + case 409: + TRACE_JOIN("callback: delete mcast failed: %ld '%s'\n", + handle->status, errmsg); + + if (_tsexp(&mc_obj->curlexpires)) { + TRACE_JOIN("callback: FM expired\n"); + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = FI_CXI_ERRNO_JOIN_CURL_TIMEOUT; + } break; } - /* Production MCAST address */ - jstate->bcast_data.valid = true; - jstate->bcast_data.hwroot_idx = i; - jstate->bcast_data.mcast_addr = (uint32_t)mcaddr; - jstate->is_mcast = true; - /* This succeeded */ - TRACE_CURL("curl: mcaddr =%08x\n", - jstate->bcast_data.mcast_addr); - TRACE_CURL("curl: hwrootidx=%d\n", - jstate->bcast_data.hwroot_idx); + /* try again */ + _curl_delete_mc_obj(mc_obj); break; default: - TRACE_CURL("ERRMSK SET CURL error %ld!\n", handle->status); - if (handle->response) - TRACE_CURL("ERROR RESPONSE:\n%s\n", handle->response); - // TODO finer error differentiation from CURL errors - jstate->prov_errno = CXIP_PROV_ERRNO_CURL; + TRACE_JOIN("callback: %ld unknown status\n", handle->status); + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = FI_CXI_ERRNO_JOIN_CURL_FAILED; + } break; } + /* free json memory */ + json_object_put(json_obj); free(curl_usrptr); - TRACE_CURL("CURL COMPLETED!\n"); - jstate->finished_mcast = true; } /** - * Start a CURL request for a multicast address. + * Perform a CURL request to create a new multicast address. 
*/ -static void _start_curl(void *ptr) +static void _create_mcast_addr(struct cxip_join_state *jstate) { - struct cxip_curl_mcast_usrptr *curl_usrptr; - struct cxip_join_state *jstate = ptr; + struct cxip_curl_mcast_create_usrptr *curl_usrptr; struct cxip_addr caddr; - char *jsonreq, *mac, *url, *p; + char *jsonreq, *mac, *url, *tok, *p; int i, ret; - /* early exit will attempt to free these */ + /* all exit paths attempt to free these */ curl_usrptr = NULL; jsonreq = NULL; mac = NULL; url = NULL; - - /* acquire the environment variables needed */ - TRACE_CURL("jobid = %s\n", cxip_env.coll_job_id); - TRACE_CURL("stepid = %s\n", cxip_env.coll_job_step_id); - TRACE_CURL("fmurl = %s\n", cxip_env.coll_fabric_mgr_url); - TRACE_CURL("token = %s\n", cxip_env.coll_mcast_token); - TRACE_CURL("maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); - TRACE_CURL("minnodes= %ld\n", cxip_env.hwcoll_min_nodes); - TRACE_CURL("retry = %ld\n", cxip_env.coll_retry_usec); - TRACE_CURL("tmout = %ld\n", cxip_env.coll_timeout_usec); + tok = NULL; + + /* check the environment variables needed */ + TRACE_JOIN("ENV jobid = %s\n", cxip_env.coll_job_id); + TRACE_JOIN("ENV stepid = %s\n", cxip_env.coll_job_step_id); + TRACE_JOIN("ENV fmurl = %s\n", cxip_env.coll_fabric_mgr_url); + TRACE_JOIN("ENV token = %s\n", cxip_env.coll_mcast_token); + TRACE_JOIN("ENV maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); + TRACE_JOIN("ENV minnodes= %ld\n", cxip_env.hwcoll_min_nodes); + TRACE_JOIN("ENV retry = %ld\n", cxip_env.coll_retry_usec); + TRACE_JOIN("ENV tmout = %ld\n", cxip_env.coll_timeout_usec); + TRACE_JOIN("ENV fmtmout = %ld\n", cxip_env.coll_fm_timeout_msec); /* Generic error for any preliminary failures */ - jstate->prov_errno = CXIP_PROV_ERRNO_CURL; - if (!cxip_env.coll_job_id || - !cxip_env.coll_fabric_mgr_url || - !cxip_env.coll_mcast_token) { - TRACE_JOIN("Check environment variables\n"); + ret = 0; + if (!cxip_env.coll_job_id) { + TRACE_JOIN("missing job id\n"); + ret = -FI_EINVAL; + } + if (!cxip_env.coll_fabric_mgr_url) { + TRACE_JOIN("missing FM url\n"); + ret = -FI_EINVAL; + } + if (!cxip_env.coll_mcast_token) { + TRACE_JOIN("missing FM token\n"); ret = -FI_EINVAL; - goto quit; } + if (ret < 0) + goto quit; - ret = asprintf(&url, "%s", cxip_env.coll_fabric_mgr_url); + if (cxip_trap_search(0, CXIP_TRAP_CURL_FM_URL, NULL, NULL)) + ret = asprintf(&url, "%s-bad", cxip_env.coll_fabric_mgr_url); + else + ret = asprintf(&url, "%s", cxip_env.coll_fabric_mgr_url); if (ret < 0) { - TRACE_JOIN("Failed to construct CURL address\n"); + TRACE_JOIN("failed to construct CURL address\n"); + ret = -FI_ENOMEM; + goto quit; + } + TRACE_JOIN("final fmurl = %s\n", url); + if (cxip_trap_search(0, CXIP_TRAP_CURL_TOKEN, NULL, NULL)) + ret = asprintf(&tok, "%s-bad", cxip_env.coll_mcast_token); + else + ret = asprintf(&tok, "%s", cxip_env.coll_mcast_token); + if (ret < 0) { + TRACE_JOIN("failed to construct CURL token\n"); ret = -FI_ENOMEM; goto quit; } + TRACE_JOIN("final token = %s\n", tok); /* five hex digits per mac, two colons, two quotes, comma */ p = mac = malloc(10*jstate->av_set_obj->fi_addr_cnt + 1); if (!mac) { - TRACE_JOIN("Failed to allocate mac list\n"); + TRACE_JOIN("failed to allocate mac list\n"); ret = -FI_ENOMEM; goto quit; } @@ -3008,38 +3287,195 @@ static void _start_curl(void *ptr) cxip_env.coll_job_id, cxip_env.coll_job_step_id); if (ret < 0) { - TRACE_JOIN("Creating JSON request = %d\n", ret); + TRACE_JOIN("failed to create jsonreq= %d\n", ret); ret = -FI_ENOMEM; goto quit; } single_to_double_quote(jsonreq); - 
TRACE_JOIN("JSON = %s\n", jsonreq); - /* create the mcast address */ + /* create the user return pointer */ curl_usrptr = calloc(1, sizeof(*curl_usrptr)); if (!curl_usrptr) { - TRACE_JOIN("curl_usrptr calloc() error\n"); + TRACE_JOIN("failed to calloc() curl_usrptr\n"); ret = -FI_ENOMEM; goto quit; } /* dispatch CURL request */ curl_usrptr->jstate = jstate; - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_CURLSND, &ret)) + ret = cxip_curl_perform(url, jsonreq, tok, 0, CURL_POST, false, + _cxip_create_mcast_cb, curl_usrptr); + if (ret < 0) { + TRACE_JOIN("CURL create mcast dispatch failed %d\n", ret); goto quit; - ret = cxip_curl_perform(url, jsonreq, cxip_env.coll_mcast_token, 0, - CURL_POST, false, _cxip_create_mcast_cb, - curl_usrptr); + } + TRACE_JOIN("CURL create mcast dispatch successful\n"); quit: + free(tok); free(url); free(mac); free(jsonreq); if (ret < 0) { - TRACE_JOIN("CURL execution failed\n"); + TRACE_JOIN("CURL create mcast failed\n"); free(curl_usrptr); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_CURL_FAILED; jstate->finished_mcast = true; } } +static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle) +{ + struct cxip_curl_mcast_create_usrptr *curl_usrptr = handle->usrptr; + struct cxip_join_state *jstate = curl_usrptr->jstate; + struct json_object *json_obj; + struct cxip_addr caddr; + const char *hwrootstr = ""; + const char *message = ""; + const char *cptr; + int mcaddr = -1; + int hwroot = -1; + int curl_errcode = 0; + uint32_t octet[6], n; + int i, ret; + + /* note: allocates space for strings, free at end */ + json_obj = json_tokener_parse(handle->response); + if (json_obj) { + if (cxip_json_string("message", json_obj, &message)) + message = ""; + if (cxip_json_string("hwRoot", json_obj, &hwrootstr)) + hwrootstr = ""; + if (cxip_json_int("mcastID", json_obj, &mcaddr)) + mcaddr = -1; + } else { + TRACE_JOIN("callback: malformed server response: '%s'\n", + handle->response); + } + TRACE_JOIN("%s status =%ld\n", __func__, handle->status); + TRACE_JOIN("%s response ='%s'\n", __func__, handle->response); + TRACE_JOIN("%s message ='%s'\n", __func__, message); + TRACE_JOIN("%s hwrootstr='%s'\n", __func__, hwrootstr); + TRACE_JOIN("%s mcaddr ='%d'\n", __func__, mcaddr); + + /* Process result */ + switch (handle->status) { + case 200: + case 201: + if (mcaddr < 0 || mcaddr >= 8192) { + TRACE_JOIN("callback: mcaddr=%d is invalid\n", mcaddr); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INVALID; + jstate->finished_mcast = true; + break; + } + memset(octet, 0, sizeof(octet)); + hwroot = 0; + n = 0; + if (hwrootstr) + n = sscanf(hwrootstr, "%x:%x:%x:%x:%x:%x", + &octet[5], &octet[4], &octet[3], + &octet[2], &octet[1], &octet[0]); + if (n < 3) { + TRACE_JOIN("callback: hwroot '%s' too few octets\n", + hwrootstr); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INVALID; + jstate->finished_mcast = true; + break; + } + for (i = 0; i < n; i++) + hwroot |= octet[i] << (8*i); + + for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) { + ret = cxip_av_lookup_addr( + jstate->av_set_obj->cxi_av, + jstate->av_set_obj->fi_addr_ary[i], + &caddr); + if (ret < 0) + continue; + if (hwroot == caddr.nic) + break; + } + if (i >= jstate->av_set_obj->fi_addr_cnt) { + TRACE_JOIN("callback: hwroot rank invalid\n"); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INVALID; + jstate->finished_mcast = true; + break; + } + /* Production MCAST address */ + jstate->bcast_data.valid = true; + jstate->bcast_data.hwroot_idx = i; + jstate->bcast_data.mcast_addr = (uint32_t)mcaddr; + jstate->is_mcast = 
true;
+		/* This succeeded */
+		TRACE_JOIN("callback: SUCCESS mcaddr=%d hwroot=%d\n",
			   jstate->bcast_data.mcast_addr,
			   jstate->bcast_data.hwroot_idx);
+		jstate->prov_errno = 0;
+		jstate->finished_mcast = true;
+		break;
+	case 400:
+		TRACE_JOIN("callback: create mcast failed: %ld '%s'\n",
			   handle->status, message ? message : "");
+		jstate->prov_errno = FI_CXI_ERRNO_JOIN_SERVER_ERR;
+		jstate->finished_mcast = true;
+		break;
+	case 409:
+		TRACE_JOIN("callback: create mcast failed: %ld '%s'\n",
			   handle->status, message);
+
+		if (_tsexp(&jstate->curlexpires)) {
+			TRACE_JOIN("callback: FM expired\n");
+			jstate->prov_errno = FI_CXI_ERRNO_JOIN_CURL_TIMEOUT;
+			jstate->finished_mcast = true;
+			break;
+		}
+		/* retry */
+		_create_mcast_addr(jstate);
+		break;
+	case 507:
+		/* find and parse error instance number */
+		cptr = message;
+		curl_errcode = 0;
+		while (*cptr && *cptr != ':')
+			cptr++;
+		if (*cptr == ':' && cptr - message >= 2) {
+			cptr -= 2;
+			sscanf(cptr, "%02d:", &curl_errcode);
+			TRACE_JOIN("error code = %d\n", curl_errcode);
+		}
+		switch (curl_errcode) {
+		case 1:
+			TRACE_JOIN("failed: no mcast, exceeded job limit\n");
+			jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INUSE;
+			break;
+		case 2:
+			TRACE_JOIN("failed: no mcast, no addresses left\n");
+			jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INUSE;
+			break;
+		case 3:
+			TRACE_JOIN("failed: no hwroot available in group\n");
+			jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INUSE;
+			break;
+		default:
+			TRACE_JOIN("failed: errcode=%d\n", curl_errcode);
+			jstate->prov_errno = FI_CXI_ERRNO_JOIN_SERVER_ERR;
+			break;
+		}
+		jstate->finished_mcast = true;
+		break;
+	default:
+		TRACE_JOIN("callback: unhandled CURL error %ld '%s'\n",
			   handle->status, message ? message : "");
+		jstate->prov_errno = FI_CXI_ERRNO_JOIN_SERVER_ERR;
+		jstate->finished_mcast = true;
+		break;
+	}
+	TRACE_JOIN("jstate->prov_errno = %d\n", jstate->prov_errno);
+	/* free json memory */
+	json_object_put(json_obj);
+	free(curl_usrptr);
+}
+
+
 /****************************************************************************
  * State machine for performing fi_join_collective()
  *
@@ -3115,7 +3551,8 @@ static void _start_getgroup(void *ptr)
 
 	TRACE_JOIN("%s on %d: entry\n", __func__, jstate->mynode_idx);
 
-	if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_GETGRP, &zb->error))
+	if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_GETGRP, &zb->error,
			     &jstate->prov_errno))
 		goto quit;
 	/* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */
 	zb->error = cxip_zbcoll_getgroup(zb);
@@ -3137,6 +3574,10 @@ static void _finish_getgroup(void *ptr)
 
 /* Create a multicast address and broadcast it to all endpoints.
  * If jstate->create_mcast is set, this will use CURL to get an address.
  * Otherwise, this presumes static initialization, and sets bcast_data.valid.
+ *
+ * Caution: re-entrant routine.
+ * This routine is called repeatedly by rank 0, returning -FI_EAGAIN to drive
+ * the CURL state. See the branch to 'quit' below.
*/ static void _start_bcast(void *ptr) { @@ -3146,8 +3587,6 @@ static void _start_bcast(void *ptr) if (!suppress_busy_log) TRACE_JOIN("%s: entry\n", __func__); - /* error will indicate that the multicast request fails */ - jstate->prov_errno = C_RC_INVALID_DFA_FORMAT; /* rank 0 always does the work here */ if (jstate->mynode_idx == 0) { if (!suppress_busy_log) @@ -3155,9 +3594,16 @@ static void _start_bcast(void *ptr) if (jstate->create_mcast) { /* first call (only) initiates CURL request */ if (!jstate->creating_mcast) { + struct timespec expires = { + cxip_env.coll_fm_timeout_msec/1000, + (cxip_env.coll_fm_timeout_msec%1000)*1000000}; + TRACE_JOIN("%s create mcast\n", __func__); jstate->creating_mcast = true; - _start_curl(jstate); + + _tsset(&jstate->curlexpires, &expires); + _create_mcast_addr(jstate); + TRACE_JOIN("%s create mcast initiated\n", __func__); } /* every retry call checks to see if CURL is complete */ if (!jstate->finished_mcast) { @@ -3165,16 +3611,17 @@ static void _start_bcast(void *ptr) suppress_busy_log++; goto quit; } + TRACE_JOIN("%s create mcast completed\n", __func__); suppress_busy_log = 0; /* bcast_data.valid is set by curl callback */ } else { /* static bcast data is presumed correct */ + TRACE_JOIN("%s static multicast accepted\n", __func__); jstate->bcast_data.valid = true; } } - /* speculative prov_errno for trap */ - jstate->prov_errno = CXIP_PROV_ERRNO_CURL; - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_BCAST, &zb->error)) + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_BCAST, &zb->error, + &jstate->prov_errno)) goto quit; /* rank > 0 endpoints overwritten by rank = 0 data */ /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ @@ -3192,10 +3639,13 @@ static void _finish_bcast(void *ptr) bool is_hwroot; int ret; - TRACE_JOIN("%s: mc addr=%d hw_root=%d valid=%d\n", __func__, + TRACE_JOIN("%s: mc addr=%d hw_root=%d valid=%d\n", + __func__, jstate->bcast_data.mcast_addr, jstate->bcast_data.hwroot_idx, jstate->bcast_data.valid); + TRACE_JOIN("%s: jstate->prov_errno %d\n", __func__, + jstate->prov_errno); /* all NICs now have same mc_addr data, if invalid, fail */ /* jstate->prov_errno is presumed set if not valid */ if (!jstate->bcast_data.valid) @@ -3207,7 +3657,7 @@ static void _finish_bcast(void *ptr) if (jstate->bcast_data.hwroot_idx >= jstate->av_set_obj->fi_addr_cnt) { TRACE_JOIN("%s: reject invalid hwroot_idx\n", __func__); - jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID; + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INVALID; ret = -FI_EINVAL; goto quit; } @@ -3216,7 +3666,7 @@ static void _finish_bcast(void *ptr) is_hwroot = (jstate->bcast_data.hwroot_idx == jstate->mynode_idx); if (is_hwroot && jstate->ep_obj->coll.is_hwroot) { TRACE_JOIN("%s: reject join, hwroot in use\n", __func__); - jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INUSE; + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INUSE; ret = -FI_EINVAL; goto quit; @@ -3228,15 +3678,16 @@ static void _finish_bcast(void *ptr) jstate->bcast_data.mcast_addr)) { TRACE_JOIN("%s: reject join, mcast %d in use\n", __func__, jstate->bcast_data.mcast_addr); - jstate->prov_errno = CXIP_PROV_ERRNO_MCAST_INUSE; + jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INUSE; ret = -FI_EINVAL; goto quit; } - /* speculative prov_errno for trap */ - jstate->prov_errno = CXIP_PROV_ERRNO_PTE; - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_INITPTE, &ret)) + jstate->prov_errno = 0; + + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_INITPTE, &ret, + &jstate->prov_errno)) goto quit; - 
TRACE_JOIN("%s: continuing to configure\n", __func__); + /* all endpoints initialize with same mcast addr and hwroot */ ret = _initialize_mc(jstate); quit: /* if initialization fails, invalidate bcast_data */ @@ -3253,11 +3704,13 @@ static void _start_reduce(void *ptr) struct cxip_join_state *jstate = ptr; struct cxip_zbcoll_obj *zb = jstate->zb; - /* reduce ANDs inverted bcast_data, if any invalid, all become invalid */ - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_REDUCE, &zb->error)) + /* Create an error bitmask from the prov_errno */ + _proverr_to_bits(jstate); + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_REDUCE, &zb->error, + &jstate->prov_errno)) goto quit; /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ - zb->error = cxip_zbcoll_reduce(zb, &jstate->bcast_data.uint64); + zb->error = cxip_zbcoll_reduce(zb, &jstate->reduce_err.uint64); quit: if (zb->error) _append_sched(zb, jstate); @@ -3293,10 +3746,10 @@ static void _start_cleanup(void *ptr) &jstate->mc_obj->mc_fid.fid : NULL; entry.context = jstate->context; - if (jstate->prov_errno != CXIP_PROV_ERRNO_OK) { + if (jstate->prov_errno >= FI_CXI_ERRNO_JOIN_FIRST) { size = sizeof(struct fi_eq_err_entry); entry.data = FI_JOIN_COMPLETE; - entry.err = -FI_EAVAIL; + entry.err = -FI_ECONNREFUSED; entry.prov_errno = jstate->prov_errno; flags |= UTIL_FLAG_ERROR; } @@ -3756,6 +4209,18 @@ void cxip_coll_reset_mc_ctrs(struct fid_mc *mc) ofi_atomic_set32(&mc_obj->tmout_cnt, 0); } +void cxip_coll_get_mc_ctrs(struct fid_mc *mc, struct coll_counters *counters) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + + counters->coll_recv_cnt = ofi_atomic_get32(&mc_obj->coll_pte->recv_cnt); + counters->send_cnt = ofi_atomic_get32(&mc_obj->send_cnt); + counters->recv_cnt = ofi_atomic_get32(&mc_obj->recv_cnt); + counters->pkt_cnt = ofi_atomic_get32(&mc_obj->pkt_cnt); + counters->seq_err_cnt = ofi_atomic_get32(&mc_obj->seq_err_cnt); + counters->tmout_cnt = ofi_atomic_get32(&mc_obj->tmout_cnt); +} + /**************************************************************************** * Manage the static coll structure in the EP. Because of its specialized * nature, it made sense to manage it here, rather than in the EP module. @@ -3788,7 +4253,16 @@ struct fi_ops_collective cxip_collective_no_ops = { .msg = fi_coll_no_msg, }; -/* Close collectives - call during EP close, ep_obj->lock is held */ +/* Close collectives - called during EP close, ep_obj->lock is held. + * This does not issue CURL requests to delete multicast addresses. + * + * This is called as part of an endpoint shutdown, which is part of an + * application shutdown, and the SLURM cleanup handler will destroy all + * multicast addresses with an efficient method that deletes all per-job + * addresses. The concern is that if there is a large count of multicast + * addresses, deleting them individually in this code will create a delay, + * and could clog the REST API. 
+ */ void cxip_coll_close(struct cxip_ep_obj *ep_obj) { struct cxip_coll_mc *mc_obj; @@ -3796,7 +4270,7 @@ void cxip_coll_close(struct cxip_ep_obj *ep_obj) while (!dlist_empty(&ep_obj->coll.mc_list)) { dlist_pop_front(&ep_obj->coll.mc_list, struct cxip_coll_mc, mc_obj, entry); - _close_mc(mc_obj); + _close_mc(mc_obj, false, true); } } @@ -3867,6 +4341,7 @@ int cxip_coll_enable(struct cxip_ep *ep) ep->ep.collective = &cxip_collective_ops; ep_obj->coll.enabled = true; + cxip_coll_init_metrics(); cxip_coll_trace_init(); return FI_SUCCESS; } diff --git a/prov/cxi/src/cxip_coll_trace.c b/prov/cxi/src/cxip_coll_trace.c index 276fa83498e..05bb10a2630 100644 --- a/prov/cxi/src/cxip_coll_trace.c +++ b/prov/cxi/src/cxip_coll_trace.c @@ -1,6 +1,7 @@ /* - * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP */ /** @@ -106,7 +107,9 @@ char *cxip_coll_trace_pathname; FILE *cxip_coll_trace_fid; uint64_t cxip_coll_trace_mask; -/* Get environment variable as string representation of int */ +/* Get environment variable as string representation of int + * Return -1 if undefined, or not-a-number. + */ static int getenv_int(const char *name) { char *env; @@ -119,6 +122,21 @@ static int getenv_int(const char *name) return value; } +/* Get environment variable + * Return 0 if undefined, or defined as zero. + */ +static int getenv_is_set(const char *name) +{ + char *env; + + env = getenv(name); + if (!env) + return 0; + if (strcmp(env, "0") == 0) + return 0; + return 1; +} + void cxip_coll_trace_init(void) { const char *fpath; @@ -139,19 +157,19 @@ void cxip_coll_trace_init(void) fpath = getenv("CXIP_TRC_PATHNAME"); /* set bits in cxip_coll_trace_mask */ - if (getenv("CXIP_TRC_CTRL")) + if (getenv_is_set("CXIP_TRC_CTRL")) cxip_coll_trace_set(CXIP_TRC_CTRL); - if (getenv("CXIP_TRC_ZBCOLL")) + if (getenv_is_set("CXIP_TRC_ZBCOLL")) cxip_coll_trace_set(CXIP_TRC_ZBCOLL); - if (getenv("CXIP_TRC_COLL_CURL")) + if (getenv_is_set("CXIP_TRC_COLL_CURL")) cxip_coll_trace_set(CXIP_TRC_COLL_CURL); - if (getenv("CXIP_TRC_COLL_PKT")) + if (getenv_is_set("CXIP_TRC_COLL_PKT")) cxip_coll_trace_set(CXIP_TRC_COLL_PKT); - if (getenv("CXIP_TRC_COLL_JOIN")) + if (getenv_is_set("CXIP_TRC_COLL_JOIN")) cxip_coll_trace_set(CXIP_TRC_COLL_JOIN); - if (getenv("CXIP_TRC_COLL_DEBUG")) + if (getenv_is_set("CXIP_TRC_COLL_DEBUG")) cxip_coll_trace_set(CXIP_TRC_COLL_DEBUG); - if (getenv("CXIP_TRC_TEST_CODE")) + if (getenv_is_set("CXIP_TRC_TEST_CODE")) cxip_coll_trace_set(CXIP_TRC_TEST_CODE); /* if no trace masks set, do nothing */ diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c index 675d91eeb56..a4613c66149 100644 --- a/prov/cxi/src/cxip_cq.c +++ b/prov/cxi/src/cxip_cq.c @@ -34,9 +34,9 @@ int cxip_cq_req_complete(struct cxip_req *req) return FI_SUCCESS; } - return ofi_cq_write(&req->cq->util_cq, (void *)req->context, - req->flags, req->data_len, (void *)req->buf, - req->data, req->tag); + return ofi_peer_cq_write(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag, FI_ADDR_NOTAVAIL); } /* @@ -50,9 +50,9 @@ int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src) return FI_SUCCESS; } - return ofi_cq_write_src(&req->cq->util_cq, (void *)req->context, - req->flags, req->data_len, (void *)req->buf, - req->data, req->tag, src); + return ofi_peer_cq_write(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void 
*)req->buf,
+				 req->data, req->tag, src);
 }
 
 /*
@@ -94,7 +94,7 @@ int cxip_cq_req_error(struct cxip_req *req, size_t olen,
 	err_entry.buf = (void *)(uintptr_t)req->buf;
 	err_entry.src_addr = src_addr;
 
-	return ofi_cq_write_error(&req->cq->util_cq, &err_entry);
+	return ofi_peer_cq_write_error(&req->cq->util_cq, &err_entry);
 }
 
 /*
@@ -125,64 +125,92 @@ void cxip_util_cq_progress(struct util_cq *util_cq)
 	ofi_genlock_unlock(&cq->ep_list_lock);
 }
 
+/* common strerror function shared by the EQ and the CQ */
+const char *cxip_strerror(int prov_errno)
+{
+	/* both CXI driver errors and collective errors share this function */
+	if (prov_errno < FI_CXI_ERRNO_RED_FIRST)
+		return cxi_rc_to_str(prov_errno);
+
+	switch (prov_errno) {
+	/* EQ JOIN error codes */
+	case FI_CXI_ERRNO_JOIN_MCAST_INUSE:
+		return "coll join multicast address in-use";
+	case FI_CXI_ERRNO_JOIN_HWROOT_INUSE:
+		return "coll join hwroot in-use";
+	case FI_CXI_ERRNO_JOIN_MCAST_INVALID:
+		return "coll join multicast address invalid";
+	case FI_CXI_ERRNO_JOIN_HWROOT_INVALID:
+		return "coll join hwroot invalid";
+	case FI_CXI_ERRNO_JOIN_CURL_FAILED:
+		return "coll join FM REST CURL failed";
+	case FI_CXI_ERRNO_JOIN_CURL_TIMEOUT:
+		return "coll join FM REST CURL timed out";
+	case FI_CXI_ERRNO_JOIN_FAIL_PTE:
+		return "coll join PTE setup failed";
+	case FI_CXI_ERRNO_JOIN_OTHER:
+		return "coll join unknown error";
+
+	/* CQ REDUCE error codes */
+	case FI_CXI_ERRNO_RED_FLT_OVERFLOW:
+		return "coll reduce FLT overflow";
+	case FI_CXI_ERRNO_RED_FLT_INVALID:
+		return "coll reduce FLT invalid";
+	case FI_CXI_ERRNO_RED_INT_OVERFLOW:
+		return "coll reduce INT overflow";
+	case FI_CXI_ERRNO_RED_CONTR_OVERFLOW:
+		return "coll reduce contribution overflow";
+	case FI_CXI_ERRNO_RED_OP_MISMATCH:
+		return "coll reduce opcode mismatch";
+	case FI_CXI_ERRNO_RED_MC_FAILURE:
+		return "coll reduce multicast timeout";
+
+	/* Unknown error */
+	default:
+		return "coll unspecified error";
+	}
+}
+
 /*
  * cxip_cq_strerror() - Converts provider specific error information into a
  * printable string.
  */
 static const char *cxip_cq_strerror(struct fid_cq *cq, int prov_errno,
-				    const void *err_data, char *buf,
-				    size_t len)
+				    const void *err_data, char *buf, size_t len)
 {
-	switch (prov_errno) {
-	case CXIP_PROV_ERRNO_OK:
-		return "CXIP_COLL_OK";
-	case CXIP_PROV_ERRNO_PTE:
-		return "CXIP_COLL_PTE_ERROR";
-	case CXIP_PROV_ERRNO_MCAST_INUSE:
-		return "CXIP_COLL_MCAST_IN_USE";
-	case CXIP_PROV_ERRNO_HWROOT_INUSE:
-		return "CXIP_COLL_HWROOT_IN_USE";
-	case CXIP_PROV_ERRNO_MCAST_INVALID:
-		return "CXIP_COLL_MCAST_INVALID";
-	case CXIP_PROV_ERRNO_HWROOT_INVALID:
-		return "CXIP_COLL_HWROOT_INVALID";
-	case CXIP_PROV_ERRNO_CURL:
-		return "CXIP_COLL_CURL_ERROR";
-	}
-	return cxi_rc_to_str(prov_errno);
+	const char *errmsg = cxip_strerror(prov_errno);
+
+	/* ensure the copy is NUL-terminated even on truncation */
+	if (buf && len > 0) {
+		strncpy(buf, errmsg, len - 1);
+		buf[len - 1] = '\0';
+	}
+	return errmsg;
 }
 
-/*
- * cxip_cq_trywait - Return success if able to block waiting for CQ events.
- */ -static int cxip_cq_trywait(void *arg) +int cxip_cq_trywait(struct cxip_cq *cq) { - struct cxip_cq *cq = (struct cxip_cq *)arg; struct fid_list_entry *fid_entry; struct dlist_entry *item; + struct cxip_ep *ep; - assert(cq->util_cq.wait); - - if (!cq->priv_wait) { + if (cq->ep_fd < 0) { CXIP_WARN("No CXI wait object\n"); return -FI_EINVAL; } + ofi_genlock_lock(&cq->util_cq.cq_lock); + if (!ofi_cirque_isempty(cq->util_cq.cirq)) { + ofi_genlock_unlock(&cq->util_cq.cq_lock); + return -FI_EAGAIN; + } + ofi_genlock_unlock(&cq->util_cq.cq_lock); + ofi_genlock_lock(&cq->ep_list_lock); dlist_foreach(&cq->util_cq.ep_list, item) { fid_entry = container_of(item, struct fid_list_entry, entry); - if (cxip_ep_peek(fid_entry->fid)) { - ofi_genlock_unlock(&cq->ep_list_lock); + ep = container_of(fid_entry->fid, struct cxip_ep, ep.fid); - return -FI_EAGAIN; - } - } + if (!ep->ep_obj->priv_wait) + continue; - /* Clear wait, and check for any events */ - cxil_clear_wait_obj(cq->priv_wait); - dlist_foreach(&cq->util_cq.ep_list, item) { - fid_entry = container_of(item, struct fid_list_entry, entry); - if (cxip_ep_peek(fid_entry->fid)) { + if (cxip_ep_trywait(ep->ep_obj, cq)) { ofi_genlock_unlock(&cq->ep_list_lock); return -FI_EAGAIN; @@ -224,21 +252,12 @@ static int cxip_cq_close(struct fid *fid) { struct cxip_cq *cq = container_of(fid, struct cxip_cq, util_cq.cq_fid.fid); - int ret; if (ofi_atomic_get32(&cq->util_cq.ref)) return -FI_EBUSY; - if (cq->priv_wait) { - ret = ofi_wait_del_fd(cq->util_cq.wait, - cxil_get_wait_obj_fd(cq->priv_wait)); - if (ret) - CXIP_WARN("Wait FD delete error: %d\n", ret); - - ret = cxil_destroy_wait_obj(cq->priv_wait); - if (ret) - CXIP_WARN("Release CXI wait object failed: %d\n", ret); - } + if (cq->ep_fd >= 0) + close(cq->ep_fd); ofi_cq_cleanup(&cq->util_cq); ofi_genlock_destroy(&cq->ep_list_lock); @@ -249,14 +268,116 @@ static int cxip_cq_close(struct fid *fid) return 0; } +static int cxip_cq_signal(struct fid_cq *cq_fid) +{ + return -FI_ENOSYS; +} + +static int cxip_cq_control(fid_t fid, int command, void *arg) +{ + struct cxip_cq *cq = container_of(fid, struct cxip_cq, util_cq.cq_fid); + struct fi_wait_pollfd *pollfd; + int ret; + + switch (command) { + case FI_GETWAIT: + if (cq->ep_fd < 0) { + ret = -FI_ENODATA; + break; + } + if (cq->attr.wait_obj == FI_WAIT_FD) { + *(int *) arg = cq->ep_fd; + return FI_SUCCESS; + } + + pollfd = arg; + if (pollfd->nfds >= 1) { + pollfd->fd[0].fd = cq->ep_fd; + pollfd->fd[0].events = POLLIN; + pollfd->nfds = 1; + + ret = FI_SUCCESS; + } else { + ret = -FI_ETOOSMALL; + } + break; + case FI_GETWAITOBJ: + *(enum fi_wait_obj *) arg = cq->attr.wait_obj; + ret = FI_SUCCESS; + break; + default: + ret = -FI_ENOSYS; + break; + } + + return ret; +} + +static ssize_t cxip_cq_sreadfrom(struct fid_cq *cq_fid, void *buf, + size_t count, fi_addr_t *src_addr, + const void *cond, int timeout) +{ + struct cxip_cq *cq = container_of(cq_fid, struct cxip_cq, + util_cq.cq_fid); + struct epoll_event ev; + uint64_t endtime; + ssize_t ret; + + if (!cq->attr.wait_obj) + return -FI_EINVAL; + + endtime = ofi_timeout_time(timeout); + + do { + ret = fi_cq_readfrom(cq_fid, buf, count, src_addr); + if (ret != -FI_EAGAIN) + break; + + if (ofi_adjust_timeout(endtime, &timeout)) + return -FI_EAGAIN; + + ret = cxip_cq_trywait(cq); + if (ret == -FI_EAGAIN) { + ret = 0; + continue; + } + assert(ret == FI_SUCCESS); + + memset(&ev, 0, sizeof(ev)); + ret = epoll_wait(cq->ep_fd, &ev, 1, timeout); + if (ret > 0) + ret = 0; + + } while (!ret); + + return ret == -FI_ETIMEDOUT ? 
-FI_EAGAIN : ret; +} + +static ssize_t cxip_cq_sread(struct fid_cq *cq_fid, void *buf, size_t count, + const void *cond, int timeout) +{ + return cxip_cq_sreadfrom(cq_fid, buf, count, NULL, cond, timeout); +} + static struct fi_ops cxip_cq_fi_ops = { .size = sizeof(struct fi_ops), .close = cxip_cq_close, .bind = fi_no_bind, - .control = ofi_cq_control, + .control = cxip_cq_control, .ops_open = fi_no_ops_open, }; +static struct fi_ops_cq cxip_cq_ops = { + .size = sizeof(struct fi_ops_cq), + .read = ofi_cq_read, + .readfrom = ofi_cq_readfrom, + .readerr = ofi_cq_readerr, + .sread = cxip_cq_sread, + .sreadfrom = cxip_cq_sreadfrom, + .signal = cxip_cq_signal, + .strerror = cxip_cq_strerror, +}; + static struct fi_cq_attr cxip_cq_def_attr = { .flags = 0, .format = FI_CQ_FORMAT_CONTEXT, @@ -316,50 +437,35 @@ static int cxip_cq_verify_attr(struct fi_cq_attr *attr) return FI_SUCCESS; } -/* - * cxip_cq_alloc_priv_wait - Allocate an internal wait channel for the CQ. - */ -static int cxip_cq_alloc_priv_wait(struct cxip_cq *cq) +/* EP adds wait FD to the CQ epoll FD */ +int cxip_cq_add_wait_fd(struct cxip_cq *cq, int wait_fd, int events) { + struct epoll_event ev = { + .events = events, + }; int ret; - int wait_fd; - - assert(cq->domain); - - /* Not required or already created */ - if (!cq->util_cq.wait || cq->priv_wait) - return FI_SUCCESS; - - ret = cxil_alloc_wait_obj(cq->domain->lni->lni, &cq->priv_wait); - if (ret) { - CXIP_WARN("Allocation of internal wait object failed %d\n", - ret); - return ret; - } - wait_fd = cxil_get_wait_obj_fd(cq->priv_wait); - ret = fi_fd_nonblock(wait_fd); - if (ret) { - CXIP_WARN("Unable to set CQ wait non-blocking mode: %d\n", ret); - goto destroy_wait; - } + ret = epoll_ctl(cq->ep_fd, EPOLL_CTL_ADD, wait_fd, &ev); + if (ret < 0) { + ret = errno; + CXIP_WARN("EP wait FD add to CQ failed %d\n", ret); - ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, POLLIN, - cxip_cq_trywait, cq, &cq->util_cq.cq_fid.fid); - if (ret) { - CXIP_WARN("Add FD of internal wait object failed: %d\n", ret); - goto destroy_wait; + return -FI_EINVAL; } - CXIP_DBG("Add CQ private wait object, CQ intr FD: %d\n", wait_fd); - return FI_SUCCESS; +} -destroy_wait: - cxil_destroy_wait_obj(cq->priv_wait); - cq->priv_wait = NULL; +/* EP deletes wait FD from the CQ epoll FD */ +void cxip_cq_del_wait_fd(struct cxip_cq *cq, int wait_fd) +{ + int ret; - return ret; + ret = epoll_ctl(cq->ep_fd, EPOLL_CTL_DEL, wait_fd, NULL); + if (ret < 0) { + ret = errno; + CXIP_WARN("EP wait FD delete from CQ failed %d\n", ret); + } } /* @@ -370,6 +476,7 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, { struct cxip_domain *cxi_dom; struct cxip_cq *cxi_cq; + struct fi_cq_attr temp_attr; int ret; if (!domain || !cq) @@ -393,18 +500,21 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, cxi_cq->attr = *attr; } - ret = ofi_cq_init(&cxip_prov, domain, &cxi_cq->attr, &cxi_cq->util_cq, + /* CXI does not use common code internal wait object */ + temp_attr = cxi_cq->attr; + temp_attr.wait_obj = FI_WAIT_NONE; + ret = ofi_cq_init(&cxip_prov, domain, &temp_attr, &cxi_cq->util_cq, cxip_util_cq_progress, context); if (ret != FI_SUCCESS) { CXIP_WARN("ofi_cq_init() failed: %d\n", ret); goto err_util_cq; } - cxi_cq->util_cq.cq_fid.ops->strerror = &cxip_cq_strerror; cxi_cq->util_cq.cq_fid.fid.ops = &cxip_cq_fi_ops; - + cxi_cq->util_cq.cq_fid.ops = &cxip_cq_ops; cxi_cq->domain = cxi_dom; cxi_cq->ack_batch_size = cxip_env.eq_ack_batch_size; + cxi_cq->ep_fd = -1; /* Optimize locking when possible */ 
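/*
 * Editor's sketch (application side, not provider code): with the CQ opened
 * using wait_obj = FI_WAIT_FD, the epoll FD exposed through FI_GETWAIT above
 * can back a blocking read loop:
 */
#include <poll.h>
#include <rdma/fabric.h>
#include <rdma/fi_eq.h>

static ssize_t wait_then_read(struct fid_cq *cq, struct fi_cq_entry *entry)
{
	struct pollfd pfd;
	ssize_t ret;
	int fd;

	ret = fi_control(&cq->fid, FI_GETWAIT, &fd);
	if (ret)
		return ret;

	do {
		ret = fi_cq_read(cq, entry, 1);
		if (ret == -FI_EAGAIN) {
			pfd.fd = fd;
			pfd.events = POLLIN;
			poll(&pfd, 1, -1);	/* block until the CQ FD signals */
		}
	} while (ret == -FI_EAGAIN);

	return ret;
}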
if (cxi_dom->util_domain.threading == FI_THREAD_DOMAIN || @@ -413,11 +523,11 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, else ofi_genlock_init(&cxi_cq->ep_list_lock, OFI_LOCK_SPINLOCK); - if (cxi_cq->util_cq.wait) { - ret = cxip_cq_alloc_priv_wait(cxi_cq); - if (ret != FI_SUCCESS) { - CXIP_WARN("Unable to allocate CXI wait obj: %d\n", - ret); + if (cxi_cq->attr.wait_obj) { + cxi_cq->ep_fd = epoll_create1(0); + if (cxi_cq->ep_fd < 0) { + CXIP_WARN("Unable to open epoll FD: %s\n", + strerror(errno)); goto err_wait_alloc; } } diff --git a/prov/cxi/src/cxip_ctrl.c b/prov/cxi/src/cxip_ctrl.c index b60858742b7..03b117b7ef4 100644 --- a/prov/cxi/src/cxip_ctrl.c +++ b/prov/cxi/src/cxip_ctrl.c @@ -406,36 +406,6 @@ void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tgt_evtq, false, true); } -/* - * cxip_ep_ctrl_trywait() - Return 0 if no events need to be progressed. - */ -int cxip_ep_ctrl_trywait(void *arg) -{ - struct cxip_ep_obj *ep_obj = (struct cxip_ep_obj *)arg; - - if (!ep_obj->ctrl.wait) { - CXIP_WARN("No CXI ep_obj wait object\n"); - return -FI_EINVAL; - } - - if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq) || - cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) - return -FI_EAGAIN; - - ofi_genlock_lock(&ep_obj->lock); - cxil_clear_wait_obj(ep_obj->ctrl.wait); - - if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq) || - cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) { - ofi_genlock_unlock(&ep_obj->lock); - - return -FI_EAGAIN; - } - ofi_genlock_unlock(&ep_obj->lock); - - return FI_SUCCESS; -} - static void cxip_eq_ctrl_eq_free(void *eq_buf, struct cxi_md *eq_md, struct cxi_eq *eq) { @@ -484,7 +454,7 @@ static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, /* ep_obj->ctrl.wait will be NULL if not required */ ret = cxil_alloc_evtq(ep_obj->domain->lni->lni, *eq_md, &eq_attr, - ep_obj->ctrl.wait, NULL, eq); + ep_obj->priv_wait, NULL, eq); if (ret) goto err_free_eq_md; @@ -500,107 +470,6 @@ static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, return ret; } -/* - * cxip_ep_wait_required() - return true if base EP wait object is required. 
- */ -static bool cxip_ctrl_wait_required(struct cxip_ep_obj *ep_obj) -{ - if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->priv_wait) - return true; - - if (ep_obj->txc->send_cq && ep_obj->txc->send_cq->priv_wait) - return true; - - return false; -} - -/* - * cxip_ep_ctrl_del_wait() - Delete control FD object - */ -void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj) -{ - int wait_fd; - - wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl.wait); - - if (ep_obj->txc->send_cq) { - ofi_wait_del_fd(ep_obj->txc->send_cq->util_cq.wait, wait_fd); - CXIP_DBG("Deleted control HW EQ FD: %d from CQ: %p\n", - wait_fd, ep_obj->txc->send_cq); - } - - if (ep_obj->rxc->recv_cq && - ep_obj->rxc->recv_cq != ep_obj->txc->send_cq) { - ofi_wait_del_fd(ep_obj->rxc->recv_cq->util_cq.wait, wait_fd); - CXIP_DBG("Deleted control HW EQ FD: %d from CQ %p\n", - wait_fd, ep_obj->rxc->recv_cq); - } -} - -/* - * cxip_ep_ctrl_add_wait() - Add control FD to CQ object - */ -int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) -{ - struct cxip_cq *cq; - int wait_fd; - int ret; - - ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, - &ep_obj->ctrl.wait); - if (ret) { - CXIP_WARN("Control wait object allocation failed: %d\n", ret); - return -FI_ENOMEM; - } - - wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl.wait); - ret = fi_fd_nonblock(wait_fd); - if (ret) { - CXIP_WARN("Unable to set control wait non-blocking: %d, %s\n", - ret, fi_strerror(-ret)); - goto err; - } - - cq = ep_obj->txc->send_cq; - if (cq) { - ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, - POLLIN, cxip_ep_ctrl_trywait, ep_obj, - &cq->util_cq.cq_fid.fid); - if (ret) { - CXIP_WARN("TX CQ add FD failed: %d, %s\n", - ret, fi_strerror(-ret)); - goto err; - } - } - - if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq != cq) { - cq = ep_obj->rxc->recv_cq; - - ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, - POLLIN, cxip_ep_ctrl_trywait, ep_obj, - &cq->util_cq.cq_fid.fid); - if (ret) { - CXIP_WARN("RX CQ add FD failed: %d, %s\n", - ret, fi_strerror(-ret)); - goto err_add_fd; - } - } - - CXIP_DBG("Added control EQ private wait object, intr FD: %d\n", - wait_fd); - - return FI_SUCCESS; - -err_add_fd: - if (ep_obj->txc->send_cq) - ofi_wait_del_fd(ep_obj->txc->send_cq->util_cq.wait, wait_fd); -err: - cxil_destroy_wait_obj(ep_obj->ctrl.wait); - ep_obj->ctrl.wait = NULL; - - return ret; -} - /* * cxip_ep_ctrl_init() - Initialize endpoint control resources. * @@ -624,21 +493,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) if (ep_obj->domain->mr_match_events) pt_opts.en_event_match = 1; - /* If CQ(s) are using a wait object, then control event - * queues need to unblock CQ poll as well. CQ will add the - * associated FD to the CQ FD list. - */ - if (cxip_ctrl_wait_required(ep_obj)) { - ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, - &ep_obj->ctrl.wait); - if (ret) { - CXIP_WARN("EP ctrl wait object alloc failed: %d\n", - ret); - return ret; - } - } - - ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * s_page_size, + ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * sc_page_size, &ep_obj->ctrl.tx_evtq_buf, &ep_obj->ctrl.tx_evtq_buf_md, &ep_obj->ctrl.tx_evtq); @@ -694,7 +549,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) } ret = cxip_pte_set_state(ep_obj->ctrl.pte, ep_obj->ctrl.tgq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret) { /* This is a bug, we have exclusive access to this CMDQ. 
*/ CXIP_WARN("Failed to enqueue command: %d\n", ret); diff --git a/prov/cxi/src/cxip_curl.c b/prov/cxi/src/cxip_curl.c index 97fcfcfac10..f3954327651 100644 --- a/prov/cxi/src/cxip_curl.c +++ b/prov/cxi/src/cxip_curl.c @@ -1,24 +1,31 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP */ #include +#include #include #include #include #include #include #include +#include +#include #include #include "cxip.h" +static void *cxip_curlhandle; +static CURLM *cxip_curlm; +static int cxip_curl_count; #define TRACE_CURL(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_CURL, fmt, \ ##__VA_ARGS__) #define CXIP_DBG(...) _CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_FABRIC, __VA_ARGS__) #define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) #define CHUNK_SIZE 4096 @@ -117,28 +124,164 @@ static size_t write_callback(void *curl_rcvd, size_t size, size_t nmemb, * The CURL library must be explicitly initialized. It is application-global, * and the initialization is not thread-safe, according to the documentation. We * do not protect this call, because it is running under CXI_INIT (see - * cxip_info.c), which is single-threaded. The curl_global_init() call can be + * cxip_info.c), which is single-threaded. The (*dl_curl_global_init)() call can be * issued multiple times (non-concurrently) and has the same end result as * calling it once. */ -static CURLM *cxip_curlm; -static int cxip_curl_count; /** * Initialize CURL globally for the application, enabling multi-curl * (concurrent calls). */ + +/* Each of these should be referenced in curlary[] below */ +CURLcode (*dl_curl_global_init)(long); +void (*dl_curl_global_cleanup)(void); +CURL * (*dl_curl_easy_init)(void); +void (*dl_curl_easy_cleanup)(CURL *); +CURLcode (*dl_curl_easy_getinfo)(CURL *, CURLINFO, ...); +CURLcode (*dl_curl_easy_setopt)(CURL *, CURLoption, ...); +const char *(*dl_curl_easy_strerror)(CURLcode); +CURLcode (*dl_curl_easy_perform)(CURL *); +CURLM * (*dl_curl_multi_init)(void); +CURLMcode (*dl_curl_multi_cleanup)(CURLM *); +CURLMcode (*dl_curl_multi_add_handle)(CURLM *multi_handle, CURL *); +CURLMsg * (*dl_curl_multi_info_read)(CURLM *multi_handle, int *); +CURLMcode (*dl_curl_multi_perform)(CURLM *multi_handle, int *); +const char *(*dl_curl_multi_strerror)(CURLMcode); +struct curl_slist *(*dl_curl_slist_append)(struct curl_slist *, const char *); +void (*dl_curl_slist_free_all)(struct curl_slist *); + +struct curlfunc { + void **fptr; + char *name; +}; + +struct curlfunc curlary[] = { + {(void **)&dl_curl_global_init, "curl_global_init"}, + {(void **)&dl_curl_global_cleanup, "curl_global_cleanup"}, + {(void **)&dl_curl_easy_init, "curl_easy_init"}, + {(void **)&dl_curl_easy_cleanup, "curl_easy_cleanup"}, + {(void **)&dl_curl_easy_getinfo, "curl_easy_getinfo"}, + {(void **)&dl_curl_easy_setopt, "curl_easy_setopt"}, + {(void **)&dl_curl_easy_strerror, "curl_easy_strerror"}, + {(void **)&dl_curl_easy_perform, "curl_easy_perform"}, + {(void **)&dl_curl_multi_init, "curl_multi_init"}, + {(void **)&dl_curl_multi_cleanup, "curl_multi_cleanup"}, + {(void **)&dl_curl_multi_add_handle, "curl_multi_add_handle"}, + {(void **)&dl_curl_multi_info_read, "curl_multi_info_read"}, + {(void **)&dl_curl_multi_perform, "curl_multi_perform"}, + {(void **)&dl_curl_multi_strerror, "curl_multi_strerror"}, + {(void **)&dl_curl_slist_append, "curl_slist_append"}, + {(void 
**)&dl_curl_slist_free_all, "curl_slist_free_all"},
+	{NULL, NULL}
+};
+
+int cxip_curl_load_symbols(void)
+{
+	struct curlfunc *funcptr;
+	char *libfile = NULL, *libpath;
+	int version;
+	int errcnt;
+	void *h = NULL;
+
+	/* load successfully only once */
+	if (cxip_curlhandle)
+		return 0;
+
+	char *curl_libpath = NULL;
+#ifdef FI_CXI_CURL_LIB_PATH
+	curl_libpath = strdup(FI_CXI_CURL_LIB_PATH "/%s/libcurl.so.%d");
+	TRACE_CURL("FI_CXI_CURL_LIB_PATH set to '%s'\n", curl_libpath);
+#else
+	curl_libpath = strdup("/usr/%s/libcurl.so.%d");
+#endif
+
+	/* Try to find latest usable version */
+	// TODO test earlier versions
+	for (version = 4; version >= 4; version--) {
+		const char *lib_dirs[] = {"lib", "lib64"};
+		for (int i = 0; i < 2; i++) {
+			int len = snprintf(NULL, 0, curl_libpath,
+					   lib_dirs[i], version) + 1;
+			libfile = malloc(len);
+			if (!libfile) {
+				free(curl_libpath);
+				return -FI_ENOMEM;
+			}
+			snprintf(libfile, len, curl_libpath, lib_dirs[i],
+				 version);
+			TRACE_CURL("Checking libcurl at '%s'\n", libfile);
+			libpath = realpath(libfile, NULL);
+			if (!libpath) {
+				TRACE_CURL("could not expand '%s'\n", libfile);
+				CXIP_INFO("could not expand '%s'\n", libfile);
+				free(libfile);
+				continue;
+			}
+			TRACE_CURL("dlopen '%s'\n", libpath);
+			h = dlopen(libpath, RTLD_NOW);
+			if (!h) {
+				TRACE_CURL("%s not found\n", libpath);
+				CXIP_INFO("%s not found\n", libpath);
+				free(libpath);
+				free(libfile);
+				continue;
+			}
+			TRACE_CURL("%s found\n", libpath);
+			free(libpath);
+			free(libfile);
+			break;
+		}
+		if (h) {
+			break;
+		}
+	}
+	free(curl_libpath);
+	if (!h) {
+		TRACE_CURL("libcurl not supported\n");
+		CXIP_WARN("libcurl not supported\n");
+		CXIP_WARN("Accelerated collectives cannot be enabled\n");
+		return -FI_EOPNOTSUPP;
+	}
+	/* Load all the necessary functions, or none */
+	errcnt = 0;
+	funcptr = curlary;
+	while (funcptr->fptr) {
+		*funcptr->fptr = dlsym(h, funcptr->name);
+		if (!(*funcptr->fptr)) {
+			CXIP_WARN("curl function '%s' not found\n",
+				  funcptr->name);
+			errcnt++;
+		}
+		funcptr++;
+	}
+	if (errcnt) {
+		/* unload every symbol that did resolve */
+		funcptr = curlary;
+		while (funcptr->fptr) {
+			*funcptr->fptr = NULL;
+			funcptr++;
+		}
+		CXIP_WARN("libcurl incomplete support\n");
+		return -FI_EOPNOTSUPP;
+	}
+	/* record handle to prevent reloading */
+	cxip_curlhandle = h;
+	return 0;
+}
+
 int cxip_curl_init(void)
 {
-	int ret = FI_SUCCESS;
 	CURLcode res;
+	int ret;
+
+	/* can be safely called multiple times */
+	ret = cxip_curl_load_symbols();
+	if (ret)
+		return ret;
 
 	if (!cxip_curlm) {
-		res = curl_global_init(CURL_GLOBAL_DEFAULT);
+		res = (*dl_curl_global_init)(CURL_GLOBAL_DEFAULT);
 		if (res == CURLE_OK) {
-			cxip_curlm = curl_multi_init();
+			cxip_curlm = (*dl_curl_multi_init)();
 			if (!cxip_curlm) {
-				curl_global_cleanup();
+				(*dl_curl_global_cleanup)();
 				ret = -FI_EINVAL;
 			}
 		} else
@@ -154,8 +297,8 @@ void cxip_curl_fini(void)
 {
 	cxip_curl_count = 0;
 	if (cxip_curlm) {
-		curl_multi_cleanup(cxip_curlm);
-		curl_global_cleanup();
+		(*dl_curl_multi_cleanup)(cxip_curlm);
+		(*dl_curl_global_cleanup)();
 		cxip_curlm = NULL;
 	}
 }
@@ -207,7 +350,11 @@ void cxip_curl_free(struct cxip_curl_handle *handle)
  * The usrfunc is called in cxip_curl_progress() when the request completes,
  * and receives the handle as its sole argument. The handle also contains an
  * arbitrary usrptr supplied by the caller. This usrptr can contain specific
- * information to identify which of multiple concurrent requests has completed.
+ * user information to identify which of multiple concurrent requests has
+ * completed.
+ *
+ * An error return indicates that the dispatch was unsuccessful.
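+ * No completion callback is delivered in that case.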
All memory + * cleanup is done here. * * There are no "normal" REST errors from this call. REST errors are instead * returned on attempts to progress the dispatched operation. @@ -220,7 +367,9 @@ void cxip_curl_free(struct cxip_curl_handle *handle) * @param userfunc : user-defined completion function * @param usrptr : user-defined data pointer * - * @return int : 0 on success, -1 on failure + * @return int : 0 on success, -errno on failure + * -FI_ENOMEM : out-of-memory + * -FI_ECONNREFUSED : CURL easy/multi init failed */ int cxip_curl_perform(const char *endpoint, const char *request, const char *sessionToken, size_t rsp_init_size, @@ -230,125 +379,177 @@ int cxip_curl_perform(const char *endpoint, const char *request, struct cxip_curl_handle *handle; struct curl_slist *headers; char *token; - char *verify_peer_str; - int verify_peer; + char *cert_env_var; + bool verify = true; + bool isdir = false; + bool isfile = false; + struct stat buf; CURLMcode mres; CURL *curl; int running; int ret; - ret = -FI_ENOMEM; handle = calloc(1, sizeof(*handle)); - if (!handle) + if (!handle) { + ret = -FI_ENOMEM; goto fail; + } /* libcurl is fussy about NULL requests */ handle->endpoint = strdup(endpoint); - if (!handle->endpoint) + if (!handle->endpoint) { + ret = -FI_ENOMEM; goto fail; + } handle->request = strdup(request ? request : ""); - if (!handle->request) + if (!handle->request) { + ret = -FI_ENOMEM; goto fail; + } handle->response = NULL; handle->recv = (void *)init_curl_buffer(rsp_init_size); - if (!handle->recv) + if (!handle->recv) { + ret = -FI_ENOMEM; goto fail; + } + /* add user completion function and pointer */ handle->usrfunc = usrfunc; handle->usrptr = usrptr; - ret = -FI_EACCES; - curl = curl_easy_init(); + curl = (*dl_curl_easy_init)(); if (!curl) { - CXIP_WARN("curl_easy_init() failed\n"); + CXIP_WARN("(*dl_curl_easy_init)() failed\n"); + ret = -FI_ECONNREFUSED; goto fail; } /* HTTP 1.1 assumed */ headers = NULL; - headers = curl_slist_append(headers, "Expect:"); - headers = curl_slist_append(headers, "Accept: application/json"); - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, "charset: utf-8"); + headers = (*dl_curl_slist_append)(headers, "Expect:"); + headers = (*dl_curl_slist_append)(headers, "Accept: application/json"); + headers = (*dl_curl_slist_append)(headers, "Content-Type: application/json"); + headers = (*dl_curl_slist_append)(headers, "charset: utf-8"); token = NULL; if (sessionToken) { ret = asprintf(&token, "Authorization: Bearer %s", sessionToken); if (ret < 0) { CXIP_WARN("token string create failed\n"); + ret = -FI_ENOMEM; goto fail; } - headers = curl_slist_append(headers, token); + headers = (*dl_curl_slist_append)(headers, token); } handle->headers = (void *)headers; - curl_easy_setopt(curl, CURLOPT_URL, handle->endpoint); + (*dl_curl_easy_setopt)(curl, CURLOPT_URL, handle->endpoint); if (op == CURL_GET) { - curl_easy_setopt(curl, CURLOPT_HTTPGET, 1L); + (*dl_curl_easy_setopt)(curl, CURLOPT_HTTPGET, 1L); + } else if (op == CURL_DELETE) { + (*dl_curl_easy_setopt)(curl, CURLOPT_CUSTOMREQUEST, "DELETE"); } else { - curl_easy_setopt(curl, CURLOPT_POST, 1L); - curl_easy_setopt(curl, CURLOPT_POSTFIELDS, handle->request); - curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, + (*dl_curl_easy_setopt)(curl, CURLOPT_POST, 1L); + (*dl_curl_easy_setopt)(curl, CURLOPT_POSTFIELDS, handle->request); + (*dl_curl_easy_setopt)(curl, CURLOPT_POSTFIELDSIZE, strlen(handle->request)); } - curl_easy_setopt(curl, 
CURLOPT_STDERR, stderr);
-	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
-	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
-	curl_easy_setopt(curl, CURLOPT_WRITEDATA, handle->recv);
-	curl_easy_setopt(curl, CURLOPT_PRIVATE, (void *)handle);
-	curl_easy_setopt(curl, CURLOPT_VERBOSE, (long)verbose);
-	curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, cxip_curl_opname(op));
-
-	verify_peer_str = getenv("CURLOPT_SSL_VERIFYPEER");
-	if (verify_peer_str)
-		verify_peer = atoi(verify_peer_str);
-	else
-		verify_peer = 0;
-	curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, verify_peer);
-
-	curl_multi_add_handle(cxip_curlm, curl);
-	mres = curl_multi_perform(cxip_curlm, &running);
+	(*dl_curl_easy_setopt)(curl, CURLOPT_STDERR, stderr);
+	(*dl_curl_easy_setopt)(curl, CURLOPT_HTTPHEADER, headers);
+	(*dl_curl_easy_setopt)(curl, CURLOPT_WRITEFUNCTION, write_callback);
+	(*dl_curl_easy_setopt)(curl, CURLOPT_WRITEDATA, handle->recv);
+	(*dl_curl_easy_setopt)(curl, CURLOPT_PRIVATE, (void *)handle);
+	(*dl_curl_easy_setopt)(curl, CURLOPT_VERBOSE, (long)verbose);
+	(*dl_curl_easy_setopt)(curl, CURLOPT_CUSTOMREQUEST, cxip_curl_opname(op));
+
+	/* Value of the fm_cacert variable in the slurmctld configuration.
+	 * If set to 'yes' or a path, the CACERT is validated and used for
+	 * the connection.
+	 */
+	cert_env_var = getenv("FI_CXI_COLL_FABRIC_MGR_CACERT");
+
+	if (!cert_env_var || !strcmp(cert_env_var, "no"))
+		verify = false;
+	else if (!strcmp(cert_env_var, "yes"))
+		verify = true;
+	else {
+		if (stat(cert_env_var, &buf) == -1) {
+			ret = -FI_ENOENT;
+			goto fail;
+		}
+		if (S_ISDIR(buf.st_mode))
+			isdir = true;
+		else if (S_ISREG(buf.st_mode))
+			isfile = true;
+		else {
+			ret = -FI_EINVAL;
+			goto fail;
+		}
+	}
+
+	if (!verify) {
+		/* These are needed to work with self-signed certificates */
+		(*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYPEER, 0L);
+		(*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYHOST, 0L);
+	} else {
+		/* FI_CXI_COLL_FABRIC_MGR_CACERT is "yes" or a pathname */
+		(*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+		(*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYHOST, 2L);
+	}
+
+	/* If certificate file/dir specified, use it */
+	if (isdir)
+		(*dl_curl_easy_setopt)(curl, CURLOPT_CAPATH, cert_env_var);
+	else if (isfile)
+		(*dl_curl_easy_setopt)(curl, CURLOPT_CAINFO, cert_env_var);
+
+	(*dl_curl_multi_add_handle)(cxip_curlm, curl);
+	mres = (*dl_curl_multi_perform)(cxip_curlm, &running);
 	if (mres != CURLM_OK) {
-		CXIP_WARN("curl_multi_perform() failed: %s\n",
-			  curl_multi_strerror(mres));
+		CXIP_WARN("(*dl_curl_multi_perform)() failed: %s\n",
+			  (*dl_curl_multi_strerror)(mres));
+		ret = -FI_ECONNREFUSED;
 		goto fail;
 	}
 	cxip_curl_count += 1;
 
 	return FI_SUCCESS;
 
 fail:
-	CXIP_WARN("%s failed %d\n", __func__, ret);
+	CXIP_WARN("%s failed %d (%s)\n", __func__, ret, fi_strerror(-ret));
 	cxip_curl_free(handle);
 
 	return ret;
 }
 
 /**
- * Progress the CURL requests.
+ * Progress the pending CURL requests.
  *
  * This progresses concurrent CURL requests, and returns the following:
  *
- *  - 0 indicates an operation completed
- *  - -FI_EAGAIN indicates operations are pending, none completed
- *  - -FI_ENODATA indicates no operations are pending
- *  - -errorcode a fatal error
+ *  - 0 success
+ *  - -FI_EAGAIN indicates operations are pending, none completed
+ *  - -FI_ENODATA indicates no operations are pending
+ *  - -FI_ECONNREFUSED fatal error, CURL is not functioning properly
  *
- * Repeated calls will return additional completions, until there are no more
- * pending and -FI_ENODATA is returned.
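+ * Repeated calls drain any further completions; -FI_ENODATA means nothing
+ * is left pending.
+ *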
+ * Note that -FI_ECONNREFUSED should be treated as a fatal CURL error. It
+ * indicates that CURL is behaving in an abnormal fashion, and cannot be
+ * relied upon. In normal use, it should not happen.
  *
- * Note that a CURL request will succeed if the server is not reachable. It will
- * return a handle->status value of 0, which is an invalid HTTP status, and
- * indicates that it could not connect to a server.
+ * All other error handling is performed by the usrfunc function (supplied
+ * to the cxip_curl_perform() call), see below.
  *
- * For unit testing, it is useful for the test to be able to inspect the handle
- * directly, and it can be obtained by specifying a non-null handleptr value. If
- * handleptr is supplied, the caller is responsible for calling cxip_curl_free()
- * on the returned handle. In normal usage, handleptr is NULL, and this routine
- * will clean up the handle after the operation completes.
+ * A CURL request will complete even if the server is not reachable. It will
+ * return a handle->status value of 0, which is an invalid HTTP status, and
+ * indicates that it could not connect to a server.
  *
- * The user should provide a callback routine to examine the final state of the
- * CURL request, as well as any data it returns: see cxip_curl_perform(). This
- * user callback is called after completion of the request, before the handle is
- * destroyed.
+ * In normal use, handleptr is NULL: the caller has passed a usrfunc callback
+ * routine when dispatching the CURL request to process the returned errors and
+ * data: see cxip_curl_perform(). This usrfunc callback is called after
+ * completion of the request, before the handle is destroyed, and is expected to
+ * know enough about CURL operations to interpret the results. This routine will
+ * delete the handle after the callback has processed it.
+ *
+ * For unit testing, it can be useful for the test to be able to inspect the
+ * handle and the error return, and these can be obtained by specifying a
+ * non-null handleptr. If handleptr is supplied, the caller is responsible for
+ * calling cxip_curl_free() on the returned handle.
  *
  * The callback routine has read-only access to the handle, and read-write
  * access to its own data area, available as handle->usrptr.
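 *
 * A minimal dispatch-and-progress sketch (illustrative only: the variable
 * names are hypothetical and the op/verbose parameter order follows the
 * definition of cxip_curl_perform() above):
 *
 *	static void my_done(struct cxip_curl_handle *h)
 *	{
 *		if (!h->status)
 *			CXIP_WARN("could not reach the endpoint\n");
 *	}
 *
 *	ret = cxip_curl_perform(url, NULL, NULL, 0, CURL_GET, false,
 *				my_done, NULL);
 *	if (ret == FI_SUCCESS) {
 *		do {
 *			ret = cxip_curl_progress(NULL);
 *		} while (ret == -FI_EAGAIN);
 *	}
 *	// ret is now 0 (a request completed and my_done() ran),
 *	// -FI_ENODATA (nothing pending), or -FI_ECONNREFUSED (fatal);
 *	// a real caller would interleave other work instead of spinning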
@@ -356,7 +557,7 @@ int cxip_curl_perform(const char *endpoint, const char *request, * The handle contains the following documented fields: * * - status = HTTP status of the op, or 0 if the endpoint could not be reached - * - endpoint = copy of the endpoint address supplied for the post + * - endpoint = copy of the endpoint address (URL) supplied for the post * - request = copy of the JSON request data supplied for the post * - response = pointer to the JSON response returned by the endpoint * - usrptr = arbitrary user pointer supplied during CURL request @@ -379,55 +580,57 @@ int cxip_curl_progress(struct cxip_curl_handle **handleptr) if (!cxip_curl_count) return -FI_ENODATA; - handle = NULL; - /* running returns the number of curls running */ - mres = curl_multi_perform(cxip_curlm, &running); + mres = (*dl_curl_multi_perform)(cxip_curlm, &running); if (mres != CURLM_OK) { - CXIP_WARN("curl_multi_perform() failed: %s\n", - curl_multi_strerror(mres)); - return -FI_EOTHER; + CXIP_WARN("(*dl_curl_multi_perform)() failed: %s\n", + (*dl_curl_multi_strerror)(mres)); + return -FI_ECONNREFUSED; } /* messages returns the number of additional curls finished */ - msg = curl_multi_info_read(cxip_curlm, &messages); + msg = (*dl_curl_multi_info_read)(cxip_curlm, &messages); if (!msg || msg->msg != CURLMSG_DONE) { return (running) ? -FI_EAGAIN : -FI_ENODATA; } + /* These should not occur, but if (*dl_curl_easy_getinfo)() succeeds, we + * don't really care. Just post a warning. + */ if (msg->data.result >= CURL_LAST) { CXIP_WARN("CURL unknown result %d\n", msg->data.result); - } - else if (msg->data.result > CURLE_OK) { + } else if (msg->data.result > CURLE_OK) { CXIP_WARN("CURL error '%s'\n", - curl_easy_strerror(msg->data.result)); + (*dl_curl_easy_strerror)(msg->data.result)); } + /* retrieve our handle from the private pointer */ - res = curl_easy_getinfo(msg->easy_handle, + handle = NULL; + res = (*dl_curl_easy_getinfo)(msg->easy_handle, CURLINFO_PRIVATE, (char **)&handle); if (res != CURLE_OK) { - TRACE_CURL("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_PRIVATE", curl_easy_strerror(res)); - CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_PRIVATE", curl_easy_strerror(res)); - return -FI_EOTHER; + TRACE_CURL("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_PRIVATE", (*dl_curl_easy_strerror)(res)); + CXIP_WARN("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_PRIVATE", (*dl_curl_easy_strerror)(res)); + return -FI_ECONNREFUSED; } /* handle is now valid, must eventually be freed */ /* retrieve the status code, should not fail */ - res = curl_easy_getinfo(msg->easy_handle, + res = (*dl_curl_easy_getinfo)(msg->easy_handle, CURLINFO_RESPONSE_CODE, &status); if (res != CURLE_OK) { - TRACE_CURL("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_RESPONSE_CODE", curl_easy_strerror(res)); - CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_RESPONSE_CODE", curl_easy_strerror(res)); + TRACE_CURL("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_RESPONSE_CODE", (*dl_curl_easy_strerror)(res)); + CXIP_WARN("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_RESPONSE_CODE", (*dl_curl_easy_strerror)(res)); /* continue, handle->status should show zero */ } - TRACE_CURL("curl_easy_getinfo() success\n"); + TRACE_CURL("(*dl_curl_easy_getinfo)() success\n"); /* we can recover resources now */ - curl_slist_free_all((struct curl_slist *)handle->headers); - curl_easy_cleanup(msg->easy_handle); + (*dl_curl_slist_free_all)((struct curl_slist *)handle->headers); + 
(*dl_curl_easy_cleanup)(msg->easy_handle);
 	handle->headers = NULL;
 
 	/* make sure response string is terminated */
diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c
index 4a928018679..798a0c89fa5 100644
--- a/prov/cxi/src/cxip_dom.c
+++ b/prov/cxi/src/cxip_dom.c
@@ -395,7 +395,7 @@ int cxip_domain_prov_mr_id_alloc(struct cxip_domain *dom,
 	 */
 	key.events = mr->count_events || mr->rma_events || mr->cntr;
-	key.opt = cxip_env.optimized_mrs &&
+	key.opt = dom->optimized_mrs &&
 		  key.id < CXIP_PTL_IDX_PROV_MR_OPT_CNT;
 	mr->key = key.raw;
 	ofi_spin_unlock(&dom->ctrl_id_lock);
@@ -596,7 +596,7 @@ static int cxip_dom_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 			return -FI_EINVAL;
 
 		dom->eq = eq;
-		if (flags & OFI_REG_MR)
+		if (flags & FI_REG_MR)
 			dom->mr_eq = eq;
 
 		return 0;
@@ -1556,6 +1556,85 @@ static int cxip_query_atomic(struct fid_domain *domain,
 	return FI_SUCCESS;
 }
 
+struct fi_ops_srx_peer cxip_srx_peer_ops = {
+	.size = sizeof(struct fi_ops_srx_peer),
+	.start_msg = cxip_unexp_start,
+	.start_tag = cxip_unexp_start,
+	.discard_msg = cxip_no_discard,
+	.discard_tag = cxip_no_discard,
+};
+
+static int cxip_srx_close(struct fid *fid)
+{
+	struct cxip_domain *dom;
+
+	dom = container_of(fid, struct cxip_domain, rx_ep.fid);
+
+	ofi_atomic_dec32(&dom->util_domain.ref);
+
+	return FI_SUCCESS;
+}
+
+static struct fi_ops cxip_srx_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = cxip_srx_close,
+	.bind = fi_no_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_msg cxip_srx_msg_ops = {
+	.size = sizeof(struct fi_ops_msg),
+	.recv = fi_no_msg_recv,
+	.recvv = fi_no_msg_recvv,
+	.recvmsg = fi_no_msg_recvmsg,
+	.send = fi_no_msg_send,
+	.sendv = fi_no_msg_sendv,
+	.sendmsg = fi_no_msg_sendmsg,
+	.inject = fi_no_msg_inject,
+	.senddata = fi_no_msg_senddata,
+	.injectdata = fi_no_msg_injectdata,
+};
+
+static struct fi_ops_tagged cxip_srx_tagged_ops = {
+	.size = sizeof(struct fi_ops_tagged),
+	.recv = fi_no_tagged_recv,
+	.recvv = fi_no_tagged_recvv,
+	.recvmsg = fi_no_tagged_recvmsg,
+	.send = fi_no_tagged_send,
+	.sendv = fi_no_tagged_sendv,
+	.sendmsg = fi_no_tagged_sendmsg,
+	.inject = fi_no_tagged_inject,
+	.senddata = fi_no_tagged_senddata,
+	.injectdata = fi_no_tagged_injectdata,
+};
+
+static int cxip_srx_context(struct fid_domain *fid, struct fi_rx_attr *attr,
+			    struct fid_ep **rx_ep, void *context)
+{
+	struct cxip_domain *dom;
+
+	if (!context || !
attr || !fid) + return -FI_EINVAL; + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + if (attr->op_flags & FI_PEER) { + dom->owner_srx = ((struct fi_peer_srx_context *) context)->srx; + dom->owner_srx->peer_ops = &cxip_srx_peer_ops; + dom->rx_ep.msg = &cxip_srx_msg_ops; + dom->rx_ep.tagged = &cxip_srx_tagged_ops; + dom->rx_ep.fid.ops = &cxip_srx_fi_ops; + dom->rx_ep.fid.fclass = FI_CLASS_SRX_CTX; + *rx_ep = &dom->rx_ep; + ofi_atomic_inc32(&dom->util_domain.ref); + return FI_SUCCESS; + } + + return -FI_ENOSYS; +} + static int cxip_query_collective(struct fid_domain *domain, enum fi_collective_op coll, struct fi_collective_attr *attr, @@ -1695,7 +1774,7 @@ static struct fi_ops_domain cxip_dom_ops = { .cntr_open = cxip_cntr_open, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, - .srx_ctx = fi_no_srx_context, + .srx_ctx = cxip_srx_context, .query_atomic = cxip_query_atomic, .query_collective = cxip_query_collective }; diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index fabdea22be3..48333d02ae2 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -187,26 +187,6 @@ void cxip_ep_progress(struct fid *fid) } } -/* - * cxip_ep_peek() - Peek at EP event queues - * - * Return whether the associated EP event queues are empty. - */ -int cxip_ep_peek(struct fid *fid) -{ - struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); - struct cxip_ep_obj *ep_obj = ep->ep_obj; - - if (ep_obj->txc->tx_evtq.eq && - cxi_eq_peek_event(ep_obj->txc->tx_evtq.eq)) - return -FI_EAGAIN; - if (ep_obj->rxc->rx_evtq.eq && - cxi_eq_peek_event(ep_obj->rxc->rx_evtq.eq)) - return -FI_EAGAIN; - - return FI_SUCCESS; -} - /* * fi_ep_get_unexpected_msgs() - Get unexpected message information, exposed * via domain open ops. @@ -477,11 +457,146 @@ ssize_t cxip_ep_cancel(fid_t fid, void *context) if (!ofi_recv_allowed(ep->ep_obj->caps)) return -FI_ENOENT; + ofi_genlock_lock(&ep->ep_obj->lock); + ret = cxip_rxc_cancel(ep->ep_obj->rxc, context); if (ret != -FI_ENOENT) + goto out_unlock; + + ret = cxip_txc_cancel(ep->ep_obj->txc, context); + +out_unlock: + ofi_genlock_unlock(&ep->ep_obj->lock); + + return ret; +} + +/* + * cxip_ep_destroy_priv_wait - Free an internal wait channel for the EP. + */ +static void cxip_ep_destroy_priv_wait(struct cxip_ep_obj *ep_obj) +{ + assert(ep_obj->priv_wait); + + if (ep_obj->txc->send_cq && ep_obj->txc->send_cq->attr.wait_obj) + cxip_cq_del_wait_fd(ep_obj->txc->send_cq, ep_obj->wait_fd); + + if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->attr.wait_obj && + ep_obj->rxc->recv_cq != ep_obj->txc->send_cq) + cxip_cq_del_wait_fd(ep_obj->rxc->recv_cq, ep_obj->wait_fd); + + cxil_destroy_wait_obj(ep_obj->priv_wait); + + ep_obj->priv_wait = NULL; + ep_obj->wait_fd = -1; +} + +/* + * cxip_ep_alloc_priv_wait - Allocate an internal wait channel for the EP. 
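+ * The wait object's FD is registered with each bound CQ's epoll instance
+ * via cxip_cq_add_wait_fd() so that blocked CQ readers wake on this EP's
+ * interrupts.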
+ */ +static int cxip_ep_alloc_priv_wait(struct cxip_ep_obj *ep_obj) +{ + bool tx_cq_added = false; + int ret; + + assert(ep_obj->priv_wait == NULL); + + ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, &ep_obj->priv_wait); + if (ret) { + CXIP_WARN("Alloc of EP internal wait object failed %d\n", + ret); return ret; + } + + ep_obj->wait_fd = cxil_get_wait_obj_fd(ep_obj->priv_wait); + ret = fi_fd_nonblock(ep_obj->wait_fd); + if (ret) { + CXIP_WARN("Unable to set EP wait non-blocking mode: %d\n", ret); + goto destroy_wait; + } + + if (ep_obj->txc->send_cq && ep_obj->txc->send_cq->attr.wait_obj) { + ret = cxip_cq_add_wait_fd(ep_obj->txc->send_cq, ep_obj->wait_fd, + EPOLLPRI | POLLERR); + if (ret) + goto destroy_wait; + + tx_cq_added = true; + } + + if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->attr.wait_obj && + ep_obj->rxc->recv_cq != ep_obj->txc->send_cq) { + ret = cxip_cq_add_wait_fd(ep_obj->rxc->recv_cq, ep_obj->wait_fd, + EPOLLPRI | POLLERR); + if (ret) { + if (tx_cq_added) + cxip_cq_del_wait_fd(ep_obj->txc->send_cq, + ep_obj->wait_fd); + goto destroy_wait; + } + } + + CXIP_DBG("Add EP private wait object, EP intr FD: %d\n", + ep_obj->wait_fd); + + return FI_SUCCESS; + +destroy_wait: + cxil_destroy_wait_obj(ep_obj->priv_wait); + ep_obj->priv_wait = NULL; + ep_obj->wait_fd = -1; + + return ret; +} + +/* + * cxip_ep_trywait() - Determine if hardware events are waiting to be processed + * for EP based on CQ. + */ +int cxip_ep_trywait(struct cxip_ep_obj *ep_obj, struct cxip_cq *cq) +{ + assert(ep_obj->priv_wait); + + ofi_genlock_lock(&ep_obj->lock); + cxil_clear_wait_obj(ep_obj->priv_wait); + + /* Enable any currently disabled EQ interrupts, if events are + * ready shortcut and return. + */ + if ((ep_obj->txc->send_cq == cq || + ep_obj->rxc->recv_cq == cq) && ep_obj->txc->tx_evtq.eq) { + cxi_eq_int_enable(ep_obj->txc->tx_evtq.eq); + ep_obj->txc->tx_evtq.unacked_events = 0; + + if (cxi_eq_peek_event(ep_obj->txc->tx_evtq.eq)) + goto ready; + } + + if (ep_obj->rxc->recv_cq == cq && ep_obj->rxc->rx_evtq.eq) { + cxi_eq_int_enable(ep_obj->rxc->rx_evtq.eq); + ep_obj->rxc->rx_evtq.unacked_events = 0; + + if (cxi_eq_peek_event(ep_obj->rxc->rx_evtq.eq)) + goto ready; + } + + /* Side band control messages can also require progress */ + cxi_eq_int_enable(ep_obj->ctrl.tx_evtq); + if (cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) + goto ready; + + cxi_eq_int_enable(ep_obj->ctrl.tgt_evtq); + if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq)) + goto ready; + + ofi_genlock_unlock(&ep_obj->lock); - return cxip_txc_cancel(ep->ep_obj->txc, context); + return FI_SUCCESS; + +ready: + ofi_genlock_unlock(&ep_obj->lock); + + return -FI_EAGAIN; } /* @@ -497,10 +612,23 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) if (ep_obj->enabled) goto unlock; + /* Allocate an EP internal wait object if a CQ is bound with a + * wait object specified. 
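+	 * The EP wait object is what later bridges hardware EQ interrupts
+	 * to the CQ epoll FD.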
+ */ + if ((ep_obj->txc->send_cq && ep_obj->txc->send_cq->attr.wait_obj) || + (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->attr.wait_obj)) { + ret = cxip_ep_alloc_priv_wait(ep_obj); + if (ret) { + CXIP_WARN("EP internal wait alloc failed %s\n", + fi_strerror(-ret)); + goto unlock; + } + } + if (!ep_obj->av) { CXIP_WARN("Endpoint must be bound to an AV\n"); ret = -FI_ENOAV; - goto unlock; + goto free_wait; } assert(ep_obj->domain->enabled); @@ -510,7 +638,7 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) ret = cxip_av_auth_key_get_vnis(ep_obj->av, &ep_obj->vnis, &ep_obj->vni_count); if (ret) - goto unlock; + goto free_wait; ret = cxip_portals_table_alloc(ep_obj->domain->lni, ep_obj->vnis, ep_obj->vni_count, @@ -534,7 +662,7 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate portals table: %d\n", ret); - goto unlock; + goto free_wait; } } @@ -618,6 +746,10 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) ep_obj->vni_count); ep_obj->vnis = NULL; } +free_wait: + if (ep_obj->priv_wait) + cxip_ep_destroy_priv_wait(ep_obj); + unlock: ofi_genlock_unlock(&ep_obj->lock); @@ -681,6 +813,8 @@ int cxip_free_endpoint(struct cxip_ep *ep) cxip_txc_close(ep); cxip_rxc_close(ep); cxip_ep_disable(ep_obj); + if (ep_obj->priv_wait) + cxip_ep_destroy_priv_wait(ep_obj); ofi_genlock_unlock(&ep_obj->lock); ofi_atomic_dec32(&ep_obj->domain->ref); @@ -688,6 +822,7 @@ int cxip_free_endpoint(struct cxip_ep *ep) cxip_txc_free(ep_obj->txc); cxip_rxc_free(ep_obj->rxc); + free(ep_obj); ep->ep_obj = NULL; @@ -918,6 +1053,10 @@ int cxip_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) break; + case FI_CLASS_SRX_CTX: + ep->ep_obj->owner_srx = ep->ep_obj->domain->owner_srx; + break; + default: return -FI_EINVAL; } @@ -963,7 +1102,7 @@ static inline int cxip_ep_set_val(struct cxip_ep *cxi_ep, uint64_t *req_order; uint64_t *req_rnr_max_time; uint32_t *req_tclass; - uint32_t new_tclass; + uint32_t new_tclass = FI_TC_UNSPEC; if (!val->val) return -FI_EINVAL; @@ -1107,6 +1246,15 @@ int cxip_ep_getopt_priv(struct cxip_ep *ep, int level, int optname, *optlen = sizeof(size_t); break; + case FI_OPT_CUDA_API_PERMITTED: + if (!optval || !optlen) + return -FI_EINVAL; + if (*optlen < sizeof(bool)) + return -FI_ETOOSMALL; + + *(bool *)optval = + !ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA]; + break; default: return -FI_ENOPROTOOPT; } @@ -1129,6 +1277,7 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, const void *optval, size_t optlen) { size_t min_multi_recv; + bool cuda_api_permitted; if (level != FI_OPT_ENDPOINT) return -FI_ENOPROTOOPT; @@ -1147,6 +1296,30 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, } ep->ep_obj->rxc->min_multi_recv = min_multi_recv; break; + /* + * If GDRCopy is required by the application (ie. it has set + * FI_OPT_CUDA_API_PERMITTED), and is not available, return not + * supported. 
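+	 * (Disallowing CUDA API use forces the provider to move GPU data
+	 * through GDRCopy, hence the cuda_is_gdrcopy_enabled() check below.)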
+	 */
+	case FI_OPT_CUDA_API_PERMITTED:
+		if (optlen != sizeof(bool))
+			return -FI_EINVAL;
+
+		if (!hmem_ops[FI_HMEM_CUDA].initialized) {
+			CXIP_WARN("FI_OPT_CUDA_API_PERMITTED cannot be set when CUDA library or CUDA device is not available\n");
+			return -FI_EOPNOTSUPP;
+		}
+
+		cuda_api_permitted = *(bool *)optval;
+
+		if (!cuda_api_permitted && !cuda_is_gdrcopy_enabled())
+			return -FI_EOPNOTSUPP;
+
+		if (!cxip_env.force_dev_reg_copy) {
+			ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA] =
+				!cuda_api_permitted;
+		}
+		break;
 
 	default:
 		return -FI_ENOPROTOOPT;
@@ -1185,7 +1358,7 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
 {
 	int ret;
 	struct cxip_ep_obj *ep_obj;
-	uint32_t txc_tclass;
+	uint32_t txc_tclass = FI_TC_UNSPEC;
 	uint32_t nic;
 	uint32_t pid;
 	int i;
@@ -1232,6 +1405,7 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
 	ep_obj->tgq_size = hints->rx_attr->size;
 	ep_obj->tx_attr = *hints->tx_attr;
 	ep_obj->rx_attr = *hints->rx_attr;
+	ep_obj->wait_fd = -1;
 
 	ep_obj->asic_ver = cxip_dom->iface->info->cassini_version;
 
@@ -1249,6 +1423,12 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
 	ep_obj->src_addr.pid = pid;
 	ep_obj->fi_addr = FI_ADDR_NOTAVAIL;
 
+	/* Default to allowing non-dev reg copy APIs unless the caller
+	 * disables it.
+	 */
+	for (i = 0; i < OFI_HMEM_MAX; i++)
+		ep_obj->require_dev_reg_copy[i] = cxip_env.force_dev_reg_copy;
+
 	ofi_atomic_initialize32(&ep_obj->txq_ref, 0);
 	ofi_atomic_initialize32(&ep_obj->tgq_ref, 0);
 
@@ -1321,6 +1501,26 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints,
 	return ret;
 }
 
+int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len,
+		    uint64_t flags, struct cxip_md **md)
+{
+	struct cxip_domain *dom = ep->domain;
+	int ret;
+
+	ret = cxip_map(dom, buf, len, flags, md);
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	if (ep->require_dev_reg_copy[(*md)->info.iface] &&
+	    !((*md)->handle_valid)) {
+		CXIP_WARN("Required dev registration copy failed\n");
+		cxip_unmap(*md);
+		return -FI_EOPNOTSUPP;
+	}
+
+	return FI_SUCCESS;
+}
+
 /*
  * cxip_endpoint() - Provider fi_endpoint() implementation.
  */
diff --git a/prov/cxi/src/cxip_eq.c b/prov/cxi/src/cxip_eq.c
index 61aad506663..6c1dec45319 100644
--- a/prov/cxi/src/cxip_eq.c
+++ b/prov/cxi/src/cxip_eq.c
@@ -1,7 +1,7 @@
 /*
  * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
  *
- * Copyright (c) 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2020-2024 Cray Inc. All rights reserved.
 */
 
 /*
@@ -29,6 +29,8 @@
 
 #include "cxip.h"
 
+#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EQ, __VA_ARGS__)
+
 static int cxip_eq_close(struct fid *fid)
 {
 	struct cxip_eq *cxi_eq;
@@ -58,6 +60,18 @@ static void cxip_eq_progress(struct cxip_eq *eq)
 	ofi_mutex_unlock(&eq->list_lock);
 }
 
+/* cxip_eq_strerror() - Converts provider specific error information into a
+ * printable string. Not eq-specific.
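+ * Returns the message and, when buf is non-NULL, copies up to len bytes of
+ * it there (the copy is unterminated if truncated).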
+ */ +static const char *cxip_eq_strerror(struct fid_eq *eq, int prov_errno, + const void *err_data, char *buf, size_t len) +{ + const char *errmsg = cxip_strerror(prov_errno); + if (buf && len > 0) + strncpy(buf, errmsg, len); + return errmsg; +} + ssize_t cxip_eq_read(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, uint64_t flags) { @@ -78,7 +92,7 @@ static struct fi_ops_eq cxi_eq_ops = { .readerr = ofi_eq_readerr, .sread = ofi_eq_sread, .write = ofi_eq_write, - .strerror = ofi_eq_strerror, + .strerror = cxip_eq_strerror, // customized }; static struct fi_ops cxi_eq_fi_ops = { @@ -92,7 +106,7 @@ static struct fi_ops cxi_eq_fi_ops = { static struct fi_eq_attr cxip_eq_def_attr = { .size = CXIP_EQ_DEF_SZ, .flags = 0, - .wait_obj = FI_WAIT_FD, + .wait_obj = FI_WAIT_NONE, .signaling_vector = 0, .wait_set = NULL }; @@ -112,6 +126,14 @@ int cxip_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, else cxi_eq->attr = *attr; + if (cxi_eq->attr.wait_obj != FI_WAIT_NONE) { + CXIP_WARN("Unsupported EQ attribute wait obj %d\n", + cxi_eq->attr.wait_obj); + ret = -FI_ENOSYS; + + goto err0; + } + ret = ofi_eq_init(fabric, &cxi_eq->attr, &cxi_eq->util_eq.eq_fid, context); if (ret != FI_SUCCESS) diff --git a/prov/cxi/src/cxip_evtq.c b/prov/cxi/src/cxip_evtq.c index c40dd7e7c2f..42384ca85a8 100644 --- a/prov/cxi/src/cxip_evtq.c +++ b/prov/cxi/src/cxip_evtq.c @@ -155,7 +155,7 @@ void cxip_evtq_flush_trig_reqs(struct cxip_evtq *evtq) req->type); } - ofi_atomic_dec32(&txc->otx_reqs); + cxip_txc_otx_reqs_dec(txc); cxip_evtq_req_free_no_lock(req); } @@ -457,7 +457,8 @@ static size_t cxip_evtq_get_queue_size(struct cxip_cq *cq, size_t num_events) #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, - size_t num_events, size_t num_fc_events) + size_t num_events, size_t num_fc_events, + struct cxil_wait_obj *priv_wait) { struct cxi_eq_attr eq_attr = { .reserved_slots = num_fc_events, @@ -561,7 +562,7 @@ int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, /* cq->priv_wait is NULL if not backed by wait object */ ret = cxil_alloc_evtq(cq->domain->lni->lni, evtq->md, &eq_attr, - cq->priv_wait, NULL, &evtq->eq); + priv_wait, NULL, &evtq->eq); if (ret) { CXIP_WARN("Failed to allocated EQ: %d\n", ret); goto err_unmap_eq_buf; diff --git a/prov/cxi/src/cxip_fabric.c b/prov/cxi/src/cxip_fabric.c index c8528cf829c..b9eede784a4 100644 --- a/prov/cxi/src/cxip_fabric.c +++ b/prov/cxi/src/cxip_fabric.c @@ -24,13 +24,41 @@ int cxip_eq_def_sz = CXIP_EQ_DEF_SZ; static int read_default_params; +static int cxip_trywait(struct fid_fabric *fabric, struct fid **fids, + int count) +{ + struct cxip_cq *cq; + int ret; + int i; + + for (i = 0; i < count; i++) { + switch (fids[i]->fclass) { + case FI_CLASS_CQ: + cq = container_of(fids[i], struct cxip_cq, + util_cq.cq_fid.fid); + ret = cxip_cq_trywait(cq); + if (ret) + return ret; + break; + case FI_CLASS_EQ: + case FI_CLASS_CNTR: + case FI_CLASS_WAIT: + return -FI_ENOSYS; + default: + return -FI_EINVAL; + } + } + + return FI_SUCCESS; +} + static struct fi_ops_fabric cxip_fab_ops = { .size = sizeof(struct fi_ops_fabric), .domain = cxip_domain, .passive_ep = fi_no_passive_ep, .eq_open = cxip_eq_open, - .wait_open = ofi_wait_fd_open, - .trywait = ofi_trywait, + .wait_open = fi_no_wait_open, + .trywait = cxip_trywait, }; static int cxip_fabric_close(fid_t fid) diff --git a/prov/cxi/src/cxip_faults.c b/prov/cxi/src/cxip_faults.c index 04564b1bd04..8c273d7a203 100644 --- a/prov/cxi/src/cxip_faults.c +++ 
b/prov/cxi/src/cxip_faults.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + * Copyright (c) 2019-2024 Hewlett Packard Enterprise Development LP */ /* Fault injection. */ @@ -64,7 +64,91 @@ void cxip_fault_inject_fini(void) fault_fini(&malloc_fault); } +/****************************************************/ + +/* structure used to simulate failures */ +struct cxip_trap { + struct dlist_entry link; + int index; + int trap; + int err; + int prov_errno; +}; + +struct dlist_entry trap_list; +bool trap_initialized; + +void cxip_trap_close(void) +{ + struct cxip_trap *trap_obj; + + if (!trap_initialized) + return; + while (!dlist_empty(&trap_list)) { + dlist_pop_front(&trap_list, struct cxip_trap, trap_obj, link); + free(trap_obj); + } +} + +void cxip_trap_set(int index, int trap, int err, int prov_errno) +{ + struct cxip_trap *trap_obj; + + if (!trap_initialized) { + dlist_init(&trap_list); + trap_initialized = true; + } + trap_obj = calloc(1, sizeof(*trap_obj)); + if (!trap_obj) { + return; + } + dlist_init(&trap_obj->link); + trap_obj->index = index; + trap_obj->trap = trap; + trap_obj->err = err; + trap_obj->prov_errno = prov_errno; + dlist_insert_tail(&trap_list, &trap_obj->link); +} + +bool cxip_trap_search(int index, int trap, int *err, int *prov_errno) +{ + struct cxip_trap *trap_obj; + struct dlist_entry *item; + + if (!trap_initialized) { + return false; + } + + int cnt = 0; + dlist_foreach(&trap_list, item) { + cnt++; + trap_obj = container_of(item, struct cxip_trap, link); + if (trap_obj->index != index) + continue; + if (trap_obj->trap != trap) + continue; + dlist_remove(item); + if (err) + *err = trap_obj->err; + if (prov_errno) { + if (trap_obj->err == -FI_EAVAIL) + *prov_errno = trap_obj->prov_errno; + else + *prov_errno = 0; + } + free(trap_obj); + return true; + } + return false; +} #else void cxip_fault_inject_init(void) {} void cxip_fault_inject_fini(void) {} + +void cxip_trap_close(void) {} +void cxip_trap_set(int index, int trap, int err, int prov_errno) {} +bool cxip_trap_search(int index, int trap, int *err, int *prov_errno) +{ + return false; +} #endif diff --git a/prov/cxi/src/cxip_if.c b/prov/cxi/src/cxip_if.c index 1d14aecf470..62ebb4f86b7 100644 --- a/prov/cxi/src/cxip_if.c +++ b/prov/cxi/src/cxip_if.c @@ -247,7 +247,7 @@ int cxip_alloc_lni(struct cxip_if *iface, uint32_t svc_id, } lni->iface = iface; - ofi_spin_init(&lni->lock); + pthread_rwlock_init(&lni->cp_lock, NULL); dlist_init(&lni->remap_cps); CXIP_DBG("Allocated LNI, %s RGID: %u\n", diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 5c6e34ac1a1..76d1fa204e5 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2019,2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2019,2022-2024 Hewlett Packard Enterprise Development LP */ /* CXI fabric discovery implementation. 
*/ @@ -249,8 +249,8 @@ struct fi_ep_attr cxip_ep_attr = { .protocol = FI_PROTO_CXI, .protocol_version = CXIP_WIRE_PROTO_VERSION, .max_msg_size = CXIP_EP_MAX_MSG_SZ, - .max_order_raw_size = -1, - .max_order_war_size = -1, + .max_order_raw_size = 0, + .max_order_war_size = 0, .max_order_waw_size = -1, .mem_tag_format = FI_TAG_GENERIC >> (64 - CXIP_TAG_WIDTH), .auth_key_size = sizeof(struct cxi_auth_key), @@ -386,13 +386,13 @@ struct util_prov cxip_util_prov = { .flags = 0, }; -int s_page_size; +int sc_page_size; /* Get _SC_PAGESIZE */ static void set_system_page_size(void) { - if (!s_page_size) - s_page_size = sysconf(_SC_PAGESIZE); + if (!sc_page_size) + sc_page_size = sysconf(_SC_PAGESIZE); } /* @@ -510,6 +510,7 @@ static int cxip_info_init(void) fi->tx_attr->inject_size = 0; fi->rx_attr->msg_order = CXIP_MSG_ORDER & ~FI_ORDER_SAS; fi->rx_attr->caps |= FI_DIRECTED_RECV; + fi->rx_attr->total_buffered_recv = 0; CXIP_DBG("%s RNR info created\n", nic_if->info->device_name); @@ -607,8 +608,6 @@ struct cxip_environment cxip_env = { .force_odp = false, .ats = false, .iotlb = true, - .disable_dmabuf_cuda = false, - .disable_dmabuf_rocr = false, .ats_mlock_mode = CXIP_ATS_MLOCK_ALL, .fork_safe_requested = false, .rx_match_mode = CXIP_PTLTE_DEFAULT_MODE, @@ -649,7 +648,6 @@ struct cxip_environment cxip_env = { .disable_eq_hugetlb = false, .zbcoll_radix = 2, .cq_fill_percent = 50, - .enable_unrestricted_end_ro = true, .rget_tc = FI_TC_UNSPEC, .cacheline_size = CXIP_DEFAULT_CACHE_LINE_SIZE, .coll_job_id = NULL, @@ -660,12 +658,19 @@ struct cxip_environment cxip_env = { .coll_fabric_mgr_url = NULL, .coll_retry_usec = CXIP_COLL_MAX_RETRY_USEC, .coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC, + .coll_fm_timeout_msec = CXIP_COLL_DFL_FM_TIMEOUT_MSEC, .coll_use_dma_put = false, .telemetry_rgid = -1, .disable_hmem_dev_register = 0, .ze_hmem_supported = 0, .rdzv_proto = CXIP_RDZV_PROTO_DEFAULT, .enable_trig_op_limit = false, + .mr_cache_events_disable_poll_nsecs = + CXIP_MR_CACHE_EVENTS_DISABLE_POLL_NSECS, + .mr_cache_events_disable_le_poll_nsecs = + CXIP_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS, + .force_dev_reg_copy = false, + .mr_target_ordering = MR_ORDER_DEFAULT, }; static void cxip_env_init(void) @@ -738,11 +743,6 @@ static void cxip_env_init(void) fi_param_get_bool(&cxip_prov, "disable_host_register", &cxip_env.disable_host_register); - fi_param_define(&cxip_prov, "enable_unrestricted_end_ro", FI_PARAM_BOOL, - "Default: %d", cxip_env.enable_unrestricted_end_ro); - fi_param_get_bool(&cxip_prov, "enable_unrestricted_end_ro", - &cxip_env.enable_unrestricted_end_ro); - fi_param_define(&cxip_prov, "odp", FI_PARAM_BOOL, "Enables on-demand paging (default %d).", cxip_env.odp); fi_param_get_bool(&cxip_prov, "odp", &cxip_env.odp); @@ -764,17 +764,17 @@ static void cxip_env_init(void) "Enables the NIC IOTLB (default %d).", cxip_env.iotlb); fi_param_get_bool(&cxip_prov, "iotlb", &cxip_env.iotlb); - fi_param_define(&cxip_prov, "disable_dmabuf_cuda", FI_PARAM_BOOL, - "Disables the DMABUF interface for CUDA (default %d).", - cxip_env.disable_dmabuf_cuda); - fi_param_get_bool(&cxip_prov, "disable_dmabuf_cuda", - &cxip_env.disable_dmabuf_cuda); + /* Use ROCR DMABUF by default - honors the env if already set */ + ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 0); + if (ret) + CXIP_INFO("Could not enable FI_HMEM_ROCR_USE_DMABUF ret:%d %s\n", + ret, fi_strerror(errno)); - fi_param_define(&cxip_prov, "disable_dmabuf_rocr", FI_PARAM_BOOL, - "Disables the DMABUF interface for ROCR (default %d).", - cxip_env.disable_dmabuf_rocr); 
- fi_param_get_bool(&cxip_prov, "disable_dmabuf_rocr", - &cxip_env.disable_dmabuf_rocr); + /* Disable cuda DMABUF by default - honors the env if already set */ + ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "0", 0); + if (ret) + CXIP_INFO("Could not disable FI_HMEM_CUDA_USE_DMABUF ret:%d %s\n", + ret, fi_strerror(errno)); fi_param_define(&cxip_prov, "ats_mlock_mode", FI_PARAM_STRING, "Sets ATS mlock mode (off | all)."); @@ -828,27 +828,8 @@ static void cxip_env_init(void) fi_param_define(&cxip_prov, "rx_match_mode", FI_PARAM_STRING, "Sets RX message match mode (hardware | software | hybrid)."); - fi_param_get_str(&cxip_prov, "rx_match_mode", ¶m_str); - if (param_str) { - if (!strcasecmp(param_str, "hardware")) { - cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; - cxip_env.msg_offload = true; - } else if (!strcmp(param_str, "software")) { - cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; - cxip_env.msg_offload = false; - } else if (!strcmp(param_str, "hybrid")) { - cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; - cxip_env.msg_offload = true; - } else { - CXIP_WARN("Unrecognized rx_match_mode: %s\n", - param_str); - cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; - cxip_env.msg_offload = true; - } - - param_str = NULL; - } + cxip_set_env_rx_match_mode(); fi_param_define(&cxip_prov, "rdzv_threshold", FI_PARAM_SIZE_T, "Message size threshold for rendezvous protocol."); @@ -1036,54 +1017,6 @@ static void cxip_env_init(void) fi_param_get_size_t(&cxip_prov, "req_buf_max_cached", &cxip_env.req_buf_max_cached); - /* Parameters to tailor hybrid hardware to software transitions - * that are initiated by software. - */ - fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, - "Enable/Disable low LE preemptive UX transitions."); - fi_param_get_bool(&cxip_prov, "hybrid_preemptive", - &cxip_env.hybrid_preemptive); - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_preemptive) { - cxip_env.hybrid_preemptive = false; - CXIP_WARN("Not in hybrid mode, ignoring preemptive\n"); - } - - fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, - "Enable/Disable low LE preemptive recv transitions."); - fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", - &cxip_env.hybrid_recv_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_recv_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore LE recv preemptive\n"); - cxip_env.hybrid_recv_preemptive = 0; - } - - fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", - FI_PARAM_BOOL, - "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); - fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", - &cxip_env.hybrid_posted_recv_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_posted_recv_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore hybrid_posted_recv_preemptive\n"); - cxip_env.hybrid_posted_recv_preemptive = 0; - } - - fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", - FI_PARAM_BOOL, - "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); - fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", - &cxip_env.hybrid_unexpected_msg_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_unexpected_msg_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); - cxip_env.hybrid_unexpected_msg_preemptive 
= 0; - } - if (cxip_software_pte_allowed()) { min_free = CXIP_REQ_BUF_HEADER_MAX_SIZE + cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; @@ -1246,6 +1179,17 @@ static void cxip_env_init(void) if (cxip_env.coll_timeout_usec > CXIP_COLL_MAX_TIMEOUT_USEC) cxip_env.coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC; + fi_param_define(&cxip_prov, "coll_fm_timeout_msec", FI_PARAM_SIZE_T, + "FM API timeout (msec) (default %d, min %d, max %d).", + cxip_env.coll_fm_timeout_msec, CXIP_COLL_MIN_FM_TIMEOUT_MSEC, + CXIP_COLL_MAX_FM_TIMEOUT_MSEC); + fi_param_get_size_t(&cxip_prov, "coll_fm_timeout_msec", + &cxip_env.coll_fm_timeout_msec); + if (cxip_env.coll_fm_timeout_msec < CXIP_COLL_MIN_FM_TIMEOUT_MSEC) + cxip_env.coll_fm_timeout_msec = CXIP_COLL_MIN_FM_TIMEOUT_MSEC; + if (cxip_env.coll_fm_timeout_msec > CXIP_COLL_MAX_FM_TIMEOUT_MSEC) + cxip_env.coll_fm_timeout_msec = CXIP_COLL_MAX_FM_TIMEOUT_MSEC; + fi_param_define(&cxip_prov, "default_tx_size", FI_PARAM_SIZE_T, "Default provider tx_attr.size (default: %lu).", cxip_env.default_tx_size); @@ -1328,6 +1272,42 @@ static void cxip_env_init(void) param_str = NULL; } + fi_param_define(&cxip_prov, "mr_cache_events_disable_poll_nsecs", FI_PARAM_SIZE_T, + "Max amount of time to poll when disabling an MR configured with MR match events (default: %lu).", + cxip_env.mr_cache_events_disable_poll_nsecs); + fi_param_get_size_t(&cxip_prov, "mr_cache_events_disable_poll_nsecs", + &cxip_env.mr_cache_events_disable_poll_nsecs); + + fi_param_define(&cxip_prov, "mr_cache_events_disable_le_poll_nsecs", FI_PARAM_SIZE_T, + "Max amount of time to poll when LE invalidate disabling an MR configured with MR match events (default: %lu).", + cxip_env.mr_cache_events_disable_le_poll_nsecs); + fi_param_get_size_t(&cxip_prov, "mr_cache_events_disable_le_poll_nsecs", + &cxip_env.mr_cache_events_disable_le_poll_nsecs); + + fi_param_define(&cxip_prov, "force_dev_reg_copy", FI_PARAM_BOOL, + "Force device register copy operations. Default: %d", + cxip_env.force_dev_reg_copy); + fi_param_get_bool(&cxip_prov, "force_dev_reg_copy", + &cxip_env.force_dev_reg_copy); + + fi_param_define(&cxip_prov, "mr_target_ordering", FI_PARAM_STRING, + "MR target ordering (i.e. PCI ordering). Options: default, strict, or relaxed. Recommendation is to leave at default behavior."); + fi_param_get_str(&cxip_prov, "mr_target_ordering", ¶m_str); + + if (param_str) { + if (!strcmp(param_str, "default")) + cxip_env.mr_target_ordering = MR_ORDER_DEFAULT; + else if (!strcmp(param_str, "strict")) + cxip_env.mr_target_ordering = MR_ORDER_STRICT; + else if (!strcmp(param_str, "relaxed")) + cxip_env.mr_target_ordering = MR_ORDER_RELAXED; + else + CXIP_WARN("Unrecognized mr_target_ordering: %s\n", + param_str); + + param_str = NULL; + } + set_system_page_size(); } diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c index 14f4d955978..4723c311d97 100644 --- a/prov/cxi/src/cxip_iomm.c +++ b/prov/cxi/src/cxip_iomm.c @@ -28,12 +28,6 @@ static int cxip_dmabuf_hints(enum fi_hmem_iface iface, void *iov_base, return -FI_ENOSYS; } - if (iface == FI_HMEM_CUDA && cxip_env.disable_dmabuf_cuda) - return FI_SUCCESS; - - if (iface == FI_HMEM_ROCR && cxip_env.disable_dmabuf_rocr) - return FI_SUCCESS; - ret = ofi_hmem_get_base_addr(iface, iov_base, len, (void*)&base, &size); if (ret) return ret; @@ -45,6 +39,10 @@ static int cxip_dmabuf_hints(enum fi_hmem_iface iface, void *iov_base, hints->dmabuf_offset = offset; hints->dmabuf_valid = true; + /* Need to cache DMA buf FD to release later. 
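+	 * The FD is handed back via ofi_hmem_put_dmabuf_fd() in the unmap
+	 * and error paths.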
*/ + md->dmabuf_fd = dmabuf_fd; + md->dmabuf_fd_valid = true; + return FI_SUCCESS; } @@ -112,7 +110,18 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) CXIP_WARN(MAP_FAIL_MSG, dom->lni->lni->id, entry->info.iov.iov_base, entry->info.iov.iov_len, map_flags, ret, fi_strerror(-ret)); - goto err; + goto err_free_dmabuf; + } + + /* If the md len is larger than the iov_len, the VA and len have + * been aligned to a larger page size. Update the cache memory + * region registered by returning -FI_EAGAIN. Note, that GPU memory + * cannot be aligned since the aligned iov_base may fall outside the + * valid device address. + */ + if (entry->info.iface == FI_HMEM_SYSTEM) { + entry->info.iov.iov_base = (void *)md->md->va; + entry->info.iov.iov_len = md->md->len; } /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be @@ -156,6 +165,9 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) err_unmap: cxil_unmap(md->md); +err_free_dmabuf: + if (md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(entry->info.iface, md->dmabuf_fd); err: md->dom = NULL; return ret; @@ -176,6 +188,9 @@ static void cxip_do_unmap(struct ofi_mr_cache *cache, if (md->handle_valid) ofi_hmem_dev_unregister(entry->info.iface, md->handle); + if (md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(entry->info.iface, md->dmabuf_fd); + ret = cxil_unmap(md->md); if (ret) CXIP_WARN("cxil_unmap failed: %d\n", ret); @@ -421,7 +436,7 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, &uncached_md->md); if (ret) { CXIP_WARN("cxil_map failed: %d:%s\n", ret, fi_strerror(-ret)); - goto err_free_uncached_md; + goto err_free_dmabuf; } /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be @@ -461,8 +476,12 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, return FI_SUCCESS; + err_unmap: cxil_unmap(uncached_md->md); +err_free_dmabuf: + if (uncached_md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(attr->iface, uncached_md->dmabuf_fd); err_free_uncached_md: free(uncached_md); @@ -475,10 +494,15 @@ static void cxip_map_get_mem_region_size(const void *buf, unsigned long len, { int ret; - ret = ofi_hmem_get_base_addr(iface, buf, len, out_buf, out_len); - if (ret) { + if (iface == FI_HMEM_SYSTEM) { *out_buf = (void *)buf; *out_len = len; + } else { + ret = ofi_hmem_get_base_addr(iface, buf, len, out_buf, out_len); + if (ret) { + *out_buf = (void *)buf; + *out_len = len; + } } CXIP_DBG("%s: User addr=%p User len=%lu Region addr=%p Region len=0x%lx\n", @@ -565,6 +589,9 @@ static void cxip_unmap_nocache(struct cxip_md *md) { int ret; + if (md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(md->info.iface, md->dmabuf_fd); + if (md->handle_valid) ofi_hmem_dev_unregister(md->info.iface, md->handle); diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c index 6d088e21262..7fbd6e6187a 100644 --- a/prov/cxi/src/cxip_mr.c +++ b/prov/cxi/src/cxip_mr.c @@ -198,6 +198,29 @@ static int cxip_mr_enable_std(struct cxip_mr *mr) return FI_SUCCESS; } +/* If MR event counts are recorded then we can check event counts to determine + * if invalidate can be skipped. 
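+ * Progress is polled until the counts converge or the caller's timeout
+ * (one of the mr_cache_events_disable*_poll_nsecs settings) expires.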
+ */ +static bool cxip_mr_disable_check_count_events(struct cxip_mr *mr, + uint64_t timeout) +{ + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + uint64_t end = ofi_gettime_ns() + timeout; + + while (true) { + + if (ofi_atomic_get32(&mr->match_events) == + ofi_atomic_get32(&mr->access_events)) + return true; + + if (ofi_gettime_ns() >= end) + return false; + + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } +} + /* * cxip_mr_disable_std() - Free HW resources from the standard MR. * @@ -207,35 +230,45 @@ static int cxip_mr_disable_std(struct cxip_mr *mr) { int ret; struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + bool count_events_disabled; /* TODO: Handle -FI_EAGAIN. */ ret = cxip_pte_unlink(ep_obj->ctrl.pte, C_PTL_LIST_PRIORITY, mr->req.req_id, ep_obj->ctrl.tgq); - assert(ret == FI_SUCCESS); + if (ret != FI_SUCCESS) + CXIP_FATAL("Unable to queue unlink command: %d\n", ret); do { sched_yield(); cxip_ep_tgt_ctrl_progress_locked(ep_obj); } while (mr->mr_state != CXIP_MR_UNLINKED); - /* If MR event counts are recorded then we can check event counts - * to determine if invalidate can be skipped. - */ - if (!mr->count_events || ofi_atomic_get32(&mr->match_events) != - ofi_atomic_get32(&mr->access_events)) { - /* TODO: Temporary debug helper for DAOS to track if - * Match events detect a need to flush. - */ - if (mr->count_events) - CXIP_WARN("Match events required pte LE invalidate\n"); + if (mr->count_events) { + count_events_disabled = cxip_mr_disable_check_count_events(mr, cxip_env.mr_cache_events_disable_poll_nsecs); + if (count_events_disabled) + goto disabled_success; - ret = cxil_invalidate_pte_le(ep_obj->ctrl.pte->pte, mr->key, - C_PTL_LIST_PRIORITY); - if (ret) - CXIP_WARN("MR %p key 0x%016lX invalidate failed %d\n", - mr, mr->key, ret); + CXIP_WARN("Match events required pte LE invalidate: match_events=%u access_events=%u\n", + ofi_atomic_get32(&mr->match_events), + ofi_atomic_get32(&mr->access_events)); + } + + ret = cxil_invalidate_pte_le(ep_obj->ctrl.pte->pte, mr->key, + C_PTL_LIST_PRIORITY); + if (ret) + CXIP_FATAL("MR %p key 0x%016lX invalidate failed %d\n", mr, + mr->key, ret); + + /* For LE invalidate and MR events, need to flush event queues until + * access equals match. + */ + if (mr->count_events) { + count_events_disabled = cxip_mr_disable_check_count_events(mr, cxip_env.mr_cache_events_disable_le_poll_nsecs); + if (!count_events_disabled) + CXIP_FATAL("Failed LE MR invalidation\n"); } +disabled_success: mr->enabled = false; CXIP_DBG("Standard MR disabled: %p (key: 0x%016lX)\n", mr, mr->key); @@ -281,7 +314,9 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) uint32_t le_flags; uint64_t ib = 0; int pid_idx; + bool target_relaxed_order; + target_relaxed_order = cxip_ep_obj_mr_relaxed_order(ep_obj); mr->req.cb = cxip_mr_cb; ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl.tgt_evtq, @@ -307,15 +342,15 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) goto err_pte_free; } - ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl.tgq, C_PTLTE_ENABLED, 0); + ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl.tgq, C_PTLTE_ENABLED, + CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { /* This is a bug, we have exclusive access to this CMDQ. 
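* No other user shares this control command queue, so a failure here is not * transient back-pressure that can be retried; the warning below treats it * as a provider bug and the PtlTE is freed.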
*/ CXIP_WARN("Failed to enqueue command: %d\n", ret); goto err_pte_free; } - le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | - C_LE_UNRESTRICTED_BODY_RO; + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE; if (mr->attr.access & FI_REMOTE_WRITE) le_flags |= C_LE_OP_PUT; if (mr->attr.access & FI_REMOTE_READ) @@ -323,15 +358,10 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) if (mr->cntr) le_flags |= C_LE_EVENT_CT_COMM; - /* When FI_FENCE is not requested, restricted operations can used PCIe - * relaxed ordering. Unrestricted operations PCIe relaxed ordering is - * controlled by an env for now. - */ - if (!(ep_obj->caps & FI_FENCE)) { + if (target_relaxed_order) { ib = 1; - - if (cxip_env.enable_unrestricted_end_ro) - le_flags |= C_LE_UNRESTRICTED_END_RO; + le_flags |= C_LE_UNRESTRICTED_END_RO | + C_LE_UNRESTRICTED_BODY_RO; } ret = cxip_pte_append(mr->pte, @@ -442,7 +472,9 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) struct cxip_mr *_mr; uint32_t le_flags; uint64_t ib = 0; + bool target_relaxed_order; + target_relaxed_order = cxip_ep_obj_mr_relaxed_order(ep_obj); mr_cache = &ep_obj->ctrl.opt_mr_cache[lac]; ofi_atomic_inc32(&mr_cache->ref); @@ -501,7 +533,7 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) } ret = cxip_pte_set_state(_mr->pte, ep_obj->ctrl.tgq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { /* This is a bug, we have exclusive access to this CMDQ. */ CXIP_WARN("Failed to enqueue command: %d\n", ret); @@ -509,17 +541,12 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) } le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | - C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | C_LE_OP_GET; + C_LE_OP_PUT | C_LE_OP_GET; - /* When FI_FENCE is not requested, restricted operations can used PCIe - * relaxed ordering. Unrestricted operations PCIe relaxed ordering is - * controlled by an env for now. - */ - if (!(ep_obj->caps & FI_FENCE)) { + if (target_relaxed_order) { ib = 1; - - if (cxip_env.enable_unrestricted_end_ro) - le_flags |= C_LE_UNRESTRICTED_END_RO; + le_flags |= C_LE_UNRESTRICTED_END_RO | + C_LE_UNRESTRICTED_BODY_RO; } ret = cxip_pte_append(_mr->pte, 0, -1ULL, lac, @@ -601,6 +628,9 @@ static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) union cxip_match_bits mb; union cxip_match_bits ib; uint32_t le_flags; + bool target_relaxed_order; + + target_relaxed_order = cxip_ep_obj_mr_relaxed_order(ep_obj); /* TODO: Handle enabling for each bound endpoint */ mr_cache = &ep_obj->ctrl.std_mr_cache[lac]; @@ -643,8 +673,10 @@ static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) ib.mr_lac = 0; ib.mr_cached = 0; - le_flags = C_LE_EVENT_SUCCESS_DISABLE | C_LE_UNRESTRICTED_BODY_RO | - C_LE_OP_PUT | C_LE_OP_GET; + le_flags = C_LE_EVENT_SUCCESS_DISABLE | C_LE_OP_PUT | C_LE_OP_GET; + if (target_relaxed_order) + le_flags |= C_LE_UNRESTRICTED_END_RO | + C_LE_UNRESTRICTED_BODY_RO; ret = cxip_pte_append(ep_obj->ctrl.pte, 0, -1ULL, mb.mr_lac, C_PTL_LIST_PRIORITY, @@ -725,6 +757,14 @@ static void cxip_mr_domain_remove(struct cxip_mr *mr) ofi_spin_unlock(&mr->domain->mr_domain.lock); } +static bool cxip_is_valid_mr_key(uint64_t key) +{ + if (key & ~CXIP_MR_KEY_MASK) + return false; + + return true; +} + /* * cxip_mr_domain_insert() - Validate uniqueness and insert * client key in the domain hash table. 
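* Client keys must fit within CXIP_MR_KEY_MASK, as checked by * cxip_is_valid_mr_key() above; oversized keys are rejected with * -FI_EKEYREJECTED before any hash-table insertion is attempted.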
@@ -744,7 +784,7 @@ static int cxip_mr_domain_insert(struct cxip_mr *mr) mr->key = mr->attr.requested_key; - if (!cxip_generic_is_valid_mr_key(mr->key)) + if (!cxip_is_valid_mr_key(mr->key)) return -FI_EKEYREJECTED; bucket = fasthash64(&mr->key, sizeof(mr->key), 0) % @@ -818,14 +858,6 @@ static int cxip_prov_cache_init_mr_key(struct cxip_mr *mr, return FI_SUCCESS; } -static bool cxip_is_valid_mr_key(uint64_t key) -{ - if (key & ~CXIP_MR_KEY_MASK) - return false; - - return true; -} - static bool cxip_is_valid_prov_mr_key(uint64_t key) { struct cxip_mr_key cxip_key = { @@ -1250,6 +1282,15 @@ static int cxip_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) break; } + /* Zero length MRs do not have MD. */ + if (mr->md && + ep->ep_obj->require_dev_reg_copy[mr->md->info.iface] && + !mr->md->handle_valid) { + CXIP_WARN("Cannot bind to endpoint without required dev reg support\n"); + ret = -FI_EOPNOTSUPP; + break; + } + mr->ep = ep; ofi_atomic_inc32(&ep->ep_obj->ref); break; @@ -1406,6 +1447,10 @@ static int cxip_regattr(struct fid *fid, const struct fi_mr_attr *attr, _mr->mr_fid.key = _mr->key; if (_mr->len) { + /* Do not check whether cuda_api_permitted is set at this point, + * because the mr is not bound to an endpoint. Check instead in + * cxip_mr_bind(). + */ ret = cxip_map(_mr->domain, (void *)_mr->buf, _mr->len, 0, &_mr->md); if (ret) { diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c index 4d3830dc18f..a8309847802 100644 --- a/prov/cxi/src/cxip_msg.c +++ b/prov/cxi/src/cxip_msg.c @@ -23,26 +23,25 @@ /* * cxip_recv_req_src_addr() - Translate request source address to FI address. */ -fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req) +fi_addr_t cxip_recv_req_src_addr(struct cxip_rxc *rxc, + uint32_t init, uint16_t vni, + bool force) { - struct cxip_rxc *rxc = req->recv.rxc; - /* If the FI_SOURCE capability is enabled, convert the initiator's * address to an FI address to be reported in a CQ event. If * application AVs are symmetric, the match_id in the EQ event is * logical and translation is not needed. Otherwise, translate the * physical address in the EQ event to logical FI address. 
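* A condensed view of the two cases handled below: * symmetric AV: fi_addr = CXI_MATCH_ID_EP(pid_bits, init); * otherwise: addr = { .nic, .pid, .vni }; fi_addr = cxip_av_lookup_fi_addr(av, &addr);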
*/ - if (rxc->attr.caps & FI_SOURCE) { + if ((rxc->attr.caps & FI_SOURCE) || force) { struct cxip_addr addr = {}; if (rxc->ep_obj->av->symmetric) - return CXI_MATCH_ID_EP(rxc->pid_bits, - req->recv.initiator); + return CXI_MATCH_ID_EP(rxc->pid_bits, init); - addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator); - addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, req->recv.initiator); - addr.vni = req->recv.vni; + addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + addr.vni = vni; return cxip_av_lookup_fi_addr(rxc->ep_obj->av, &addr); } @@ -61,7 +60,6 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, int (*recv_cb)(struct cxip_req *req, const union c_event *event)) { - struct cxip_domain *dom = rxc->domain; struct cxip_req *req; struct cxip_md *recv_md = NULL; int ret; @@ -80,7 +78,8 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, if (len) { /* If hybrid descriptor not passed, map for dma */ if (!md) { - ret = cxip_map(dom, (void *)buf, len, 0, &recv_md); + ret = cxip_ep_obj_map(rxc->ep_obj, (void *)buf, len, 0, + &recv_md); if (ret) { RXC_WARN(rxc, "Map of recv buffer failed: %d, %s\n", @@ -104,7 +103,7 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, dlist_init(&req->recv.children); dlist_init(&req->recv.rxc_entry); - ofi_atomic_inc32(&rxc->orx_reqs); + cxip_rxc_orx_reqs_inc(rxc); *cxip_req = req; return FI_SUCCESS; @@ -118,16 +117,20 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, void cxip_recv_req_free(struct cxip_req *req) { struct cxip_rxc *rxc = req->recv.rxc; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(rxc); assert(req->type == CXIP_REQ_RECV); assert(dlist_empty(&req->recv.children)); assert(dlist_empty(&req->recv.rxc_entry)); - ofi_atomic_dec32(&rxc->orx_reqs); + cxip_rxc_orx_reqs_dec(rxc); if (req->recv.recv_md && !req->recv.hybrid_md) cxip_unmap(req->recv.recv_md); + if (owner_srx && req->rx_entry) + owner_srx->owner_ops->free_entry(req->rx_entry); + cxip_evtq_req_free(req); } @@ -150,7 +153,8 @@ static inline int recv_req_event_success(struct cxip_rxc *rxc, } if (req->recv.rxc->attr.caps & FI_SOURCE) { - src_addr = cxip_recv_req_src_addr(req); + src_addr = cxip_recv_req_src_addr(req->recv.rxc, req->recv.initiator, + req->recv.vni, false); if (src_addr != FI_ADDR_NOTAVAIL || !(rxc->attr.caps & FI_SOURCE_ERR)) return cxip_cq_req_complete_addr(req, src_addr); @@ -217,7 +221,12 @@ void cxip_recv_req_report(struct cxip_req *req) parent->recv.mrecv_bytes == parent->recv.mrecv_unlink_bytes) unlinked = true; } else { - if ((parent->recv.ulen - parent->recv.mrecv_bytes) < rxc->min_multi_recv) + parent->recv.multirecv_inflight--; + assert(parent->recv.multirecv_inflight >= 0); + + if (!parent->recv.multirecv_inflight && + ((parent->recv.ulen - parent->recv.mrecv_bytes) < + rxc->min_multi_recv)) unlinked = true; } @@ -314,6 +323,9 @@ struct cxip_req *cxip_mrecv_req_dup(struct cxip_req *mrecv_req) /* Update fields specific to this Send */ req->recv.parent = mrecv_req; + /* Parent keeps track of operations in flight */ + mrecv_req->recv.multirecv_inflight++; + /* Start pointer and data_len must be set elsewhere! */ return req; @@ -460,7 +472,7 @@ int cxip_recv_cancel(struct cxip_req *req) /* In hybrid mode requests could be on priority list * or software receive list. 
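* The hw_offloaded flag used below replaces the old software_list flag with * the opposite sense: a request not yet offloaded to hardware can be * canceled immediately off the software receive list, while an offloaded * request must take the unlink path instead.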
*/ - if (req->recv.software_list) { + if (!req->recv.hw_offloaded) { dlist_remove_init(&req->recv.rxc_entry); req->recv.canceled = true; req->recv.unlinked = true; @@ -526,7 +538,7 @@ int cxip_flush_appends(struct cxip_rxc_hpc *rxc, ret = -FI_EAGAIN; goto err; } - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); rxc->base.rx_evtq.ack_batch_size = 1; @@ -553,7 +565,7 @@ int cxip_flush_appends(struct cxip_rxc_hpc *rxc, return FI_SUCCESS; err_dec_free_cq_req: - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); err: return ret; @@ -575,6 +587,7 @@ int cxip_recv_req_dropped(struct cxip_req *req) assert(rxc->base.protocol == FI_PROTO_CXI); assert(dlist_empty(&req->recv.rxc_entry)); + req->recv.hw_offloaded = false; dlist_insert_tail(&req->recv.rxc_entry, &rxc->replay_queue); RXC_DBG(rxc, "Receive dropped: %p\n", req); @@ -705,8 +718,8 @@ int cxip_send_buf_init(struct cxip_req *req) /* Triggered operation always requires memory registration. */ if (req->triggered) - return cxip_map(txc->domain, req->send.buf, req->send.len, 0, - &req->send.send_md); + return cxip_ep_obj_map(txc->ep_obj, req->send.buf, + req->send.len, 0, &req->send.send_md); /* FI_INJECT operations always require an internal bounce buffer. This * is needed to replay FI_INJECT operations which may experience flow @@ -764,8 +777,8 @@ int cxip_send_buf_init(struct cxip_req *req) } /* Everything else requires memory registeration. */ - return cxip_map(txc->domain, req->send.buf, req->send.len, 0, - &req->send.send_md); + return cxip_ep_obj_map(txc->ep_obj, req->send.buf, req->send.len, 0, + &req->send.send_md); err_buf_fini: cxip_send_buf_fini(req); diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c index 5d68d40c51a..faf2b52b9fc 100644 --- a/prov/cxi/src/cxip_msg_hpc.c +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -629,8 +629,9 @@ static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req, /* Copy data out of overflow buffer. */ oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len); - cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf, - oflow_va, oflow_bytes); + cxip_ep_obj_copy_to_md(match_req->recv.rxc->ep_obj, + match_req->recv.recv_md, + match_req->recv.recv_buf, oflow_va, oflow_bytes); if (oflow_req->type == CXIP_REQ_OFLOW) oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength); @@ -1066,7 +1067,7 @@ int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event) */ cxip_report_send_completion(put_req, true); - ofi_atomic_dec32(&put_req->send.txc->otx_reqs); + cxip_txc_otx_reqs_dec(put_req->send.txc); cxip_evtq_req_free(put_req); return FI_SUCCESS; @@ -1337,12 +1338,13 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) if (req->recv.multi_recv && !req->recv.rdzv_events) { dlist_remove(&req->recv.children); + req->recv.parent->recv.multirecv_inflight--; cxip_evtq_req_free(req); } return -FI_EAGAIN; } - RXC_DBG(rxc, "Software issued Get, req: %p\n", req); + RXC_DBG(rxc, "Software issued RGet, req: %p\n", req); } /* Count the rendezvous event. */ @@ -1357,17 +1359,22 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) } /* If a rendezvous operation requires a done notification - * send it. Must wait for the ACK from the notify to be returned - * before completing the target operation. + * it was initiated by software. Re-use the existing + * rendezvous get TX credit. 
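+ * (Only a software-issued RGet consumes one of these TX credits, which is + * why the credit release further below is gated on + * !event->init_short.rendezvous.)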
Need to wait for the ACK from + * the done notify to be returned before releasing the + * TX credit and completing the target operation. */ - if (req->recv.done_notify) { - if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > - rxc->base.max_tx || cxip_rdzv_done_notify(req)) { + if (req->recv.done_notify && cxip_rdzv_done_notify(req)) + return -FI_EAGAIN; - /* Could not issue notify, will be retried */ - ofi_atomic_dec32(&rxc->orx_tx_reqs); - return -FI_EAGAIN; - } + /* If RGet initiated by software return the TX credit unless + * it will be used for sending an alt_read done_notify message. + */ + if (!event->init_short.rendezvous && + !req->recv.done_notify) { + ofi_atomic_dec32(&req->recv.rxc_hpc->orx_tx_reqs); + assert(ofi_atomic_get32(&req->recv.rxc_hpc->orx_tx_reqs) + >= 0); } /* Rendezvous Get completed, update event counts and @@ -1376,13 +1383,6 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) req->recv.rc = cxi_init_event_rc(event); rdzv_recv_req_event(req, event->hdr.event_type); - /* If RGet initiated by software return the TX credit */ - if (!event->init_short.rendezvous) { - ofi_atomic_dec32(&req->recv.rxc_hpc->orx_tx_reqs); - assert(ofi_atomic_get32(&req->recv.rxc_hpc->orx_tx_reqs) - >= 0); - } - return FI_SUCCESS; case C_EVENT_ACK: @@ -1394,7 +1394,7 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) /* Special case of the ZBP destination EQ being full and ZBP * could not complete. This must be retried, we use the TX - * credit already allocated. + * credit already allocated for the done notify. */ if (event_rc == C_RC_ENTRY_NOT_FOUND) { usleep(CXIP_DONE_NOTIFY_RETRY_DELAY_US); @@ -2059,14 +2059,14 @@ static void cxip_ux_onload_complete(struct cxip_req *req) rxc->sw_pending_ux_list_len = 0; RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n", - rxc->sw_ux_list_len); + rxc->sw_ux_list_len); if (rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) cxip_post_ux_onload_sw(rxc); else cxip_post_ux_onload_fc(rxc); - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); } @@ -2126,6 +2126,7 @@ static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) struct cxip_deferred_event *def_ev; struct cxip_ux_send *ux_send; bool matched; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL || rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || @@ -2180,8 +2181,13 @@ static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) } rxc->cur_ule_offsets++; - dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); - rxc->sw_ux_list_len++; + /* TODO: support onloading in peer mode */ + if (owner_srx) { + RXC_FATAL(rxc, "Software onloading is currently not supported in peer mode\n"); + } else { + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + } RXC_DBG(rxc, "Onloaded Send: %p\n", ux_send); @@ -2253,7 +2259,7 @@ static int cxip_ux_onload(struct cxip_rxc_hpc *rxc) ret = -FI_EAGAIN; goto err_free_onload_offset; } - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); req->cb = cxip_ux_onload_cb; req->type = CXIP_REQ_SEARCH; @@ -2279,7 +2285,7 @@ static int cxip_ux_onload(struct cxip_rxc_hpc *rxc) return FI_SUCCESS; err_dec_free_cq_req: - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); err_free_onload_offset: free(rxc->ule_offsets); @@ -2304,7 +2310,7 @@ static int 
cxip_flush_appends_cb(struct cxip_req *req, ret = cxip_ux_onload(rxc); if (ret == FI_SUCCESS) { - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); } @@ -3034,7 +3040,9 @@ static void cxip_set_ux_dump_entry(struct cxip_req *req, } if (src_addr && req->recv.rxc->attr.caps & FI_SOURCE) - *src_addr = cxip_recv_req_src_addr(req); + *src_addr = cxip_recv_req_src_addr(req->recv.rxc, + req->recv.initiator, + req->recv.vni, false); } } @@ -3179,19 +3187,23 @@ static int cxip_recv_sw_matched(struct cxip_req *req, /* Make sure we can issue the RGet; if not we stall * and TX event queue progress will free up credits. */ - if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->base.max_tx) { - ofi_atomic_dec32(&rxc->orx_tx_reqs); - return -FI_EAGAIN; - } + do { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) <= + rxc->base.max_tx) + break; - ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, - mrecv_start, mrecv_len, req_done); - if (ret != FI_SUCCESS) { - req->recv.start_offset -= mrecv_len; ofi_atomic_dec32(&rxc->orx_tx_reqs); + cxip_evtq_progress(&rxc->base.ep_obj->txc->tx_evtq); + } while (true); - return ret; - } + do { + ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, + mrecv_start, mrecv_len, req_done); + if (ret == FI_SUCCESS) + break; + + cxip_evtq_progress(&rxc->base.ep_obj->txc->tx_evtq); + } while (true); /* If multi-recv, a child request was created from * cxip_ux_send(). Need to lookup this request. @@ -3243,7 +3255,7 @@ static int cxip_recv_sw_matched(struct cxip_req *req, if (ret != FI_SUCCESS) { /* undo mrecv_req_put_bytes() */ - req->recv.start_offset -= mrecv_len; + req->recv.start_offset = mrecv_start; return ret; } } @@ -3314,6 +3326,192 @@ static int cxip_recv_sw_matcher(struct cxip_rxc_hpc *rxc, struct cxip_req *req, return ret; } +static int +cxip_recv_req_init(struct cxip_rxc *rxc, void *buf, size_t len, fi_addr_t addr, + uint64_t tag, uint64_t ignore, uint64_t flags, bool tagged, + void *context, struct cxip_cntr *comp_cntr, + struct cxip_req **req_out) +{ + struct cxip_req *req; + uint32_t match_id; + int ret; + uint16_t vni; + + if (len && !buf) { + ret = -FI_EINVAL; + goto err; + } + + if (rxc->state == RXC_DISABLED) { + ret = -FI_EOPBADSTATE; + goto err; + } + + /* HW to SW PtlTE transition, ensure progress is made */ + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(rxc->recv_cq); + ret = -FI_EAGAIN; + goto err; + } + + if (tagged) { + if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { + RXC_WARN(rxc, + "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", + tag, ignore, CXIP_TAG_MASK); + ret = -FI_EINVAL; + goto err; + } + flags &= ~FI_MULTI_RECV; + } + + ret = cxip_set_recv_match_id(rxc, addr, rxc->ep_obj->av_auth_key && + (flags & FI_AUTH_KEY), &match_id, &vni); + if (ret) { + RXC_WARN(rxc, "Error setting match_id: %d %s\n", + ret, fi_strerror(-ret)); + goto err; + } + + ofi_genlock_lock(&rxc->ep_obj->lock); + ret = cxip_recv_req_alloc(rxc, buf, len, NULL, &req, cxip_recv_cb); + ofi_genlock_unlock(&rxc->ep_obj->lock); + if (ret) + return ret; + + /* req->data_len, req->tag, req->data must be set later. req->buf may + * be overwritten later. + */ + req->context = (uint64_t)context; + + req->flags = FI_RECV | (flags & FI_COMPLETION); + if (tagged) + req->flags |= FI_TAGGED; + else + req->flags |= FI_MSG; + + req->recv.cntr = comp_cntr ? 
comp_cntr : rxc->recv_cntr; + req->recv.match_id = match_id; + req->recv.tag = tag; + req->recv.ignore = ignore; + req->recv.flags = flags; + req->recv.tagged = tagged; + req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); + + *req_out = req; + + return FI_SUCCESS; + +err: + return ret; +} + +int cxip_unexp_start(struct fi_peer_rx_entry *rx_entry) +{ + int ret; + struct cxip_ux_send *ux; + union cxip_match_bits ux_mb; + struct cxip_req *req; + struct cxip_rxc *rxc; + + ux = rx_entry->peer_context; + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + rxc = ux->rxc; + + ret = cxip_recv_req_init(rxc, rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len, rx_entry->addr, + rx_entry->tag, 0, rx_entry->flags, + ux_mb.tagged, rx_entry->context, NULL, &req); + if (ret) + return ret; + + req->rx_entry = rx_entry; + + ret = cxip_recv_sw_matched(req, ux); + if (ret == -FI_EAGAIN) + return ret; + + /* FI_EINPROGRESS is returned for a multi-recv match. */ + assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); + + if (ux->req && ux->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux); + else + free(ux); + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p\n", req, ux); + + return ret; +} + +static int cxip_process_srx_ux_matcher(struct cxip_rxc *rxc, + struct fid_peer_srx *owner_srx, struct cxip_ux_send *ux) +{ + int ret; + uint32_t ux_init; + union cxip_match_bits ux_mb; + struct fi_peer_rx_entry *rx_entry = NULL; + struct cxip_req *req; + uint16_t vni; + struct fi_peer_match_attr match = {0}; + + /* stash the rxc because we're going to need it if the peer + * address isn't already inserted into the AV table. + */ + ux->rxc = rxc; + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + vni = ux->put_ev.tgt_long.vni; + + match.addr = cxip_recv_req_src_addr(rxc, ux_init, vni, true); + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + + if (ux_mb.tagged) { + match.tag = ux_mb.tag; + ret = owner_srx->owner_ops->get_tag(owner_srx, &match, &rx_entry); + } else { + ret = owner_srx->owner_ops->get_msg(owner_srx, &match, &rx_entry); + } + + /* return it to the caller */ + ux->rx_entry = rx_entry; + + if (ret == -FI_ENOENT) { + /* this is used when the owner calls start_msg */ + rx_entry->peer_context = ux; + return -FI_ENOMSG; + } else if (ret) { + return ret; + } + + ret = cxip_recv_req_init(rxc, rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len, rx_entry->addr, + rx_entry->tag, 0, rx_entry->flags, + ux_mb.tagged, rx_entry->context, NULL, &req); + if (ret) + return ret; + + req->rx_entry = rx_entry; + + ret = cxip_recv_sw_matched(req, ux); + if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + + /* FI_EINPROGRESS is returned for a multi-recv match. */ + assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); + + if (ux->req && ux->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux); + else + free(ux); + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p\n", req, ux); + + return ret; +} + /* * cxip_recv_ux_sw_matcher() - Attempt to match an unexpected message to a user * posted receive.
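* When an owner SRX (peer provider) is registered, matching is instead * delegated to the owner's get_tag()/get_msg() callbacks through * cxip_process_srx_ux_matcher() above, bypassing the internal sw_recv_queue.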
@@ -3324,10 +3522,17 @@ int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux) { struct cxip_ptelist_buf *rbuf = ux->req->req_ctx; struct cxip_rxc_hpc *rxc = rbuf->rxc; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); struct cxip_req *req; struct dlist_entry *tmp; int ret; + if (owner_srx) { + /* we never add anything on the sw_ux_list */ + rxc->sw_ux_list_len--; + return cxip_process_srx_ux_matcher(&rxc->base, owner_srx, ux); + } + if (dlist_empty(&rxc->sw_recv_queue)) return -FI_ENOMSG; @@ -3464,8 +3669,7 @@ static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq) if (ret) goto err_dequeue_req; } else { - - req->recv.software_list = true; + req->recv.hw_offloaded = false; dlist_insert_tail(&req->recv.rxc_entry, &rxc->sw_recv_queue); } @@ -3726,7 +3930,8 @@ static int cxip_rxc_hpc_msg_init(struct cxip_rxc *rxc_base) } /* Start accepting Puts. */ - ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, state, 0); + ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, state, + CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); goto free_oflow_buf; @@ -3866,7 +4071,7 @@ static int cxip_rxc_check_recv_count_hybrid_preempt(struct cxip_rxc *rxc) if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && cxip_env.hybrid_posted_recv_preemptive == 1) { - count = ofi_atomic_get32(&rxc->orx_reqs); + count = cxip_rxc_orx_reqs_get(rxc); if (count > rxc->attr.size) { assert(rxc->state == RXC_ENABLED); @@ -3985,71 +4190,16 @@ cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, void *desc, int ret; struct cxip_req *req; struct cxip_ux_send *ux_msg; - uint32_t match_id; - uint16_t vni; assert(rxc_hpc->base.protocol == FI_PROTO_CXI); - if (len && !buf) - return -FI_EINVAL; - - if (rxc->state == RXC_DISABLED) - return -FI_EOPBADSTATE; - - /* HW to SW PtlTE transition, ensure progress is made */ - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - cxip_cq_progress(rxc->recv_cq); - return -FI_EAGAIN; - } - - if (tagged) { - if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { - RXC_WARN(rxc, - "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", - tag, ignore, CXIP_TAG_MASK); - return -FI_EINVAL; - } - } - - ret = cxip_set_recv_match_id(rxc, src_addr, rxc->ep_obj->av_auth_key && - (flags & FI_AUTH_KEY), &match_id, &vni); - if (ret) { - RXC_WARN(rxc, "Error setting match_id: %d %s\n", - ret, fi_strerror(-ret)); - return ret; - } - - ofi_genlock_lock(&rxc->ep_obj->lock); - ret = cxip_recv_req_alloc(rxc, buf, len, NULL, &req, cxip_recv_cb); + ret = cxip_recv_req_init(rxc, buf, len, src_addr, tag, ignore, flags, + tagged, context, comp_cntr, &req); if (ret) goto err; - /* req->data_len, req->tag, req->data must be set later. req->buf may - * be overwritten later. - */ - req->context = (uint64_t)context; - - req->flags = FI_RECV | (flags & FI_COMPLETION); - if (tagged) - req->flags |= FI_TAGGED; - else - req->flags |= FI_MSG; - - req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; - req->recv.match_id = match_id; - req->recv.tag = tag; - req->recv.ignore = ignore; - req->recv.flags = flags; - req->recv.tagged = tagged; - req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); - - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - ret = -FI_EAGAIN; - goto err_free_request; - } - + ofi_genlock_lock(&rxc->ep_obj->lock); if (!(req->recv.flags & (FI_PEEK | FI_CLAIM))) { - ret = cxip_recv_req_queue(req, false); /* Match made in software? 
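* (-FI_EALREADY from cxip_recv_req_queue() indicates the request matched a * previously onloaded unexpected message, so the receive completes without a * hardware append.)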
*/ if (ret == -FI_EALREADY) { @@ -4108,9 +4258,8 @@ cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, void *desc, err_free_request: cxip_recv_req_free(req); -err: ofi_genlock_unlock(&rxc->ep_obj->lock); - +err: return ret; } @@ -4125,7 +4274,7 @@ static void rdzv_send_req_complete(struct cxip_req *req) cxip_report_send_completion(req, true); - ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->send.txc); cxip_evtq_req_free(req); } @@ -4460,7 +4609,7 @@ static int cxip_send_eager_cb(struct cxip_req *req, /* If MATCH_COMPLETE was requested, software must manage counters. */ cxip_report_send_completion(req, match_complete); - ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->send.txc); cxip_evtq_req_free(req); return FI_SUCCESS; @@ -4885,7 +5034,7 @@ int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid, * a TXC credit for replay. _cxip_send_req() will take the * credit again. */ - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); /* -FI_EAGAIN can be return if the command queue is full. Loop * until this goes through. @@ -5159,7 +5308,7 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, } /* Restrict outstanding success event requests to queue size */ - if (ofi_atomic_get32(&txc->otx_reqs) >= txc->attr.size) { + if (cxip_txc_otx_reqs_get(txc) >= txc->attr.size) { ret = -FI_EAGAIN; goto err_req_free; } diff --git a/prov/cxi/src/cxip_msg_rnr.c b/prov/cxi/src/cxip_msg_rnr.c index b5ae1410e7d..434968ecd92 100644 --- a/prov/cxi/src/cxip_msg_rnr.c +++ b/prov/cxi/src/cxip_msg_rnr.c @@ -339,7 +339,7 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) dlist_init(&req->recv.rxc_entry); /* Selective does not count toward outstanding RX operations */ - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); ret = cxip_recv_req_alloc(&rxc->base, NULL, 0, NULL, &req, cxip_rnr_recv_selective_comp_cb); @@ -359,7 +359,7 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) dlist_init(&req->recv.rxc_entry); /* Selective does not count toward outstanding RX operations */ - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); rxc->hybrid_mr_desc = true; } @@ -382,7 +382,7 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) /* Start accepting Puts. */ ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); goto free_pte; @@ -400,12 +400,12 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) cxip_pte_free(rxc->base.rx_pte); free_req_tag: if (rxc->req_selective_comp_tag) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_tag); } free_req_msg: if (rxc->req_selective_comp_msg) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_msg); } @@ -423,11 +423,11 @@ static int cxip_rxc_rnr_msg_fini(struct cxip_rxc *rxc_base) * back before freeing. 
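* (cxip_recv_req_free() unconditionally decrements the outstanding-RX count, * so the increments below rebalance it for these requests, which were * deliberately excluded from the count when they were set up.)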
*/ if (rxc->req_selective_comp_msg) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_msg); } if (rxc->req_selective_comp_tag) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_tag); } @@ -827,7 +827,7 @@ static int cxip_process_rnr_time_wait(struct cxip_txc_rnr *txc) ofi_atomic_dec32(&txc->time_wait_reqs); cxip_send_buf_fini(req); cxip_report_send_completion(req, true); - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); cxip_evtq_req_free(req); continue; @@ -836,10 +836,10 @@ static int cxip_process_rnr_time_wait(struct cxip_txc_rnr *txc) /* Must TX return credit, will take it back if * we could not send. */ - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); ret = cxip_rnr_msg_send(req); if (ret != FI_SUCCESS) { - ofi_atomic_inc32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_inc(&txc->base); goto reset_min_time_wait; } @@ -1031,7 +1031,7 @@ static int cxip_rnr_send_cb(struct cxip_req *req, const union c_event *event) req->send.caddr.nic, req->send.caddr.pid, req->send.tagged ? '*' : '-', req->send.tag, req->send.retries, - ofi_atomic_get32(&txc->base.otx_reqs)); + cxip_txc_otx_reqs_get(&txc->base)); } cxip_rnr_send_req_dequeue(req); @@ -1054,7 +1054,7 @@ static int cxip_rnr_send_cb(struct cxip_req *req, const union c_event *event) cxip_report_send_completion(req, req->send.canceled); - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); cxip_evtq_req_free(req); return FI_SUCCESS; @@ -1147,7 +1147,7 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, } /* Restrict outstanding success event requests to queue size */ - if (ofi_atomic_get32(&txc->otx_reqs) > txc->attr.size) { + if (cxip_txc_otx_reqs_get(txc) > txc->attr.size) { ret = -FI_EAGAIN; goto free_req; } @@ -1174,9 +1174,9 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, if (send_req->send.len && !idc) { if (!mr) { - ret = cxip_map(txc->domain, send_req->send.buf, - send_req->send.len, 0, - &send_req->send.send_md); + ret = cxip_ep_obj_map(txc->ep_obj, send_req->send.buf, + send_req->send.len, 0, + &send_req->send.send_md); if (ret) { TXC_WARN(txc, "Local buffer map failed: %d %s\n", diff --git a/prov/cxi/src/cxip_ptelist_buf.c b/prov/cxi/src/cxip_ptelist_buf.c index b8ee08a3733..a313ccf0be4 100644 --- a/prov/cxi/src/cxip_ptelist_buf.c +++ b/prov/cxi/src/cxip_ptelist_buf.c @@ -132,8 +132,8 @@ cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) } } - ret = cxip_map(rxc->base.domain, buf->data, pool->attr.buf_size, - OFI_MR_NOCACHE, &buf->md); + ret = cxip_ep_obj_map(rxc->base.ep_obj, buf->data, pool->attr.buf_size, + OFI_MR_NOCACHE, &buf->md); if (ret) goto err_unreg_buf; diff --git a/prov/cxi/src/cxip_rdzv_pte.c b/prov/cxi/src/cxip_rdzv_pte.c index d99bda07f5c..ab2af82230f 100644 --- a/prov/cxi/src/cxip_rdzv_pte.c +++ b/prov/cxi/src/cxip_rdzv_pte.c @@ -265,7 +265,7 @@ static int cxip_rdzv_base_pte_alloc(struct cxip_txc_hpc *txc, /* Set to enable, event will be processed on link */ ret = cxip_pte_set_state(base_pte->pte, txc->rx_cmdq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to enqueue enable command: %d:%s\n", ret, fi_strerror(-ret)); diff --git a/prov/cxi/src/cxip_repsum.c b/prov/cxi/src/cxip_repsum.c index 6c0f5c93186..56ffe342a41 100644 --- a/prov/cxi/src/cxip_repsum.c +++ 
b/prov/cxi/src/cxip_repsum.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2022-2024 Cray Inc. All rights reserved. */ /* Notes: @@ -123,7 +123,7 @@ void cxip_dbl_to_rep(struct cxip_repsum *x, double d) // Subnormal values, zero x->M = BIN(1); w = OFF(1); - } + } /** * Copy the mantissa into the correct locations within T[]. diff --git a/prov/cxi/src/cxip_req_buf.c b/prov/cxi/src/cxip_req_buf.c index 4a4624c59b7..09b1432b214 100644 --- a/prov/cxi/src/cxip_req_buf.c +++ b/prov/cxi/src/cxip_req_buf.c @@ -150,10 +150,22 @@ static int cxip_req_buf_process_ux(struct cxip_ptelist_buf *buf, "rbuf=%p ux=%p sw_pending_ux_list_len=%u\n", buf, ux, buf->rxc->sw_pending_ux_list_len); } else { - dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); - RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", - buf, ux, buf->rxc->sw_ux_list_len); + if (owner_srx) { + union cxip_match_bits ux_mb; + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + + if (ux_mb.tagged) + owner_srx->owner_ops->queue_tag(ux->rx_entry); + else + owner_srx->owner_ops->queue_msg(ux->rx_entry); + } else { + dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", + buf, ux, buf->rxc->sw_ux_list_len); + } } break; diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c index 9c36addddd3..660c29862de 100644 --- a/prov/cxi/src/cxip_rma.c +++ b/prov/cxi/src/cxip_rma.c @@ -150,12 +150,46 @@ static int cxip_rma_cb(struct cxip_req *req, const union c_event *event) TXC_WARN(txc, "Failed to report error: %d\n", ret); } - ofi_atomic_dec32(&req->rma.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->rma.txc); cxip_evtq_req_free(req); return FI_SUCCESS; } +static bool cxip_rma_emit_dma_need_req(size_t len, uint64_t flags, + struct cxip_mr *mr) +{ + /* DMA commands with FI_INJECT always require a request structure to + * track the bounce buffer. + */ + if (len && (flags & FI_INJECT)) + return true; + + /* If the user requests FI_COMPLETION, a request structure is needed to + * return the user context. + * + * TODO: This can be optimized for zero byte operations. Specifically, + * the user context can be associated with the DMA command. But this + * requires reworking event queue processing. + */ + if (flags & FI_COMPLETION) + return true; + + /* If the user has provided their own MR, internal memory registration + * is not needed. Thus, no request structure is needed. + */ + if (mr) + return false; + + /* If the initiator buffer length is zero, no memory registration is + * needed. Thus, no request structure is needed. + */ + if (!len) + return false; + + return true; +} + static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, struct cxip_mr *mr, union c_fab_addr *dfa, uint8_t *idx_ext, uint16_t vni, uint64_t addr, @@ -169,7 +203,7 @@ static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, { struct cxip_req *req = NULL; struct cxip_md *dma_md = NULL; - void *dma_buf; + void *dma_buf = NULL; struct c_full_dma_cmd dma_cmd = {}; int ret; struct cxip_domain *dom = txc->domain; @@ -180,12 +214,7 @@ static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, if (!dom->hybrid_mr_desc) mr = NULL; - /* DMA commands always require a request structure regardless if - * FI_COMPLETION is set.
This is due to the provider doing internally - * memory registration and having to clean up the registration on DMA - * operation completion. - */ - if ((len && (flags & FI_INJECT)) || (flags & FI_COMPLETION) || !mr) { + if (cxip_rma_emit_dma_need_req(len, flags, mr)) { req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); if (!req) { ret = -FI_EAGAIN; @@ -240,7 +269,8 @@ static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, } else { assert(req != NULL); - ret = cxip_map(dom, buf, len, 0, &req->rma.local_md); + ret = cxip_ep_obj_map(txc->ep_obj, buf, len, 0, + &req->rma.local_md); if (ret) { TXC_WARN(txc, "Failed to map buffer: %d:%s\n", ret, fi_strerror(-ret)); @@ -503,10 +533,6 @@ static bool cxip_rma_is_idc(struct cxip_txc *txc, uint64_t key, size_t len, { size_t max_idc_size = unr ? CXIP_INJECT_SIZE : C_MAX_IDC_PAYLOAD_RES; - /* DISABLE_NON_INJECT_MSG_IDC disables the IDC - */ - if (cxip_env.disable_non_inject_msg_idc) - return false; /* IDC commands are not supported for unoptimized MR since the IDC * small message format does not support remote offset which is needed * for RMA commands. @@ -608,7 +634,7 @@ ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, /* Select the correct traffic class type within a traffic class. */ if (!unr && (flags & FI_CXI_HRP)) tc_type = CXI_TC_TYPE_HRP; - else if (!unr) + else if (!unr && !triggered) tc_type = CXI_TC_TYPE_RESTRICTED; else tc_type = CXI_TC_TYPE_DEFAULT; @@ -633,12 +659,14 @@ ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, if (ret) TXC_WARN(txc, - "%s RMA %s failed: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + "%s %s RMA %s failed: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + unr ? "Ordered" : "Un-ordered", idc ? "IDC" : "DMA", write ? "write" : "read", buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); else TXC_DBG(txc, - "%s RMA %s emitted: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + "%s %s RMA %s emitted: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + unr ? "Ordered" : "Un-ordered", idc ? "IDC" : "DMA", write ? "write" : "read", buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c index cdcaed39a59..8051ccdcade 100644 --- a/prov/cxi/src/cxip_rxc.c +++ b/prov/cxi/src/cxip_rxc.c @@ -127,7 +127,8 @@ static int rxc_msg_init(struct cxip_rxc *rxc) /* Base message initialization */ num_events = cxip_rxc_get_num_events(rxc); - ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1); + ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1, + rxc->ep_obj->priv_wait); if (ret) { CXIP_WARN("Failed to initialize RXC event queue: %d, %s\n", ret, fi_strerror(-ret)); @@ -227,7 +228,7 @@ void cxip_rxc_recv_req_cleanup(struct cxip_rxc *rxc) uint64_t start; int canceled = 0; - if (!ofi_atomic_get32(&rxc->orx_reqs)) + if (!cxip_rxc_orx_reqs_get(rxc)) return; cxip_evtq_req_discard(&rxc->rx_evtq, rxc); @@ -242,7 +243,7 @@ void cxip_rxc_recv_req_cleanup(struct cxip_rxc *rxc) CXIP_DBG("Canceled %d Receives: %p\n", canceled, rxc); start = ofi_gettime_ms(); - while (ofi_atomic_get32(&rxc->orx_reqs)) { + while (cxip_rxc_orx_reqs_get(rxc)) { sched_yield(); cxip_evtq_progress(&rxc->rx_evtq); @@ -402,6 +403,13 @@ struct cxip_rxc *cxip_rxc_calloc(struct cxip_ep_obj *ep_obj, void *context) { struct cxip_rxc *rxc = NULL; + /* + * It's possible the owner provider decides to turn off + * hardware offload in cxi. 
If that happens we need to update the + * rx_match_mode. + */ + cxip_set_env_rx_match_mode(); + switch (ep_obj->protocol) { case FI_PROTO_CXI: rxc = calloc(1, sizeof(struct cxip_rxc_hpc)); @@ -436,7 +444,7 @@ struct cxip_rxc *cxip_rxc_calloc(struct cxip_ep_obj *ep_obj, void *context) rxc->attr = ep_obj->rx_attr; rxc->hmem = !!(rxc->attr.caps & FI_HMEM); rxc->pid_bits = ep_obj->domain->iface->dev->info.pid_bits; - ofi_atomic_initialize32(&rxc->orx_reqs, 0); + cxip_rxc_orx_reqs_init(rxc); rxc->sw_ep_only = cxip_env.rx_match_mode == CXIP_PTLTE_SOFTWARE_MODE; diff --git a/prov/cxi/src/cxip_telemetry.c b/prov/cxi/src/cxip_telemetry.c index 3bbdb6f48c5..d2b7ecffbfd 100644 --- a/prov/cxi/src/cxip_telemetry.c +++ b/prov/cxi/src/cxip_telemetry.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2022,2024 Hewlett Packard Enterprise Development LP * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ #include "config.h" @@ -112,8 +112,10 @@ static bool cxip_telemetry_entry_validate_token(struct cxip_telemetry *telemetry, const char *telemetry_token) { - /* The telemetry directory has an ALL-in-binary entry. This file is - * considered invalid for this telemetry implementation. + /* + * Cassini NextGen-Telemetry no longer provides 'ALL-in-binary'. + * Keeping this simple logic in place temporarily allows this + * logic to run with older versions of the driver. */ if (strcmp(telemetry_token, "ALL-in-binary") == 0) return false; @@ -165,54 +167,9 @@ static int cxip_telemetry_entry_alloc(struct cxip_telemetry *telemetry, return ret; } -static int cxip_telemetry_sleep_duration(void) -{ - int ret; - int msec_sleep; - char *path = "/sys/module/cxi_core/parameters/cntr_refresh_interval"; - FILE *f; - - f = fopen(path, "r"); - if (!f) - return -errno; - - ret = fscanf(f, "%d", &msec_sleep); - if (ret != 1) { - if (ret == EOF) - ret = -errno; - else - ret = -FI_EINVAL; - } else { - /* Convert sleep duration to seconds. */ - ret = msec_sleep / 1000; - if (msec_sleep % 1000) - ret++; - ret = MAX(ret, 1); - } - - fclose(f); - - return ret; -} - void cxip_telemetry_dump_delta(struct cxip_telemetry *telemetry) { struct cxip_telemetry_entry *entry; - int sleep_duration; - - /* Since sysfs telemetry entries are refreshed as some interval, we need - * to sleep for a refresh interval to get updates. Else, the application - * could run and telemetry deltas would be zero. - */ - sleep_duration = cxip_telemetry_sleep_duration(); - if (sleep_duration < 0) { - DOM_WARN(telemetry->dom, - "Failed to retrieve telemetry sleep duration: %d:%s\n", - sleep_duration, fi_strerror(-sleep_duration)); - return; - } - - sleep(sleep_duration); dlist_foreach_container(&telemetry->telemetry_list, struct cxip_telemetry_entry, entry, diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c index 24564a5ef72..49d19fd6b58 100644 --- a/prov/cxi/src/cxip_txc.c +++ b/prov/cxi/src/cxip_txc.c @@ -63,8 +63,8 @@ int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region) struct cxip_md *md; int ret; - ret = cxip_map(txc->domain, region->mem_region, - region->pool->region_size, OFI_MR_NOCACHE, &md); + ret = cxip_ep_obj_map(txc->ep_obj, region->mem_region, + region->pool->region_size, OFI_MR_NOCACHE, &md); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to map inject buffer chunk\n"); return ret; @@ -295,6 +295,9 @@ static size_t cxip_txc_get_num_events(struct cxip_txc *txc) /* Account for internal operations. 
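* As a worked example with hypothetical values: tx_attr.size = 512, * CXIP_INTERNAL_TX_REQS = 16, and eq_ack_batch_size = 32 would provision the * TX event queue for 512 + 16 + 32 = 560 events, so a full batch of deferred * ACKs cannot overflow the queue.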
*/ num_events += CXIP_INTERNAL_TX_REQS; + /* ACK batching */ + num_events += cxip_env.eq_ack_batch_size; + return num_events; } @@ -325,7 +328,8 @@ int cxip_txc_enable(struct cxip_txc *txc) num_events = cxip_txc_get_num_events(txc); - ret = cxip_evtq_init(&txc->tx_evtq, txc->send_cq, num_events, 0); + ret = cxip_evtq_init(&txc->tx_evtq, txc->send_cq, num_events, 0, + txc->ep_obj->priv_wait); if (ret) { CXIP_WARN("Failed to initialize TX event queue: %d, %s\n", ret, fi_strerror(-ret)); @@ -375,13 +379,13 @@ static void txc_cleanup(struct cxip_txc *txc) { uint64_t start; - if (!ofi_atomic_get32(&txc->otx_reqs)) + if (!cxip_txc_otx_reqs_get(txc)) goto proto_cleanup; cxip_evtq_req_discard(&txc->tx_evtq, txc); start = ofi_gettime_ms(); - while (ofi_atomic_get32(&txc->otx_reqs)) { + while (cxip_txc_otx_reqs_get(txc)) { sched_yield(); cxip_evtq_progress(&txc->tx_evtq); @@ -393,7 +397,7 @@ static void txc_cleanup(struct cxip_txc *txc) } } - assert(ofi_atomic_get32(&txc->otx_reqs) == 0); + assert(cxip_txc_otx_reqs_get(txc) == 0); proto_cleanup: txc->ops.cleanup(txc); @@ -434,16 +438,20 @@ void cxip_txc_flush_msg_trig_reqs(struct cxip_txc *txc) struct cxip_req *req; struct dlist_entry *tmp; + ofi_genlock_lock(&txc->ep_obj->lock); + /* Drain the message queue. */ dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, req, send.txc_entry, tmp) { if (cxip_is_trig_req(req)) { - ofi_atomic_dec32(&txc->otx_reqs); + cxip_txc_otx_reqs_dec(txc); dlist_remove(&req->send.txc_entry); cxip_unmap(req->send.send_md); cxip_evtq_req_free(req); } } + + ofi_genlock_unlock(&txc->ep_obj->lock); } static bool cxip_txc_can_emit_op(struct cxip_txc *txc, @@ -456,7 +464,7 @@ static bool cxip_txc_can_emit_op(struct cxip_txc *txc, /* If taking a successful completion, limit outstanding operations */ if (!event_success_disabled && - (ofi_atomic_get32(&txc->otx_reqs) >= txc->attr.size)) { + (cxip_txc_otx_reqs_get(txc) >= txc->attr.size)) { TXC_WARN(txc, "TXC attr size saturated\n"); return false; } @@ -483,7 +491,7 @@ int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain idc put: %d\n", ret); else if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -506,10 +514,10 @@ int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. */ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -534,7 +542,7 @@ int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, "Failed to emit trigger dma command: %d:%s\n", ret, fi_strerror(-ret)); else if (!dma->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -545,7 +553,7 @@ int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain dma command: %d\n", ret); else if (!dma->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -567,10 +575,10 @@ int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. 
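* (Presumably FI_MORE lets cxip_txq_ring() defer the doorbell because more * commands are coming, while the outstanding-request count serves as its * heuristic for when the ring can no longer be postponed.)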
*/ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!dma->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -594,7 +602,7 @@ int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain idc amo: %d\n", ret); else if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -617,10 +625,10 @@ int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. */ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -647,7 +655,7 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, "Failed to emit trigger amo command: %d:%s\n", ret, fi_strerror(-ret)); else if (!amo->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -659,7 +667,7 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain amo: %d\n", ret); else if (!amo->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -681,10 +689,10 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. */ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!amo->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -708,7 +716,7 @@ int cxip_txc_emit_idc_msg(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain idc msg: %d\n", ret); else if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -731,10 +739,10 @@ int cxip_txc_emit_idc_msg(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. 
*/ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -778,7 +786,7 @@ struct cxip_txc *cxip_txc_calloc(struct cxip_ep_obj *ep_obj, void *context) dlist_init(&txc->msg_queue); dlist_init(&txc->dom_entry); - ofi_atomic_initialize32(&txc->otx_reqs, 0); + cxip_txc_otx_reqs_init(txc); /* Derived initialization/overrides */ txc->ops.init_struct(txc, ep_obj); diff --git a/prov/cxi/test/atomic.c b/prov/cxi/test/atomic.c index d33dfdc455f..6f5ec289579 100644 --- a/prov/cxi/test/atomic.c +++ b/prov/cxi/test/atomic.c @@ -76,13 +76,13 @@ Test(atomic_invalid, invalid_amo) int ret; ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, - FI_UINT64, OFI_ATOMIC_OP_LAST, 0); + FI_UINT64, FI_ATOMIC_OP_LAST, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, -1, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, - OFI_DATATYPE_LAST, FI_SUM, 0); + FI_VOID, FI_SUM, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, -1, FI_SUM, 0); @@ -132,13 +132,13 @@ Test(atomic_invalid, invalid_fetch) ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, - OFI_ATOMIC_OP_LAST, 0); + FI_ATOMIC_OP_LAST, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, -1, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, - cxit_ep_fi_addr, 0, 0, OFI_DATATYPE_LAST, FI_SUM, + cxit_ep_fi_addr, 0, 0, FI_VOID, FI_SUM, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, @@ -220,7 +220,7 @@ Test(atomic_invalid, invalid_swap) &compare, 0, &result, 0, cxit_ep_fi_addr, 0, 0, - FI_UINT64, OFI_ATOMIC_OP_LAST, 0); + FI_UINT64, FI_ATOMIC_OP_LAST, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, @@ -234,7 +234,7 @@ Test(atomic_invalid, invalid_swap) &compare, 0, &result, 0, cxit_ep_fi_addr, 0, 0, - OFI_DATATYPE_LAST, FI_CSWAP_NE, NULL); + FI_VOID, FI_CSWAP_NE, NULL); cr_assert_eq(ret, -FI_EINVAL); ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, @@ -277,7 +277,7 @@ Test(atomic_invalid, invalid_swap) &result, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_CSWAP_NE, NULL); - + cr_assert_eq(ret, -FI_EINVAL); ret = fi_compare_atomicv(cxit_ep, &iov, 0, 1, &ciov, 0, 1, @@ -1037,6 +1037,18 @@ struct test_int_parms { uint64_t key; }; +static enum fi_datatype int_datatypes[] = { + FI_UINT8, + FI_INT16, + FI_UINT16, + FI_INT32, + FI_UINT32, + FI_INT64, + FI_UINT64, + FI_INT128, + FI_UINT128, +}; + static struct test_int_parms int_parms[] = { { _AMO|_FAMO, 11, FI_MIN, 0, 0, 123, 120, 120 }, { _AMO|_FAMO, 12, FI_MIN, 0, 0, 120, 123, 120 }, @@ -1128,42 +1140,73 @@ ParameterizedTestParameters(atomic, test_int) tests * 2); } + +/* Don't rely on compiler __int128 support. 
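+ * amo128_t below stores a 128-bit operand as two uint64_t words with 16-byte + * alignment so the FI_INT128/FI_UINT128 paths can be exercised on toolchains + * lacking __int128; AMO128_INIT() seeds only the low word.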
*/ +typedef struct { + uint64_t u64[2]; +} __attribute__ ((aligned (16))) amo128_t; + +#define AMO128_INIT(_v64) { .u64 = { _v64, 0 } } + +static int test_int_expect_err(int err, enum fi_datatype dt, enum fi_op op) +{ + if (!err && op != FI_CSWAP && (dt == FI_INT128 || dt == FI_UINT128)) + err = 1; + + return err; +} + ParameterizedTest(struct test_int_parms *p, atomic, test_int) { struct mem_region mr; enum fi_datatype dt; uint64_t *rma; - uint64_t *loc; - uint64_t lini = -1; + uint64_t *loc = NULL; + int err; + /* Need 128-bit data types for FI_INT128/FI_UINT128. */ + amo128_t o1_128 = AMO128_INIT(p->o1); + void *o1 = &o1_128; + amo128_t comp_128 = AMO128_INIT(p->comp); + void *comp = &comp_128; + amo128_t lini_128 = AMO128_INIT(-1); + void *lini = &lini_128; + amo128_t rini_128 = AMO128_INIT(p->rini); + void *rini = &rini_128; + amo128_t rexp_128 = AMO128_INIT(p->rexp); + void *rexp = &rexp_128; + size_t i; rma = _cxit_create_mr(&mr, &p->key); - loc = calloc(1, RMA_WIN_LEN); - cr_assert_not_null(loc); + err = posix_memalign((void **)&loc, ofi_datatype_size(FI_UINT128), + RMA_WIN_LEN); + cr_assert(err == 0); + memset(loc, 0, RMA_WIN_LEN); if (p->opmask & _AMO) { - for (dt = FI_INT8; dt <= FI_UINT64; dt++) { - _test_amo(p->index, dt, p->op, p->err, &p->o1, - 0, 0, 0, - rma, &p->rini, &p->rexp, - p->key); + for (i = 0; i < ARRAY_SIZE(int_datatypes); i++) { + dt = int_datatypes[i]; + err = test_int_expect_err(p->err, dt, p->op); + _test_amo(p->index, dt, p->op, err, o1, + 0, 0, 0, rma, rini, rexp, p->key); } } if (p->opmask & _FAMO) { - for (dt = FI_INT8; dt <= FI_UINT64; dt++) { - _test_amo(p->index, dt, p->op, p->err, &p->o1, - 0, loc, &lini, rma, &p->rini, &p->rexp, - p->key); + for (i = 0; i < ARRAY_SIZE(int_datatypes); i++) { + dt = int_datatypes[i]; + err = test_int_expect_err(p->err, dt, p->op); + _test_amo(p->index, dt, p->op, err, o1, + 0, loc, lini, rma, rini, rexp, p->key); } } if (p->opmask & _CAMO) { - for (dt = FI_INT8; dt <= FI_UINT64; dt++) { - _test_amo(p->index, dt, p->op, p->err, &p->o1, - &p->comp, loc, &lini, rma, &p->rini, - &p->rexp, - p->key); + for (i = 0; i < ARRAY_SIZE(int_datatypes); i++) { + dt = int_datatypes[i]; + err = test_int_expect_err(p->err, dt, p->op); + _test_amo(p->index, dt, p->op, err, o1, + comp, loc, lini, rma, rini, rexp, p->key); } } @@ -1942,6 +1985,214 @@ void cxit_setup_amo_selective_completion_suppress(void) cxit_setup_rma(); } +void cxit_setup_amo_selective_completion_suppress_hybrid_mr_desc(void) +{ + int ret; + + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_setup_rma(); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v2"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL, + "V3 functions returned"); + + ret = dom_ops->enable_hybrid_mr_desc(&cxit_domain->fid, true); + cr_assert(ret == FI_SUCCESS, "enable_hybrid_mr_desc failed"); +} + +Test(atomic_sel, fi_more_amo_stream_optimzied, + .init = cxit_setup_amo_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = 0x0; + size_t rma_len = 1; + struct fi_msg_atomic msg = {}; + struct fi_rma_ioc rma = {}; + struct fi_ioc src_iov = {}; + unsigned int count = 0; + struct fid_cntr *cntr = cxit_write_cntr; + char src_buf = 0; + + mr_create(rma_len, FI_REMOTE_WRITE, 0, &key_val, &mem_window); + + 
src_iov.addr = &src_buf; + src_iov.count = 1; + + rma.count = 1; + rma.key = key_val; + + msg.msg_iov = &src_iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_INT8; + msg.op = FI_SUM; + + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + count++; + } while (ret != -FI_EAGAIN); + + cr_assert(count >= cxit_fi_hints->tx_attr->size); + + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_cntr_wait(cntr, count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); +} + +Test(atomic_sel, fi_more_amo_stream_mix_optimized_unoptimized, + .init = cxit_setup_amo_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region opt_mem_window; + struct mem_region mem_window; + uint64_t opt_key_val = 0x0; + uint64_t key_val = 0x1234; + size_t rma_len = 1; + struct fi_msg_atomic msg = {}; + struct fi_rma_ioc rma = {}; + struct fi_ioc src_iov = {}; + unsigned int count = 0; + struct fid_cntr *cntr = cxit_write_cntr; + char src_buf = 0; + + mr_create(rma_len, FI_REMOTE_WRITE, 0, &opt_key_val, &opt_mem_window); + mr_create(rma_len, FI_REMOTE_WRITE, 0, &key_val, &mem_window); + + src_iov.addr = &src_buf; + src_iov.count = 1; + + rma.count = 1; + rma.key = opt_key_val; + + msg.msg_iov = &src_iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_INT8; + msg.op = FI_SUM; + + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + count++; + } while (ret != -FI_EAGAIN); + + cr_assert(count >= cxit_fi_hints->tx_attr->size); + + rma.key = key_val; + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_cntr_wait(cntr, count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); + mr_destroy(&opt_mem_window); +} + +Test(atomic_sel, fi_more_fetch_amo_stream_optimized, + .init = cxit_setup_amo_selective_completion_suppress_hybrid_mr_desc, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = 0x0; + size_t rma_len = 1; + struct fi_msg_atomic msg = {}; + struct fi_rma_ioc rma = {}; + struct fi_ioc src_iov = {}; + unsigned int count = 0; + struct fid_cntr *cntr = cxit_read_cntr; + char src_buf = 0; + struct fi_ioc result_iov = {}; + void *mr; + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops failed: %d", ret); + + mr_create(rma_len, + FI_REMOTE_WRITE | FI_REMOTE_READ | FI_WRITE | FI_READ, 0, + &key_val, &mem_window); + mr = fi_mr_desc(mem_window.mr); + + result_iov.addr = mem_window.mem; + result_iov.count = 1; + + src_iov.addr = &src_buf; + src_iov.count = 1; + + rma.count = 1; + rma.key = key_val; + + msg.msg_iov = &src_iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_INT8; + msg.op = FI_SUM; + + do { + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_iov, &mr, 1, + FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + 
count++; + } while (ret != -FI_EAGAIN); + + cr_assert(count >= cxit_fi_hints->tx_attr->size); + + do { + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_iov, &mr, 1, + FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_iov, &mr, 1, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_cntr_wait(cntr, count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); +} + /* Test selective completion behavior with RMA. */ Test(atomic_sel, selective_completion_suppress, .init = cxit_setup_amo_selective_completion_suppress, @@ -3634,7 +3885,23 @@ ParameterizedTestParameters(atomic, query_atomic) .valid_atomic_attr = true, .flags = FI_FETCH_ATOMIC, .expected_rc = FI_SUCCESS, - } + }, + /* FI_UINT128 unsupported for FI_MIN. */ + { + .datatype = FI_UINT128, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + }, + /* FI_UINT128 supported for FI_CSWAP. */ + { + .datatype = FI_UINT128, + .op = FI_CSWAP, + .valid_atomic_attr = true, + .flags = FI_COMPARE_ATOMIC, + .expected_rc = FI_SUCCESS, + }, }; size_t param_sz = ARRAY_SIZE(params); diff --git a/prov/cxi/test/cntr.c b/prov/cxi/test/cntr.c index f16655e0fbc..3c4f1c31241 100644 --- a/prov/cxi/test/cntr.c +++ b/prov/cxi/test/cntr.c @@ -844,3 +844,69 @@ Test(cntr, cntr_wait_success_increment) cntr_wait_success_and_error_runner(&args); } + +Test(cntr, verify_sync) +{ + struct fid_cntr *cntr; + struct fi_cntr_attr cntr_attr = { + .wait_obj = FI_WAIT_UNSPEC, + }; + uint64_t success; + int ret; + + ret = fi_cntr_open(cxit_domain, &cntr_attr, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cntr_set(cntr, 2); + cr_assert(ret == FI_SUCCESS, "fi_cntr_set ret %d", ret); + + success = fi_cntr_read(cntr); + cr_assert(success == 2, + "Unexpected counter success count %lu", success); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS); +} + +/* This test is non-deterministic in that the counter write back + * associated with the set can occur before the fi_cntr_read() + * is issued, invalidating the test. Disable the test until another + * approach is implemented. + */ +Test(cntr, verify_no_sync, .disabled = true) +{ + struct fid_cntr *cntr; + struct fi_cntr_attr cntr_attr = { + .wait_obj = FI_WAIT_UNSPEC, + .flags = FI_CXI_CNTR_CACHED, + }; + struct cxip_ep *ep = container_of(cxit_ep, struct cxip_ep, ep); + uint64_t success; + int ret; + + /* Test is only deterministic with netsim */ + if (!is_netsim(ep->ep_obj)) { + cr_assert(1); + return; + } + + ret = fi_cntr_open(cxit_domain, &cntr_attr, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cntr_set(cntr, 2); + cr_assert(ret == FI_SUCCESS, "fi_cntr_set ret %d", ret); + + success = fi_cntr_read(cntr); + /* should have returned cached value */ + cr_assert(success == 0, + "Unexpected counter success count %lu", success); + + do { + success = fi_cntr_read(cntr); + } while (success < 2); + cr_assert(success == 2, + "Unexpected counter success count %lu", success); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS); +} diff --git a/prov/cxi/test/coll.c b/prov/cxi/test/coll.c index 5ffb811567e..d88c95efa98 100644 --- a/prov/cxi/test/coll.c +++ b/prov/cxi/test/coll.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * * Copyright (c) 2017-2019 Intel Corporation. All rights reserved. 
- * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP */ /* @@ -37,7 +37,6 @@ #define MIN(a,b) (((a)<(b))?(a):(b)) -/***************************************/ /** * Sanity tests for proper integration with EP, enable/disable checks. */ @@ -367,7 +366,7 @@ Test(coll_join, retry_getgroup) { TRACE("=========================\n"); TRACE("join retry getgroup\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_GETGRP, -FI_EAGAIN); + cxip_trap_set(node, CXIP_TRAP_GETGRP, -FI_EAGAIN, 0); _create_netsim_collective(5, true, FI_SUCCESS); _wait_for_join(5, FI_SUCCESS, 0); _destroy_netsim_collective(); @@ -381,7 +380,7 @@ Test(coll_join, retry_broadcast) { TRACE("=========================\n"); TRACE("join retry broadcast\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_BCAST, -FI_EAGAIN); + cxip_trap_set(node, CXIP_TRAP_BCAST, -FI_EAGAIN, 0); _create_netsim_collective(5, true, FI_SUCCESS); _wait_for_join(5, FI_SUCCESS, 0); _destroy_netsim_collective(); @@ -395,7 +394,7 @@ Test(coll_join, retry_reduce) { TRACE("=========================\n"); TRACE("join retry reduce\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_REDUCE, -FI_EAGAIN); + cxip_trap_set(node, CXIP_TRAP_REDUCE, -FI_EAGAIN, 0); _create_netsim_collective(5, true, FI_SUCCESS); _wait_for_join(5, FI_SUCCESS, 0); _destroy_netsim_collective(); @@ -409,9 +408,10 @@ Test(coll_join, fail_ptlte) { TRACE("=========================\n"); TRACE("join fail mixed errors\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_INITPTE, -FI_EFAULT); + cxip_trap_set(node, CXIP_TRAP_INITPTE, -FI_EAVAIL, + FI_CXI_ERRNO_JOIN_FAIL_PTE); _create_netsim_collective(5, true, FI_SUCCESS); - _wait_for_join(5, -FI_EAVAIL, CXIP_PROV_ERRNO_PTE); + _wait_for_join(5, -FI_ECONNREFUSED, FI_CXI_ERRNO_JOIN_FAIL_PTE); _destroy_netsim_collective(); cxip_trap_close(); } @@ -1058,7 +1058,7 @@ void _allreduce(int start_node, int bad_node, int concur) uint64_t expval, actval; /* If there was a bad node, all reductions should fail */ - rc_err0 = (bad_node < 0) ? 0 : CXIP_COLL_RC_OP_MISMATCH; + rc_err0 = (bad_node < 0) ? 
0 : FI_CXI_ERRNO_RED_OP_MISMATCH; for (node = 0; node < nodes; node++) { _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node][first]); @@ -1820,8 +1820,6 @@ Test(coll_reduce_ops, bor) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1845,8 +1843,6 @@ Test(coll_reduce_ops, band) cr_assert(!ret, "_allreduceop() failed = %d\n", ret); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1870,8 +1866,6 @@ Test(coll_reduce_ops, bxor) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1895,8 +1889,6 @@ Test(coll_reduce_ops, imin) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1920,8 +1912,6 @@ Test(coll_reduce_ops, imax) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1945,8 +1935,6 @@ Test(coll_reduce_ops, isum) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1978,8 +1966,6 @@ Test(coll_reduce_ops, iminmaxloc) cr_assert(!ret, "_allreduceop() failed = %d\n", ret); ret = _check_iminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -2009,8 +1995,6 @@ Test(coll_reduce_ops, fsum) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INEXACT); - cr_assert(!ret, "rc failed\n"); /* Note: inexact computation is guaranteed by the small value included * in the data set. 
There is a hidden trick when performing the @@ -2040,8 +2024,6 @@ Test(coll_reduce_ops, fmin) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); data[1].fval[1] = NAN; _predict_fmin(nodes, data, &check, true); @@ -2049,8 +2031,8 @@ Test(coll_reduce_ops, fmin) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); + cr_assert(ret, "rc NAN succeeded\n"); data[1].fval[1] = _snan64(); _predict_fmin(nodes, data, &check, true); @@ -2058,8 +2040,8 @@ Test(coll_reduce_ops, fmin) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2080,8 +2062,6 @@ Test(coll_reduce_ops, fmax) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); data[1].fval[1] = NAN; _predict_fmax(nodes, data, &check, true); @@ -2089,8 +2069,8 @@ Test(coll_reduce_ops, fmax) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); + cr_assert(ret, "rc NAN succeeded\n"); data[1].fval[1] = _snan64(); _predict_fmax(nodes, data, &check, true); @@ -2098,8 +2078,8 @@ Test(coll_reduce_ops, fmax) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2132,8 +2112,6 @@ Test(coll_reduce_ops, fminmaxloc) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* NAN is given preference over number */ data[1].fminval = NAN; @@ -2144,8 +2122,6 @@ Test(coll_reduce_ops, fminmaxloc) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed NAN\n"); /* SNAN is given preference over NAN */ data[1].fminval = NAN; @@ -2157,8 +2133,8 @@ Test(coll_reduce_ops, fminmaxloc) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2180,8 +2156,6 @@ 
Test(coll_reduce_ops, fminnum) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* number is given preference over NAN */ data[1].fval[1] = NAN; @@ -2191,8 +2165,8 @@ Test(coll_reduce_ops, fminnum) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); + cr_assert(ret, "rc NAN succeeded\n"); /* number is given preference over NAN */ data[1].fval[1] = _snan64(); @@ -2202,8 +2176,8 @@ Test(coll_reduce_ops, fminnum) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2225,8 +2199,6 @@ Test(coll_reduce_ops, fmaxnum) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* number is given preference over NAN */ data[1].fval[1] = NAN; @@ -2236,8 +2208,8 @@ Test(coll_reduce_ops, fmaxnum) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); + cr_assert(ret, "rc NAN succeeded\n"); /* SNAN is given preference over number */ data[1].fval[1] = _snan64(); @@ -2247,8 +2219,8 @@ Test(coll_reduce_ops, fmaxnum) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2281,8 +2253,6 @@ Test(coll_reduce_ops, fminmaxnumloc) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* NAN is given preference over number */ data[1].fminval = NAN; @@ -2293,8 +2263,6 @@ Test(coll_reduce_ops, fminmaxnumloc) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed NAN\n"); /* SNAN is given preference over NAN */ data[1].fminval = NAN; @@ -2306,8 +2274,8 @@ Test(coll_reduce_ops, fminmaxnumloc) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2367,8 +2335,6 @@ Test(coll_reduce_ops, prereduce) /* 
validate results */ ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); free(accum1); free(mc_obj); diff --git a/prov/cxi/test/cuda.c b/prov/cxi/test/cuda.c index 5398dcd98f3..b63432c2a73 100644 --- a/prov/cxi/test/cuda.c +++ b/prov/cxi/test/cuda.c @@ -31,7 +31,7 @@ static void cuda_init(void) srand(seed); } -TestSuite(cuda, .timeout = CXIT_DEFAULT_TIMEOUT, .init = cuda_init); +TestSuite(cuda, .timeout = 60, .init = cuda_init); static void cuda_message_runner(void *cuda_send_buf, void *cuda_recv_buf, size_t buf_size, bool device_only_mem, @@ -423,3 +423,195 @@ Test(cuda, verify_hmemDevReg) verify_dev_reg_handle(true); } + + +/* Verify that large transfers (4+ GiB) work. */ +#define LARGE_XFER ((4ULL * 1024 * 1024 * 1024) - 1) +Test(cuda, large_transfer) +{ + cuda_dev_memory_test(LARGE_XFER, 2, false, true); +} + +static void verify_dev_reg_eopnotsupp_local_op(void) +{ + void *buf; + cudaError_t cuda_ret; + size_t buf_size = 1024; + int ret; + + cuda_ret = cudaMalloc(&buf, buf_size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + ret = fi_recv(cxit_ep, buf, buf_size, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, -FI_EOPNOTSUPP, "fi_recv failed: %d", ret); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); +} + +static void verify_dev_reg_eopnotsupp_remote_mr(void) +{ + int ret; + void *buf; + cudaError_t cuda_ret; + size_t buf_size = 1024; + struct fid_mr *fid_mr; + + cuda_ret = cudaMalloc(&buf, buf_size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ, 0, 0x123, 0, + &fid_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(fid_mr, &(cxit_ep->fid), 0); + cr_assert_eq(ret, -FI_EOPNOTSUPP, "fi_mr_bind failed: %d", ret); + + ret = fi_close(&fid_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); +} + +Test(cuda, verify_fi_opt_cuda_api_permitted_local_operation) +{ + int ret; + bool optval = false; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_msg(); + + ret = fi_setopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + FI_OPT_CUDA_API_PERMITTED, &optval, sizeof(optval)); + assert(ret == FI_SUCCESS); + + verify_dev_reg_eopnotsupp_local_op(); + + cxit_teardown_msg(); +} + +Test(cuda, verify_fi_opt_cuda_api_permitted_remote_mr) +{ + int ret; + bool optval = false; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_msg(); + + ret = fi_setopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + FI_OPT_CUDA_API_PERMITTED, &optval, sizeof(optval)); + assert(ret == FI_SUCCESS); + + verify_dev_reg_eopnotsupp_remote_mr(); + + cxit_teardown_msg(); +} + +Test(cuda, verify_get_fi_opt_cuda_api_permitted) +{ + int ret; + bool optval = false; + size_t size = sizeof(optval); + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_msg(); + + ret = fi_setopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + FI_OPT_CUDA_API_PERMITTED, &optval, sizeof(optval)); + assert(ret == FI_SUCCESS); + + optval = true; + + ret = fi_getopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + 
FI_OPT_CUDA_API_PERMITTED, &optval, &size); + assert(ret == FI_SUCCESS); + + assert(optval == false); + + cxit_teardown_msg(); +} + +Test(cuda, verify_force_dev_reg_local) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + ret = setenv("FI_CXI_FORCE_DEV_REG_COPY", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_fi_hints->tx_attr->size = 512; + + cxit_setup_ep(); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret != FI_SUCCESS, "ret is: %d\n", ret); + + /* Tear down RMA objects */ + cxit_destroy_ep(); /* EP must be destroyed before bound objects */ + + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_teardown_ep(); +} + +Test(cuda, dmabuf_stress) +{ + int ret; + int i; + void *buf; + size_t size = 1024 * 1024; + struct fid_mr *mr; + cudaError_t cuda_ret; + + ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + ret = setenv("FI_MR_CUDA_CACHE_MONITOR_ENABLED", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cuda_ret = cudaMalloc(&buf, size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + cxit_setup_msg(); + + for (i = 0; i < 2048; i++) { + ret = fi_mr_reg(cxit_domain, buf, size, FI_READ | FI_WRITE, + 0, 0, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_close(&mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + } + + cxit_teardown_msg(); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); +} diff --git a/prov/cxi/test/cxi_vm_commit.sh b/prov/cxi/test/cxi_vm_commit.sh new file mode 100755 index 00000000000..d3bb8807ed9 --- /dev/null +++ b/prov/cxi/test/cxi_vm_commit.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +test_short="" +exclude_commit_subject="" + +while getopts "se:" option; do + case "${option}" in + s) + test_short="-s" + ;; + e) + exclude_commit_subject=${OPTARG} + ;; + *) + exit 1; + esac +done + +# Assumes that the commit subject is unique between all commits in the PR. 
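+# For example, a hypothetical head commit titled "DO NOT MERGE: enable CI debug" +# collapses to "DONOTMERGE:enableCIdebug", so that one commit can be skipped with: +#   ./cxi_vm_commit.sh -s -e "DONOTMERGE:enableCIdebug"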
+head_commit_subject_collapsed=$(git log -1 --pretty=%s | tr -d ' ') +if [[ "$head_commit_subject_collapsed" == "$exclude_commit_subject" ]]; then + echo "Skipping commit \"$(git log -1 --pretty=%s)\"" + exit 0 +fi + +git log -1 + +set -e + +cd ../../../ +./autogen.sh +./configure \ + --prefix=$PWD/install \ + --disable-sockets \ + --disable-udp \ + --disable-verbs \ + --disable-rxm \ + --disable-mrail \ + --disable-rxd \ + --disable-shm \ + --disable-tcp \ + --disable-usnic \ + --disable-rstream \ + --disable-efa \ + --disable-psm2 \ + --disable-psm3 \ + --disable-opx \ + --enable-debug \ + --with-default-monitor=uffd \ + --with-criterion=$(realpath ../Criterion/build/install/) \ + --with-cassini-headers=$(realpath ../cassini-headers/install) \ + --with-cxi-uapi-headers=$(realpath ../cxi-driver) \ + --enable-cxi=$(realpath ../libcxi/install) \ + --with-kdreg2=$(realpath ../kdreg2/include) + + +make clean +make -j 8 install + +test_dir=$(realpath ./prov/cxi/test) +test_result_file="run_tests_vm_output.txt" +ssh -tt localhost "cd ${test_dir}; ./run_tests_vm.sh $test_short" | tee ${test_result_file} + +set +e + +# Search the ssh output for the following string. This is a test failure +# which is not reported as a tap test failure. +test_error_code=1 +test_error=$(grep "cxitest return non-zero exit code. Possible failures in test teardown" ${test_result_file}) || test_error_code=$(($?^1)) +if [ -z "${test_error}" ] && [ "$test_error_code" -eq "0" ]; then + echo "Zero 'non-zero exit codes' failures in output" +else + echo $test_error + exit 1 +fi + +# Grep all .tap output files for the "not ok" string. This is a test failure. +test_failures_code=1 +test_failures=$(grep "not ok" ${test_dir}/*.tap) || test_failures_code=$(($?^1)) +if [ -z "${test_failures}" ] && [ "$test_failures_code" -eq "0" ] ; then + echo "Zero 'not ok' failures in tap output" +else + echo $test_failures + exit 1 +fi + +signed_off=$(git log -1 | grep -i "Signed-off-by: ") +if [ -z "${signed_off}" ]; then + echo "Commit not signed off" + exit 1 +else + echo "Commit signed-off check passed" +fi + +echo "Tests passed" +rm ${test_result_file} +exit 0 diff --git a/prov/cxi/test/cxi_vm_pr.sh b/prov/cxi/test/cxi_vm_pr.sh new file mode 100755 index 00000000000..c10c0da1ac6 --- /dev/null +++ b/prov/cxi/test/cxi_vm_pr.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Cache the head commit, which will be cherry-picked later +head_commit=$(git rev-parse HEAD) + +git checkout -b rebase-test-branch +db=$(git remote show https://github.com/ofiwg/libfabric.git | grep 'HEAD branch' | cut -d' ' -f5) +mb=$(git merge-base origin/${db} HEAD) + +# Run a shortened test suite against each commit except the head commit. +git reset --hard HEAD~1 +git rebase ${mb} --exec "bash ./cxi_vm_commit.sh -s" +if [[ $? -ne 0 ]]; then + exit 1 +fi + +# Run the longer test suite against all commits together. 
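+# (The reset above dropped the head commit; cherry-picking it back restores the +# complete PR state. Note that git rebase --exec halts at the first commit whose +# command exits non-zero, so each intermediate commit must pass on its own.)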
+git cherry-pick ${head_commit} +bash ./cxi_vm_commit.sh diff --git a/prov/cxi/test/cxip_test_common.c b/prov/cxi/test/cxip_test_common.c index b0fe3ccd622..fd3fec4b5c5 100644 --- a/prov/cxi/test/cxip_test_common.c +++ b/prov/cxi/test/cxip_test_common.c @@ -774,6 +774,7 @@ void cxit_setup_enabled_ep_fd(void) cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->threading = FI_THREAD_SAFE; cxit_setup_ep(); diff --git a/prov/cxi/test/deferred_work.c b/prov/cxi/test/deferred_work.c index 369e276ffad..7531f49b439 100644 --- a/prov/cxi/test/deferred_work.c +++ b/prov/cxi/test/deferred_work.c @@ -960,6 +960,7 @@ static int alloc_service(struct cxil_dev *dev, unsigned int tle_count) struct cxi_svc_fail_info fail_info = {}; struct cxi_svc_desc svc_desc = { .enable = 1, + .resource_limits = 1, .limits = { .type[CXI_RSRC_TYPE_PTE] = { .max = 100, @@ -1195,11 +1196,10 @@ Test(deferred_work_trig_op_limit, enforce_limit_single_thread) cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK iter %d failed %d", i, ret); } - ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); - if (limited) + if (limited) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); cr_assert_eq(ret, -FI_ENOSPC, "FI_QUEUE_WORK failed %d", ret); - else - cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + } cr_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS)); diff --git a/prov/cxi/test/ep.c b/prov/cxi/test/ep.c index e415c11f018..c49029ef808 100644 --- a/prov/cxi/test/ep.c +++ b/prov/cxi/test/ep.c @@ -1,7 +1,8 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP */ #include @@ -294,17 +295,6 @@ Test(ep, ep_bind_stx_ctx) "TODO Add test for STX CTXs binding to the endpoint when implemented"); } -Test(ep, ep_bind_srx_ctx) -{ - int ret; - struct fi_rx_attr *attr = NULL; - void *context = NULL; - - ret = fi_srx_context(cxit_domain, attr, NULL, context); - cr_assert_eq(ret, -FI_ENOSYS, - "TODO Add test for SRX CTXs binding to the endpoint when implemented"); -} - Test(ep, ep_bind_unhandled) { int ret; @@ -969,53 +959,6 @@ Test(ep, stx_ctx) cr_assert_eq(ret, FI_SUCCESS, "fi_close stx_ep. %d", ret); } -Test(ep, srx_ctx_null_srx) -{ - int ret; - struct fi_rx_attr *attr = NULL; - void *context = NULL; - - ret = fi_srx_context(cxit_domain, attr, NULL, context); - /* TODO Fix when fi_srx_context is implemented, should be -FI_EINVAL */ - cr_assert_eq(ret, -FI_ENOSYS, "fi_srx_context null srx. %d", ret); -} - -Test(ep, srx_ctx) -{ - int ret; - struct fi_rx_attr *attr = NULL; - struct fid_ep *srx; - struct cxip_ep *srx_ep; - void *context = &ret; - struct cxip_domain *dom; - struct cxip_rxc *rxc; - int refs; - - dom = container_of(cxit_domain, struct cxip_domain, - util_domain.domain_fid); - refs = ofi_atomic_get32(&dom->ref); - - ret = fi_srx_context(cxit_domain, attr, &srx, context); - /* TODO Fix when fi_srx_context is implemented, should be FI_SUCCESS */ - cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context failed. 
%d", ret); - if (ret == -FI_ENOSYS) - return; - - srx_ep = container_of(srx, struct cxip_ep, ep); - rxc = srx_ep->ep_obj->rxc; - - /* Validate stx */ - cr_assert_eq(rxc->domain, dom); - cr_assert_eq(ofi_atomic_inc32(&dom->ref), refs + 1); - cr_assert_eq(srx_ep->ep.fid.fclass, FI_CLASS_RX_CTX); - cr_assert_eq(srx_ep->ep.fid.context, context); - cr_assert_eq(rxc->state, RXC_ENABLED); - cr_assert_eq(rxc->min_multi_recv, CXIP_EP_MIN_MULTI_RECV); - - ret = fi_close(&srx->fid); - cr_assert_eq(ret, FI_SUCCESS, "fi_close srx_ep. %d", ret); -} - TestSuite(ep_init, .timeout = CXIT_DEFAULT_TIMEOUT); Test(ep_init, auth_key) @@ -1758,8 +1701,8 @@ Test(ep_caps, coll_only) &info); cr_assert(ret == FI_SUCCESS); verify_caps_only(info, FI_COLLECTIVE | FI_MSG); - fi_freeinfo(info); + cxit_teardown_getinfo(); } diff --git a/prov/cxi/test/eq.c b/prov/cxi/test/eq.c index 00730982b22..1d31bd4bf9e 100644 --- a/prov/cxi/test/eq.c +++ b/prov/cxi/test/eq.c @@ -27,7 +27,7 @@ TestSuite(eq, .init = cxit_setup_eq, .fini = cxit_teardown_eq, .timeout = CXIT_DEFAULT_TIMEOUT); -/* Test basic CQ creation */ +/* Test basic EQ creation */ Test(eq, simple) { cxit_create_eq(); @@ -35,3 +35,28 @@ Test(eq, simple) cxit_destroy_eq(); } +void eq_bad_wait_obj(enum fi_wait_obj wait_obj) + +{ + struct fi_eq_attr attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = wait_obj, + }; + int ret; + + ret = fi_eq_open(cxit_fabric, &attr, &cxit_eq, NULL); + cr_assert(ret == -FI_ENOSYS, "fi_eq_open unexpected success"); + cr_assert(cxit_eq == NULL, "cxit_eq not NULL on bad wait_obj"); +} + +Test(eq, bad_wait_obj_unspec) +{ + eq_bad_wait_obj(FI_WAIT_UNSPEC); +} + +Test(eq, bad_wait_obj_wait_fd) +{ + eq_bad_wait_obj(FI_WAIT_UNSPEC); +} + diff --git a/prov/cxi/test/mr.c b/prov/cxi/test/mr.c index fab3cbab7d7..fd35fe5fbc4 100644 --- a/prov/cxi/test/mr.c +++ b/prov/cxi/test/mr.c @@ -51,6 +51,26 @@ Test(mr, invalid_fi_directed_recv_flag) cr_assert_eq(ret, -FI_EINVAL, "fi_mr_regattr failed: %d", ret); } +Test(mr, invalid_client_rkey) +{ + int ret; + struct fi_mr_attr attr = {}; + struct iovec iov = {}; + struct fid_mr *mr; + + iov.iov_len = sizeof(ret); + iov.iov_base = (void *)&ret; + + attr.mr_iov = &iov; + attr.iov_count = 1; + attr.access = FI_REMOTE_READ | FI_REMOTE_WRITE; + attr.requested_key = ~1; + + ret = fi_mr_regattr(cxit_domain, &attr, 0, &mr); + if ((cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) != FI_MR_PROV_KEY) + cr_assert_eq(ret, -FI_EKEYREJECTED, "fi_mr_regattr failed: %d", ret); +} + Test(mr, std_mrs, .timeout = 600, .disabled = true) { int std_mr_cnt = 16*1024; @@ -159,7 +179,7 @@ Test(mr, mr_zero_len) /* Validate that unique keys are enforced. 
*/ Test(mr, mr_unique_key) { - char buf[256]; + char buf[256] = {}; struct fid_mr *mr1; struct fid_mr *mr2; int ret; @@ -185,7 +205,7 @@ Test(mr, mr_unique_key) /* Validate not recycling non-cached FI_MR_PROV_KEY */ Test(mr, mr_recycle) { - char buf[256]; + char buf[256] = {}; struct fid_mr *mr1; struct fid_mr *mr2; struct fid_mr *mr3; @@ -273,7 +293,7 @@ Test(mr, mr_recycle) /* Validate that RKEY are not required for local MR */ Test(mr, mr_no_local_rkey) { - char buf[256]; + char buf[256] = {}; struct fid_mr *mr1; struct fid_mr *mr2; uint64_t rkey = 0; diff --git a/prov/cxi/test/mr_cache.c b/prov/cxi/test/mr_cache.c new file mode 100644 index 00000000000..b2035cb8063 --- /dev/null +++ b/prov/cxi/test/mr_cache.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define SETENV_OVERWRITE 1 + +TestSuite(mr_cache, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(mr_cache, cache_full) +{ + static struct { + const char *name; + const char *value; + } envs[] = { + { .name = "FI_MR_CACHE_MONITOR", .value = "kdreg2", }, + { .name = "FI_MR_CACHE_MAX_COUNT", .value = "4", }, + }; + struct { + void *addr; + struct fid_mr *mr; + } *region_data; + size_t i; + int ret; + long page_size; + unsigned long num_regions, total_regions; + struct ofi_mr_cache *cache; + struct cxip_domain *cxip_dom; + + /* setup the environment */ + for (i = 0; i < ARRAY_SIZE(envs); i++) { + ret = setenv(envs[i].name, envs[i].value, SETENV_OVERWRITE); + cr_assert_eq(ret, 0, "Failed to set %s to %s: %d", + envs[i].name, envs[i].value, errno); + } + + /* allocate the memory regions */ + page_size = sysconf(_SC_PAGESIZE); + cr_assert(page_size > 0, + "sysconf(_SC_PAGESIZE) return %ld: errno = %d", page_size, errno); + + ret = sscanf(getenv("FI_MR_CACHE_MAX_COUNT"), "%lu", &num_regions); + cr_assert_eq(ret, 1, "Failed to get number of regions: %d %d:%s", + ret, errno, strerror(errno)); + + /* one extra to push one out of the cache */ + total_regions = num_regions + 1; + region_data = calloc(total_regions, sizeof(*region_data)); + cr_assert_not_null(region_data); + + for (i = 0; i < total_regions; i++) { + region_data[i].addr = mmap(NULL, page_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + -1, 0); + cr_assert_not_null(region_data[i].addr); + } + + /* create the domain */ + cxit_setup_domain(); + cxit_create_domain(); + + /* Register the max number of regions */ + for (i = 0; i < num_regions; i++) { + ret = fi_mr_reg(cxit_domain, region_data[i].addr, + page_size, FI_READ | FI_WRITE, + 0, 0, 0, ®ion_data[i].mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_reg failed for region %lu: %d", i, ret); + } + + /* See that the cache is full */ + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cache = &cxip_dom->iomm; + cr_assert(cache->cached_cnt == cache->cached_max_cnt, + "Cache is not full: %zu != %zu", + cache->cached_cnt, cache->cached_max_cnt); + cr_assert(cache->uncached_cnt == 0, + "Cache has uncached entries: %zu", + cache->uncached_cnt); + + /* release the registrations, this should put them on the LRU list */ + for(i = 0; i < num_regions; i++) { + ret = fi_close(®ion_data[i].mr->fid); + cr_assert_eq(ret, FI_SUCCESS, + "Failed to close mr %zu: %d", + i, ret); + } + + /* Register one more, this should push one off LRU 
list */ + ret = fi_mr_reg(cxit_domain, region_data[num_regions].addr, + page_size, FI_READ | FI_WRITE, + 0, 0, 0, ®ion_data[num_regions].mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_reg failed for region %lu: %d", num_regions, ret); + + /* Cache should remain full */ + cr_assert(cache->cached_cnt == cache->cached_max_cnt, + "Cache is not full: %zu != %zu", + cache->cached_cnt, cache->cached_max_cnt); + cr_assert(cache->uncached_cnt == 0, + "Cache has uncached entries: %zu", + cache->uncached_cnt); + + cxit_teardown_domain(); +} diff --git a/prov/cxi/test/multinode/test_coll.c b/prov/cxi/test/multinode/test_coll.c index 3dc80b86278..a8bff9127e6 100644 --- a/prov/cxi/test/multinode/test_coll.c +++ b/prov/cxi/test/multinode/test_coll.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP */ /* @@ -27,6 +27,7 @@ #include #include #include +#include #include "multinode_frmwk.h" /* If not compiled with DEBUG=1, this is a no-op */ @@ -313,12 +314,12 @@ struct join_item { struct dlist_entry entry; struct fid_av_set *avset; struct fid_mc *mc; + int join_index; int prov_errno; int retval; - int trace_no; }; -/* poll the collective eq once, count of completions (0 or 1) */ +/* poll the collective eq once, return 0 on success, errno on failure */ static int _poll_eq(void) { struct cxip_ep *ep; @@ -333,31 +334,38 @@ static int _poll_eq(void) jctx = NULL; ret = fi_eq_read(eq, &event, &eqd, sizeof(eqd), 0); + /* silent retry*/ + if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + /* simple response */ if (ret >= 0) { TRACE("read EQ = %d\n", ret); if (ret < sizeof(struct fi_eq_entry)) { - TRACE("fi_eq_read()=%d, exp=%ld\n", + TRACE("fi_eq_read()=%d, exp=%ld, too small\n", ret, sizeof(struct fi_eq_entry)); return -FI_EINVAL; } - TRACE("=== EQ SUCCESS\n"); + TRACE("EQ RESPONSE\n"); TRACE(" size = %d\n", ret); TRACE(" event = %d\n", event); TRACE(" fid = %p\n", eqd.fid); TRACE(" context = %p\n", eqd.context); TRACE(" data = %lx\n", eqd.data); - if (eqd.context && event == FI_JOIN_COMPLETE) { - jctx = eqd.context; - jctx->retval = 0; - jctx->prov_errno = 0; - return 1; + if (!eqd.context || event != FI_JOIN_COMPLETE) { + TRACE("Unexpected eqd response\n"); + return -FI_EINVAL; } + TRACE("=== EQ SUCCESS\n"); + jctx = eqd.context; + jctx->retval = 0; + jctx->prov_errno = 0; + return FI_SUCCESS; } if (ret == -FI_EAVAIL) { TRACE("read EQ = %d\n", ret); ret = fi_eq_readerr(eq, &eqd, 0); if (ret < sizeof(struct fi_eq_err_entry)) { - TRACE("fi_eq_readerr()=%d, exp=%ld\n", + TRACE("fi_eq_readerr()=%d, exp=%ld too small\n", ret, sizeof(struct fi_eq_err_entry)); return -FI_EINVAL; } @@ -367,17 +375,18 @@ static int _poll_eq(void) TRACE(" fid = %p\n", eqd.fid); TRACE(" context = %p\n", eqd.context); TRACE(" data = %lx\n", eqd.data); - TRACE(" err = %s (%d)\n", - fi_strerror(-eqd.err), eqd.err); + TRACE(" err = %s (%d)\n", fi_strerror(-eqd.err), eqd.err); TRACE(" prov_err= %d\n", eqd.prov_errno); TRACE(" err_data= %p\n", eqd.err_data); TRACE(" err_size= %ld\n", eqd.err_data_size); - if (eqd.context) { - jctx = eqd.context; - jctx->retval = eqd.err; - jctx->prov_errno = eqd.prov_errno; - return 1; + if (!eqd.context) { + TRACE("Unexpected eqd response\n"); + return -FI_EINVAL; } + jctx = eqd.context; + jctx->retval = eqd.err; + jctx->prov_errno = eqd.prov_errno; + return FI_SUCCESS; } if (ret != -FI_EAGAIN) { TRACE("read EQ = %d\n", ret); @@ -392,6 +401,9 @@ 
static int _poll_eq(void) void coll_multi_release(struct dlist_entry *joinlist) { struct join_item *jctx; + int poll_count = 0; + int count = 0; + int ret; TRACE("coll_multi_release\n"); while (!dlist_empty(joinlist)) { @@ -399,14 +411,33 @@ void coll_multi_release(struct dlist_entry *joinlist) TRACE("close mc, empty = %d\n", dlist_empty(joinlist)); if (jctx->mc) fi_close(&jctx->mc->fid); + TRACE("free jctx\n"); free(jctx); + count++; } - TRACE("return\n"); + while (count > 0) { + ret = cxip_curl_progress(NULL); + if (ret == -FI_EAGAIN) { + poll_count++; + usleep(10); + continue; + } + if (ret < 0 && ret != -FI_ENODATA) { + TRACE("Curl progress failed, count=%d error=%d\n", + count, ret); + break; + } + count--; + } + TRACE("CURL cleanup delay = %d usec\n", 10*poll_count); } -/* initiate join on all sets in setary, and append to joinlist */ -int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) +/* initiate join on all sets in setary, and append to joinlist + * must succeed completely or cleans up and reports failure + */ +int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist, + int limit) { struct join_item *jctx; int i, ret, total, count; @@ -418,9 +449,14 @@ int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) count = 0; for (i = 0; i < total; i++) { jctx = calloc(1, sizeof(*jctx)); - jctx->trace_no = i; - jctx->avset = setary->avset[i]; + if (!jctx) { + TRACE("calloc failed on jctx[%d]\n", i); + ret = -FI_ENOMEM; + goto fail; + } dlist_init(&jctx->entry); + jctx->join_index = i; + jctx->avset = setary->avset[i]; TRACE("join %d of %d initiating\n", i, total); ret = fi_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, setary->avset[i], 0L, &jctx->mc, jctx); @@ -429,16 +465,21 @@ int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) free(jctx); continue; } - TRACE("join %d continuing ret=%d\n", i, ret); if (ret != FI_SUCCESS) { - TRACE("join %d FAILED\n", ret); + TRACE("join %d FAILED join %d\n", i, ret); + free(jctx); goto fail; } /* wait for join to complete */ do { _poll_cqs(); ret = _poll_eq(); - } while (ret == 0); + } while (ret == -FI_EAGAIN); + if (ret < 0) { + TRACE("join %d FAILED eq poll %d\n", i, ret); + free(jctx); + goto fail; + } dlist_insert_tail(&jctx->entry, joinlist); count++; } @@ -446,7 +487,7 @@ int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) return FI_SUCCESS; fail: - TRACE("TEST failed\n"); + TRACE("MULTIJOIN failed\n"); coll_multi_release(joinlist); return ret; } @@ -499,8 +540,8 @@ struct join_item *coll_single_join(fi_addr_t *fiaddrs, size_t size, } dlist_init(joinlist); - ret = coll_multi_join(setary, joinlist); - if (ret) { + ret = coll_multi_join(setary, joinlist, -1); + if (ret < 0) { TRACE("%s JOIN coll_multi_join()=%d\n", msg, ret); goto quit; } @@ -526,46 +567,6 @@ struct join_item *coll_single_join(fi_addr_t *fiaddrs, size_t size, return NULL; } -#if 0 -int _test_multi_barrier(struct avset_ary *setary, struct dlist_entry *joinlist, - int N, long *nsec_delay, int total_secs) -{ - struct timespec *nsec_times, nsec_start; - int i, ret; - - nsec_times = calloc(sizeof(struct timespec), N); - ret = coll_init_multi_join(setary, joinlist); - if (ret) { - TRACE("multicast_join init error = %d\n", ret); - goto quit; - } - ret = coll_wait_multi_join(joinlist); - if (ret) { - TRACE("multicast_join wait error = %d\n", ret); - goto quit; - } - - _nsecs_from_now(&nsec_start, 0L); - nsec_start.tv_sec += total_secs; - - for (i = 0; i < N; i++) - 
_nsecs_from_now(&nsec_times[i], nsec_delay[i]); - while (!_nsecs_expired(&nsec_start)) { - for (i = 0; i < N; i++) { - if (!_nsecs_expired(&nsec_times[i])) - continue; - for (j = 0; j < ) - } - - } -quit: - free(nsec_times); - coll_multi_releasejoinlist); - avset_ary_destroy(setary); - return ret; -} -#endif - int _simple_join(fi_addr_t *fiaddrs, size_t size, struct avset_ary *setary, struct dlist_entry *joinlist) @@ -578,8 +579,8 @@ int _simple_join(fi_addr_t *fiaddrs, size_t size, return ret; dlist_init(joinlist); - ret = coll_multi_join(setary, joinlist); - if (ret) + ret = coll_multi_join(setary, joinlist, -1); + if (ret < 0) return ret; return 0; @@ -590,43 +591,28 @@ uint64_t _simple_get_mc(struct dlist_entry *joinlist) struct join_item *jctx; jctx = dlist_first_entry_or_null(joinlist, struct join_item, entry); + if (jctx == NULL) { + TRACE("Join item is NULL\n"); + return 0; + } return (uint64_t)jctx->mc; } void _simple_join_release(struct avset_ary *setary, struct dlist_entry *joinlist) { - coll_multi_release(joinlist); - avset_ary_destroy(setary); -} - -/** - * @brief Simple test of join/delete returns a count of errors. - * - * This creates a single avset_ary from the supplied addresses, with hwroot - * of zero, and performs a single join, tests errors, and cleans up. Used to - * probe the basic error conditions. - */ -int _test_join(fi_addr_t *fiaddrs, size_t size) -{ - struct avset_ary setary; - struct dlist_entry joinlist; - int ret; - - ret = _simple_join(fiaddrs, size, &setary, &joinlist); - _simple_join_release(&setary, &joinlist); - - return ret; + coll_join_cleanup(setary, joinlist); } -/* Simple test of barrier, returns a count of errors. */ -int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) +/* Simple test of count barriers, returns a count of errors. */ +int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) { struct avset_ary setary; struct dlist_entry joinlist; uint64_t context; uint64_t mc; - int i, ret, total; + int i, ret; TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); ret = _simple_join(fiaddrs, size, &setary, &joinlist); @@ -651,7 +637,6 @@ int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) TRACE("spin 1...\n"); _wait_cqs(&context); TRACE("BARRIER COMPLETE #%d\n", i); - total++; } else { TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); goto quit; @@ -661,20 +646,22 @@ int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) quit: TRACE("BARRIER exit\n"); - frmwk_log0("Barrier total=%d\n", total); + if (metrics) + cxip_coll_get_metrics(metrics); _simple_join_release(&setary, &joinlist); return ret; } -/* Simple test of broadcast, returns a count of errors. */ -int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) +/* Simple test of count broadcasts, returns a count of errors. 
*/ +int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) { struct avset_ary setary; struct dlist_entry joinlist; uint64_t data[4], rslt[4]; uint64_t context; uint64_t mc; - int i, ret; + int i, root, ret; TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); @@ -691,49 +678,56 @@ int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) goto quit; } - data[0] = 0x12345678; - data[1] = 0x2468ace0; - data[2] = 0x13579bdf; - data[3] = 0x10101010; - memset(rslt, 0, sizeof(rslt)); - if (frmwk_rank == rootidx) - memcpy(rslt, data, sizeof(rslt)); - do { - _poll_cqs(); - ret = fi_broadcast(cxit_ep, rslt, 4, NULL, - mc, fiaddrs[rootidx], - FI_UINT64, 0L, &context); - } while (ret == -FI_EAGAIN); - if (ret) - goto quit; + for (i = 0; i < count; i++) { + for (root = 0; root < size; root++) { + data[0] = i; + data[1] = root; + data[2] = 0x13579bdf; + data[3] = 0x10101010; + memset(rslt, 0, sizeof(rslt)); + if (frmwk_rank == root) + memcpy(rslt, data, sizeof(rslt)); + do { + _poll_cqs(); + ret = fi_broadcast(cxit_ep, rslt, 4, NULL, mc, + fiaddrs[root], FI_UINT64, 0L, + &context); + } while (ret == -FI_EAGAIN); + if (ret) + goto quit; - TRACE("spin 1...\n"); - _wait_cqs(&context); - TRACE("BROADCAST COMPLETE\n"); - if (memcmp(rslt, data, sizeof(rslt))) { - for (i = 0; i < 4; i++) - TRACE("[%d] %016lx exp %016lx\n", - i, rslt[i], data[i]); - ret = -1; + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BROADCAST COMPLETE\n"); + if (memcmp(rslt, data, sizeof(rslt))) { + for (i = 0; i < 4; i++) + TRACE("[%d] %016lx exp %016lx\n", + i, rslt[i], data[i]); + ret = -1; + } + } } quit: TRACE("BROADCAST exit\n"); + if (metrics) + cxip_coll_get_metrics(metrics); _simple_join_release(&setary, &joinlist); return ret; } const struct timespec usec1 = {.tv_sec = 0, .tv_nsec = 10000}; -/* simple test of allreduce, returns a count of errors. */ -int _test_allreduce(fi_addr_t *fiaddrs, size_t size) +/* simple test of count allreduce int sums, returns a count of errors. 
*/ +int _test_allreduce_isum(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) { struct avset_ary setary; struct dlist_entry joinlist; int64_t *data, *rslt, *comp; uint64_t context; uint64_t mc; - int r, v, ret; + int i, r, v, ret; TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); @@ -755,29 +749,107 @@ int _test_allreduce(fi_addr_t *fiaddrs, size_t size) data = calloc(frmwk_numranks*4, sizeof(int64_t)); comp = calloc(4, sizeof(int64_t)); rslt = calloc(4, sizeof(int64_t)); - for (v = 0; v < 4; v++) - for (r = 0; r < frmwk_numranks; r++) - data[4*r + v] = 4*r + v; - for (v = 0; v < 4; v++) - for (r = 0; r < frmwk_numranks; r++) - comp[v] += data[4*r + v]; - do { - _poll_cqs(); - ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, - rslt, NULL, mc, FI_INT64, - FI_SUM, 0L, &context); - } while (ret == -FI_EAGAIN); - if (ret) + for (i = 0; i < count; i++) { + memset(data, 0, frmwk_numranks * 4 * sizeof(int64_t)); + memset(comp, 0, 4 * sizeof(int64_t)); + memset(rslt, 0, 4 * sizeof(int64_t)); + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + data[4*r + v] = 4*r + v; + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + comp[v] += data[4*r + v]; + do { + _poll_cqs(); + ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, + rslt, NULL, mc, FI_INT64, + FI_SUM, 0L, &context); + } while (ret == -FI_EAGAIN); + if (ret) + goto quit; + + TRACE("spin...\n"); + _wait_cqs(&context); + TRACE("ALLREDUCE COMPLETE\n"); + for (v = 0; v < 4; v++) { + if (rslt[v] != comp[v]) { + TRACE("[%d] %016lx exp %016lx\n", + v, rslt[v], comp[v]); + ret = 1; + } + } + } + free(rslt); + free(comp); + free(data); + +quit: + TRACE("ALLREDUCE exit\n"); + if (metrics) + cxip_coll_get_metrics(metrics); + _simple_join_release(&setary, &joinlist); + return ret; +} + +/* simple test of allreduce double sums, returns a count of errors. 
*/ +int _test_allreduce_dsum(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + double *data, *rslt, *comp; + uint64_t context; + uint64_t mc; + int i, r, v, ret; + + TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); + + ret = _simple_join(fiaddrs, size, &setary, &joinlist); + if (ret) { + TRACE("join failed\n"); + goto quit; + } + + mc = _simple_get_mc(&joinlist); + if (!mc) { + TRACE("ALLREDUCE MC invalid\n"); + ret = -1; goto quit; + } + if (_is_hwroot(_get_join_jctx(&joinlist, 0))) + nanosleep(&usec1, NULL); - TRACE("spin...\n"); - _wait_cqs(&context); - TRACE("ALLREDUCE COMPLETE\n"); - for (v = 0; v < 4; v++) { - if (rslt[v] != comp[v]) { - TRACE("[%d] %016lx exp %016lx\n", - v, rslt[v], comp[v]); - ret = 1; + data = calloc(frmwk_numranks*4, sizeof(double)); + comp = calloc(4, sizeof(double)); + rslt = calloc(4, sizeof(double)); + ret = 0; + for (i = 0; i < count; i++) { + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + data[4*r + v] = (4*r + v)/1000.0; + for (v = 0; v < 4; v++) { + comp[v] = 0.0; + for (r = 0; r < frmwk_numranks; r++) + comp[v] += data[4*r + v]; + } + do { + _poll_cqs(); + ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, + rslt, NULL, mc, FI_DOUBLE, + FI_SUM, 0L, &context); + } while (ret == -FI_EAGAIN); + if (ret) + goto quit; + + TRACE("spin...\n"); + _wait_cqs(&context); + TRACE("ALLREDUCE COMPLETE\n"); + for (v = 0; v < 4; v++) { + if (fabs(rslt[v] - comp[v]) > 0.00000001) { + TRACE("[%d] %f exp %f\n", + v, rslt[v], comp[v]); + ret = 1; + } } } free(rslt); @@ -786,6 +858,8 @@ quit: TRACE("ALLREDUCE exit\n"); + if (metrics) + cxip_coll_get_metrics(metrics); _simple_join_release(&setary, &joinlist); return ret; } @@ -823,8 +897,39 @@ static uint64_t testmask = 0L; if (skip) break; \ ret = 0 +static uint64_t get_range_mask(char *str) +{ + uint64_t mask = 0L; + char *s, *p; + int i, j; + + while (*str) { + while (*str == ' ') + str++; + s = str; + while (*str && *str != ',') + str++; + if (*str) + *str++ = 0; + p = s; + while (*p && *p != '-') + p++; + if (*p) + *p++ = 0; + i = (*s) ? atoi(s) : 0; + j = (*p) ? atoi(p) : i; + if (j > 63) + j = 63; + while (i <= j) { + mask |= (1L << i++); + } + } + return mask; +} + int main(int argc, char **argv) { + struct cxip_coll_metrics metrics; fi_addr_t *fiaddrs = NULL; fi_addr_t myaddr; struct cxip_addr mycaddr; @@ -832,10 +937,11 @@ size_t size = 0; int errcnt = 0; int tstcnt = 0; + int skpcnt = 0; int tstnum = 0; int ret = 0; - int N = 0; - int S = 1; + int trees = 1; + int opcount = 1; bool help = false; bool trace_muted = true; struct join_item *jctx; @@ -843,53 +949,31 @@ struct avset_ary setary; struct dlist_entry joinlist; const char *testname; char opt; - int i, j; + int i; /* by default, perform all tests */ testmask = -1L; testname = NULL; setvbuf(stdout, NULL, _IONBF, 0); - while ((opt = getopt(argc, argv, "hvVS:Mt:N:")) != -1) { - char *str, *s, *p; - + while ((opt = getopt(argc, argv, "hvVMt:x:N:n:")) != -1) { switch (opt) { case 't': /* perform only selected tests */ - str = optarg; - i = j = 0; - testmask = 0L; - while (*str) { - while (*str == ' ') - str++; - s = str; - while (*str && *str != ',') - str++; - if (*str) - *str++ = 0; - p = s; - while (*p && *p != '-') - p++; - if (*p) - *p++ = 0; - i = (*s) ? atoi(s) : 0; - j = (*p) ? 
atoi(p) : i; - if (j > 63) - j = 63; - while (i <= j) { - testmask |= (1L << i++); - } - } + testmask = get_range_mask(optarg); + break; + case 'x': + /* exclude all selected tests */ + testmask = ~get_range_mask(optarg); break; case 'M': create_multicast = true; break; case 'N': - N = atoi(optarg); + trees = atoi(optarg); break; - case 'S': - S = atoi(optarg); - printf("S = %d\n", S); + case 'n': + opcount = atoi(optarg); break; case 'V': /* tracing is enabled below */ @@ -916,11 +1000,17 @@ int main(int argc, char **argv) do { if (help) { frmwk_log0( - "Usage: t est_coll [-hvV] -M -Ncount [-t testno[-testno][,...]]\n" - " -h generate help and quit.\n" + "Usage: test_coll [-hvV] -M -N treecnt -n opcnt\n" + " [-t testno[-testno][,...]\n" + " [-x testno[-testno][,...]]...\n" + " -h generate help and quit\n" + " -v verbose to stdout\n" + " -V verbose to trace files\n" " -M use multicast model (default unicast model)\n" - " -N iterations (default 1)\n" - " -t test list (default all)\n"); + " -N concurrent trees (default 1)\n" + " -n operation count (default 1)\n" + " -t set test list (default all)\n" + " -x exclude from test list (can be repeated)\n"); break; } @@ -964,8 +1054,8 @@ int main(int argc, char **argv) ret = 0; tstcnt += 1; errcnt += !!ret; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -981,8 +1071,8 @@ int main(int argc, char **argv) avset_ary_destroy(&setary); tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -1007,12 +1097,12 @@ int main(int argc, char **argv) errcnt += !!(setary.avset_cnt != 0); errcnt += !!(setary.avset_siz != 0); tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; - /* Sanity test for _test_join() utility function. + /* Sanity test for coll_single_join(). */ do { PREAMBLE(0, tstnum, "test join (simple)"); @@ -1025,8 +1115,8 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -1042,8 +1132,8 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -1059,15 +1149,17 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test zbcoll transient failure to acquire a group ID. + */ do { PREAMBLE(0, tstnum, "force -FI_EAGAIN on root getgroup"); // cause zbcoll root (rank 0) to reject getgroup requests once - cxip_trap_set(0, CXIP_TRAP_GETGRP, -FI_EAGAIN); + cxip_trap_set(0, CXIP_TRAP_GETGRP, -FI_EAGAIN, 0); // cause non-root ranks attempt zbcoll getgroup first if (frmwk_rank == 0) usleep(10000); @@ -1076,285 +1168,161 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test zbcoll transient failure to perform a broadcast. 
+ */ do { PREAMBLE(0, tstnum, "force -FI_EAGAIN on root broadcast"); // cause zbcoll root (rank 0) to reject broadcast requests once - cxip_trap_set(0, CXIP_TRAP_BCAST, -FI_EAGAIN); + cxip_trap_set(0, CXIP_TRAP_BCAST, -FI_EAGAIN, 0); jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, &setary, &joinlist, "FI_EAGAIN root bcast"); coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test zbcoll transient failure to perform a reduce. + */ do { PREAMBLE(0, tstnum, "force -FI_EAGAIN on root reduce"); // cause zbcoll root (rank 0) to reject join reduce once - cxip_trap_set(0, CXIP_TRAP_REDUCE, -FI_EAGAIN); + cxip_trap_set(0, CXIP_TRAP_REDUCE, -FI_EAGAIN, 0); jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, &setary, &joinlist, "FI_EAGAIN root reduce"); coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; -#if 0 + /* Test failure to acquire a PTE. + */ do { - PREAMBLE(0, tstnum, "force -FI_EFAULT on PTE alloc"); + PREAMBLE(0, tstnum, "force -FI_EAVAIL on PTE alloc"); // cause zbcoll root (rank 0) to simulate PTE alloc failure - cxip_trap_set(0, CXIP_TRAP_INITPTE, -FI_EFAULT); - ret = _test_join(fiaddrs, size, -FI_EAVAIL, - CXIP_PROV_ERRNO_PTE); + cxip_trap_set(0, CXIP_TRAP_INITPTE, -FI_EAVAIL, + FI_CXI_ERRNO_JOIN_FAIL_PTE); + jctx = coll_single_join(fiaddrs, size, 0, 0, + 0, 0, + &setary, &joinlist, + "fail PTE alloc"); + TRACE("Aborting\n"); tstcnt += 1; + ret = 0; errcnt += !!ret; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); - } while (0); - tstnum++; -#endif - - do { - struct cxip_coll_mc *mc_obj; - struct cxip_coll_reduction *reduction; - struct cxip_coll_data coll_data; - int ret; - - PREAMBLE(0, tstnum, "test single packet send"); - // Create multicast and send packet through HWRoot - TRACE("======= %s\n", testname); - TRACE("starting join\n"); - - /* root is index 0, others are leaves */ - jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, - &setary, &joinlist, "simple"); - TRACE("completed join jctx = %p\n", jctx); - mc_obj = (struct cxip_coll_mc *)jctx->mc; - mc_obj->arm_disable = true; - mc_obj->retry_disable = true; - TRACE("S=%d rank=%d hwroot=%d\n", S, frmwk_rank, - mc_obj->hwroot_idx); - reduction = &mc_obj->reduction[0]; - coll_data.red_cnt = 1; - coll_data.intval.ival[0] = 1234; - coll_data.intval.ival[1] = frmwk_rank; - memset(&reduction->accum, 0, sizeof(reduction->accum)); - if (frmwk_rank == S) { - TRACE("test starting send on %d\n", S); - do { - ret = cxip_coll_send_red_pkt( - reduction, &coll_data, - false, false); - TRACE("send result = %d\n", ret); - } while (ret == -FI_EAGAIN); - TRACE("completed send = %d\n", ret); - } - while (1) - _poll_cqs(); - - coll_join_cleanup(&setary, &joinlist); - errcnt += !!!jctx; - tstcnt += 1; frmwk_log0("%4s\n", STDMSG(ret)); - frmwk_barrier(); } while (0); tstnum++; -/*###############################################################*/ + /* Placeholder + */ do { - uint64_t context; - - PREAMBLE(0, tstnum, "test barrier (simple)"); - // Test single join over one array list + PREAMBLE(0, tstnum, "(not implemented)"); + // Placeholder, preserve to keep other numbering the same TRACE("======= %s\n", testname); - TRACE("[%d] starting join\n", frmwk_rank); - jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, - &setary, &joinlist, "simple"); - TRACE("completed join jctx = %p\n", 
jctx); - TRACE("start barrier\n"); - do { - ret = fi_barrier(cxit_ep, (fi_addr_t )jctx->mc, - &context); - TRACE("barrier = %d\n", ret); - } while (ret == -FI_EAGAIN); - - if (ret == FI_SUCCESS) { - TRACE("spin 1...\n"); - _wait_cqs(&context); - TRACE("BARRIER COMPLETE #%d\n", i); - } else { - TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); - errcnt++; - } - coll_join_cleanup(&setary, &joinlist); - errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + ret = 0; frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test opcount barriers. + */ do { - PREAMBLE(0, tstnum, "perform barrier"); - TRACE("Starting barrier\n"); - ret = _test_barrier(fiaddrs, size, 1); + PREAMBLE(0, tstnum, "perform barrier x opcount (default 1)"); + ret = _test_barrier(fiaddrs, size, opcount, &metrics); errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + fprintf(stdout, + "reductions [%2d] %s " + "full:%-4ld none:%-4ld part:%-4ld bad: %-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? "root" : "leaf", + metrics.red_count_full, + metrics.red_count_unreduced, + metrics.red_count_partial, + metrics.red_count_bad); frmwk_barrier(); - } while (0); - tstnum++; - - do { - PREAMBLE(0, tstnum, "perform broadcast"); - for (i = 0; i < frmwk_numranks; i++) { - ret = _test_broadcast(fiaddrs, size, i); - errcnt += !!ret; - } - tstcnt += 1; frmwk_log0("%4s\n", STDMSG(ret)); - frmwk_barrier(); } while (0); tstnum++; + /* Test opcount broadcasts. + */ do { - PREAMBLE(0, tstnum, "perform allreduce sum"); - ret = _test_allreduce(fiaddrs, size); - TRACE("allreduce ret = %d\n", ret); + PREAMBLE(0, tstnum, "perform broadcast x opcount (default 1)"); + ret = _test_broadcast(fiaddrs, size, opcount, &metrics); errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + fprintf(stdout, + "reductions [%2d] %s " + "bad: %-4ld full:%-4ld part:%-4ld none:%-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? "root" : "leaf", + metrics.red_count_bad, + metrics.red_count_full, + metrics.red_count_partial, + metrics.red_count_unreduced); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test opcount int64 sum reductions + */ do { - PREAMBLE(0, tstnum, "perform barrier x N"); - ret = _test_barrier(fiaddrs, size, N); + PREAMBLE(0, tstnum, "perform allreduce int64 sum x opcount (default 1)"); + ret = _test_allreduce_isum(fiaddrs, size, opcount, &metrics); + TRACE("allreduce ret = %d\n", ret); errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + fprintf(stdout, + "reductions [%2d] %s " + "bad: %-4ld full:%-4ld part:%-4ld none:%-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? 
"root" : "leaf", + metrics.red_count_bad, + metrics.red_count_full, + metrics.red_count_partial, + metrics.red_count_unreduced); frmwk_barrier(); - } while (0); - tstnum++; - - do { - PREAMBLE(0, tstnum, "test mcast dup"); - avset_ary_init(&setary); - TRACE("avset initialized\n"); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 1 = %d\n", ret); - ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); - TRACE("avset append 2 = %d\n", ret); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("join = %d\n", ret); - - jctx = _get_join_jctx(&joinlist, 0); - TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval || jctx->prov_errno) { - TRACE("unexpected result on coll 0\n"); - errcnt++; - } - jctx = _get_join_jctx(&joinlist, 1); - TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval != -FI_EAVAIL || - jctx->prov_errno != CXIP_PROV_ERRNO_MCAST_INUSE) { - TRACE("unexpected result on coll 1\n"); - errcnt++; - } - tstcnt += 1; - - frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); - } while (0); - tstnum++; - - do { - PREAMBLE(0, tstnum, "test hwroot dup"); - avset_ary_init(&setary); - TRACE("avset initialized\n"); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 1 = %d\n", ret); - ret = avset_ary_append(fiaddrs, size, 1, 0, &setary); - TRACE("avset append 2 = %d\n", ret); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("join = %d\n", ret); - - jctx = _get_join_jctx(&joinlist, 0); - TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval || jctx->prov_errno) { - TRACE("unexpected result on coll 0\n"); - errcnt++; - } - jctx = _get_join_jctx(&joinlist, 1); - TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval != -FI_EAVAIL || - jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { - TRACE("unexpected result on coll 1\n"); - errcnt++; - } - tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); } while (0); tstnum++; + /* Test opcount double sum reductions + */ do { - PREAMBLE(0, tstnum, "test hwroot and mcast dup"); - avset_ary_init(&setary); - TRACE("avset initialized\n"); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 1 = %d\n", ret); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 2 = %d\n", ret); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("join = %d\n", ret); - - jctx = _get_join_jctx(&joinlist, 0); - TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval || jctx->prov_errno) { - TRACE("unexpected result on coll 0\n"); - errcnt++; - } - jctx = _get_join_jctx(&joinlist, 1); - TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval != -FI_EAVAIL || - jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { - TRACE("unexpected result on coll 1\n"); - errcnt++; - } + PREAMBLE(0, tstnum, "perform allreduce double sum x opcount (default 1)"); + ret = _test_allreduce_dsum(fiaddrs, size, opcount, &metrics); + TRACE("allreduce ret = %d\n", ret); + errcnt += !!ret; tstcnt += 1; - + fprintf(stdout, + "reductions [%2d] %s " + "bad: %-4ld full:%-4ld part:%-4ld 
none:%-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? "root" : "leaf", + metrics.red_count_bad, + metrics.red_count_full, + metrics.red_count_partial, + metrics.red_count_unreduced); + frmwk_barrier(); frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); } while (0); tstnum++; @@ -1363,18 +1331,18 @@ int main(int argc, char **argv) avset_ary_init(&setary); TRACE("avset initialized\n"); - for (i = 0; i < N; i++) { + for (i = 0; i < trees; i++) { ret = avset_ary_append(fiaddrs, size, i, i, &setary); TRACE("avset append %d = %d\n", i, ret); } dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); + ret = coll_multi_join(&setary, &joinlist, -1); TRACE("multijoin = %d\n", ret); - for (i = 0; i < N; i++) { + for (i = 0; i < trees; i++) { int exp_ret = (i < size) ? 0 : -FI_EAVAIL; - int exp_errno = (i < size) ? 0 : CXIP_PROV_ERRNO_HWROOT_INUSE; + int exp_errno = (i < size) ? 0 : FI_CXI_ERRNO_JOIN_HWROOT_INUSE; int good; jctx = _get_join_jctx(&joinlist, i); @@ -1391,108 +1359,23 @@ int main(int argc, char **argv) } tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); coll_multi_release(&joinlist); avset_ary_destroy(&setary); - } while (0); - tstnum++; - - - do { - PREAMBLE(0, tstnum, "test multiple broadcast"); - - uint64_t **datary, *ctxary, *ctxptr; - int in_progress, tree, root, i, j; - - /* set up maximum number of trees possible */ - avset_ary_init(&setary); - for (tree = 0; tree < size; tree++) { - ret = avset_ary_append(fiaddrs, size, tree, tree, &setary); - TRACE("avset append group %d = %d\n", tree, ret); - } - TRACE("avset initialized\n"); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("multijoin = %d\n", ret); - - /* context and data for each collective tree */ - ctxary = calloc(size, sizeof(uint64_t)); - datary = calloc(size, sizeof(void *)); - for (tree = 0; tree < size; tree++) { - datary[tree] = calloc(4, sizeof(uint64_t)); - ctxary[tree] = tree; - } - - /* repeat the collective N times as requested*/ - for (i = 0; i < N; i++) { - in_progress = 0; - - /* rotate root every time */ - root = i%size; - - /* start a broadcast on every tree */ - for (tree = 0; tree < size; tree++) { - uint64_t id = (uint64_t)tree << 32; - - /* prepare the data */ - memset(datary[tree], 0, 4*sizeof(uint64_t)); - if (frmwk_rank == root) { - for (j = 0; j < 4; j++) - datary[tree][j] = id|root; - } - TRACE("strt=%d tree=%d\n", i, tree); - for (j = 0; j < 4; j++) - TRACE(" %016lx\n", datary[tree][j]); - - } - for (tree = 0; tree < size; tree++) { - int tree2 = (tree + frmwk_rank)%size; - - usleep(rand() % 100); - jctx = _get_join_jctx(&joinlist, tree2); - ret = fi_broadcast(cxit_ep, datary[tree2], 4, NULL, - (fi_addr_t )jctx->mc, - fiaddrs[root], FI_UINT64, - 0L, &ctxary[tree2]); - in_progress++; - TRACE("in_progress=%d\n", in_progress); - if ((ctxptr = _poll_cqs())) { - in_progress--; - TRACE("ctxptr=%ld in_progress=%d\n", - *ctxptr, in_progress); - } - } - while (in_progress > 0) { - if ((ctxptr = _poll_cqs())) { - in_progress--; - TRACE("ctxptr=%ld in_progress=%d\n", - *ctxptr, in_progress); - } - } - for (tree = 0; tree < size; tree++) { - TRACE("rslt=%d tree=%d\n", i, tree); - for (j = 0; j < 4; j++) - TRACE(" %016lx\n", datary[tree][j]); - - } - } - tstcnt += 1; - + frmwk_barrier(); frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); } while (0); tstnum++; + #if 0 + // template for test case // do { PREAMBLE(0, tstnum, "title of test"); ret = 0; // some test 
errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; #endif @@ -1501,8 +1384,11 @@ int main(int argc, char **argv) return (errcnt); done: - frmwk_log0("%2d tests run, %d failures\n", tstcnt, errcnt); - frmwk_log0(!!errcnt ? "ERRORS SEEN\n" : "SUCCESS\n"); + frmwk_log0("\nFinal Report =====================================\n"); + frmwk_barrier(); + frmwk_log("%2d tests skipped, %2d tests run, %d failures\n", + skpcnt, tstcnt, errcnt); + frmwk_log(!!errcnt ? "ERRORS SEEN\n" : "SUCCESS\n"); free(fiaddrs); frmwk_free_libfabric(); frmwk_term(); diff --git a/prov/cxi/test/rma.c b/prov/cxi/test/rma.c index 27990b9a8f8..3527c100bfa 100644 --- a/prov/cxi/test/rma.c +++ b/prov/cxi/test/rma.c @@ -14,6 +14,59 @@ #define RMA_WIN_KEY 0x1f +TestSuite(rma_no_init, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma_no_init, xfer_disable_optimized_mrs_disable_prov_key_cache) +{ + int ret; + bool value; + uint64_t key; + struct mem_region mem_window; + size_t len = 16 * 1024; + uint8_t *send_buf; + struct fi_cq_tagged_entry cqe; + struct cxip_mr_key mr_key; + + send_buf = calloc(1, len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + ret = setenv("CXIP_TEST_PROV_KEY", "1", 1); + cr_assert_eq(ret, 0); + + cxit_setup_rma(); + + value = false; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_OPTIMIZED_MRS, &value); + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + + value = false; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_PROV_KEY_CACHE, &value); + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + + ret = mr_create(len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0, &key, + &mem_window); + cr_assert_eq(ret, FI_SUCCESS); + + mr_key.raw = key; + cr_assert(mr_key.opt == 0); + + ret = fi_write(cxit_ep, send_buf, len, NULL, cxit_ep_fi_addr, 0, key, + NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + mr_destroy(&mem_window); + cxit_teardown_rma(); + free(send_buf); +} + TestSuite(rma, .init = cxit_setup_rma, .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); @@ -580,9 +633,10 @@ void cxit_rma_setup_no_rma_events(void) } /* Test HRP Put */ -Test(rma_opt, hrp, +Test(rma_opt_hrp, hrp, .init = cxit_rma_setup_no_rma_events, - .fini = cxit_teardown_rma) + .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT) { int ret; uint64_t hrp_acks_start; @@ -1543,6 +1597,143 @@ Test(rma_sel, selective_completion_suppress, free(send_buf); } +Test(rma_sel, fi_more_write_stream_optimized, + .init = cxit_setup_rma_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = 0x0; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma = {}; + unsigned int write_count = 0; + struct fid_cntr *cntr = cxit_write_cntr; + + mr_create(0, FI_REMOTE_WRITE, 0, &key_val, &mem_window); + + rma.key = key_val; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + do { + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + write_count++; + } while (ret != -FI_EAGAIN); + + cr_assert(write_count >= cxit_fi_hints->tx_attr->size); + + do { + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); 
+	write_count++;
+
+	ret = fi_writemsg(cxit_ep, &msg, 0);
+	cr_assert(ret == FI_SUCCESS);
+	write_count++;
+
+	ret = fi_cntr_wait(cntr, write_count, 10000);
+	cr_assert(ret == FI_SUCCESS, "ret=%d", ret);
+
+	mr_destroy(&mem_window);
+}
+
+Test(rma_sel, fi_more_write_stream_mix_optimized_unoptimized,
+     .init = cxit_setup_rma_selective_completion_suppress,
+     .fini = cxit_teardown_rma)
+{
+	int ret;
+	struct mem_region opt_mem_window;
+	struct mem_region mem_window;
+	uint64_t opt_key_val = 0x0;
+	uint64_t key_val = 0x1234;
+	struct fi_msg_rma msg = {};
+	struct fi_rma_iov rma = {};
+	unsigned int write_count = 0;
+	struct fid_cntr *cntr = cxit_write_cntr;
+
+	mr_create(0, FI_REMOTE_WRITE, 0, &opt_key_val, &opt_mem_window);
+	mr_create(0, FI_REMOTE_WRITE, 0, &key_val, &mem_window);
+
+	rma.key = opt_key_val;
+	msg.rma_iov = &rma;
+	msg.rma_iov_count = 1;
+	msg.addr = cxit_ep_fi_addr;
+
+	do {
+		ret = fi_writemsg(cxit_ep, &msg, FI_MORE);
+		cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN));
+		if (ret == FI_SUCCESS)
+			write_count++;
+	} while (ret != -FI_EAGAIN);
+
+	cr_assert(write_count >= cxit_fi_hints->tx_attr->size);
+
+	rma.key = key_val;
+	do {
+		ret = fi_writemsg(cxit_ep, &msg, FI_MORE);
+	} while (ret == -FI_EAGAIN);
+	cr_assert(ret == FI_SUCCESS);
+	write_count++;
+
+	ret = fi_writemsg(cxit_ep, &msg, 0);
+	cr_assert(ret == FI_SUCCESS, "ret=%d", ret);
+	write_count++;
+
+	ret = fi_cntr_wait(cntr, write_count, 10000);
+	cr_assert(ret == FI_SUCCESS, "ret=%d", ret);
+
+	mr_destroy(&mem_window);
+	mr_destroy(&opt_mem_window);
+}
+
+Test(rma_sel, fi_more_read_stream,
+     .init = cxit_setup_rma_selective_completion_suppress,
+     .fini = cxit_teardown_rma)
+{
+	int ret;
+	struct mem_region mem_window;
+	uint64_t key_val = 0x0;
+	struct fi_msg_rma msg = {};
+	struct fi_rma_iov rma = {};
+	unsigned int count = 0;
+	struct fid_cntr *cntr = cxit_read_cntr;
+
+	mr_create(0, FI_REMOTE_READ, 0, &key_val, &mem_window);
+
+	rma.key = key_val;
+	msg.rma_iov = &rma;
+	msg.rma_iov_count = 1;
+	msg.addr = cxit_ep_fi_addr;
+
+	do {
+		ret = fi_readmsg(cxit_ep, &msg, FI_MORE);
+		cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN));
+		if (ret == FI_SUCCESS)
+			count++;
+	} while (ret != -FI_EAGAIN);
+
+	cr_assert(count >= cxit_fi_hints->tx_attr->size);
+
+	do {
+		ret = fi_readmsg(cxit_ep, &msg, FI_MORE);
+	} while (ret == -FI_EAGAIN);
+	cr_assert(ret == FI_SUCCESS);
+	count++;
+
+	ret = fi_readmsg(cxit_ep, &msg, 0);
+	cr_assert(ret == FI_SUCCESS);
+	count++;
+
+	ret = fi_cntr_wait(cntr, count, 10000);
+	cr_assert(ret == FI_SUCCESS, "ret=%d", ret);
+
+	mr_destroy(&mem_window);
+}
+
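A note on the pattern shared by the three stream tests above (a minimal sketch with assumed names ep and msg, retries elided; the queue-full -FI_EAGAIN behavior is what these CXI tests expect, not a portable guarantee): FI_MORE tells the provider that more operations follow immediately, so it may queue work without flushing it to hardware, and one final post without FI_MORE flushes the whole stream.

	do {
		/* queue writes; the provider may defer the hardware flush */
		ret = fi_writemsg(ep, &msg, FI_MORE);
	} while (ret == FI_SUCCESS);
	/* ret is now -FI_EAGAIN: the internal queue is full */

	/* a final post without FI_MORE flushes the entire stream */
	ret = fi_writemsg(ep, &msg, 0);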
 /* Test remote counter events with RMA */
 Test(rma, rem_cntr)
 {
@@ -1796,6 +1987,102 @@ Test(rma, invalid_read_target_opt_mr_key)
 	rma_invalid_read_target_mr_key(0x10);
 }
 
+/* Tests to verify FI_RM_ENABLED */
+
+static void mr_overrun(bool write, bool use_cq)
+{
+	int ret;
+	uint8_t *local;
+	size_t good_len = 4096;
+	uint64_t key_val = 0xa;
+	struct fi_cq_err_entry err;
+	struct fi_cq_tagged_entry cqe;
+	struct mem_region remote;
+
+	/* Create over-sized local buffer */
+	local = calloc(1, good_len * 2);
+	cr_assert_not_null(local, "local alloc failed");
+
+	mr_create(good_len, write ? FI_REMOTE_WRITE : FI_REMOTE_READ, 0xc0,
+		  &key_val, &remote);
+
+	/* Perform good length data transfer first */
+	if (write) {
+		ret = fi_write(cxit_ep, local, good_len, NULL, cxit_ep_fi_addr, 0,
+			       key_val, NULL);
+		cr_assert_eq(ret, FI_SUCCESS, "fi_write() failed (%d)", ret);
+	} else {
+		ret = fi_read(cxit_ep, local, good_len, NULL, cxit_ep_fi_addr, 0,
+			      key_val, NULL);
+		cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret);
+	}
+
+	if (use_cq) {
+		/* Wait for async event indicating data has been sent */
+		ret = cxit_await_completion(cxit_tx_cq, &cqe);
+		cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret);
+
+		validate_tx_event(&cqe, FI_RMA | (write ? FI_WRITE : FI_READ), NULL);
+	} else {
+		while (fi_cntr_read(write ? cxit_write_cntr : cxit_read_cntr) != 1)
+			;
+	}
+
+	/* Validate read data */
+	for (int i = 0; i < good_len; i++)
+		cr_expect_eq(local[i], remote.mem[i],
+			     "data mismatch, element: (%d) %02x != %02x\n", i,
+			     local[i], remote.mem[i]);
+
+	/* Perform overrun data transfer */
+	if (write) {
+		ret = fi_write(cxit_ep, local, good_len*2, NULL, cxit_ep_fi_addr,
+			       0, key_val, NULL);
+		cr_assert_eq(ret, FI_SUCCESS, "fi_write() failed (%d)", ret);
+	} else {
+		ret = fi_read(cxit_ep, local, good_len*2, NULL, cxit_ep_fi_addr,
+			      0, key_val, NULL);
+		cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret);
+	}
+
+	if (use_cq) {
+		/* Wait for async event indicating data has been sent */
+		ret = cxit_await_completion(cxit_tx_cq, &cqe);
+		cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret);
+		ret = fi_cq_readerr(cxit_tx_cq, &err, 1);
+		cr_assert(ret == 1);
+		cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err);
+	} else {
+		while (fi_cntr_readerr(write ? cxit_write_cntr : cxit_read_cntr) != 1)
+			;
+	}
+
+	mr_destroy(&remote);
+	free(local);
+}
+
+Test(rma, read_mr_overrun_cq)
+{
+	mr_overrun(false, true);
+}
+
+Test(rma, write_mr_overrun_cq)
+{
+	mr_overrun(true, true);
+}
+
+Test(rma, read_mr_overrun_cntr)
+{
+	mr_overrun(false, false);
+}
+
+Test(rma, write_mr_overrun_cntr)
+{
+	mr_overrun(true, false);
+}
+
 static void rma_hybrid_mr_desc_test_runner(bool write, bool cq_events)
 {
 	struct mem_region source_window;
diff --git a/prov/cxi/test/rocr.c b/prov/cxi/test/rocr.c
index 3d9567e133e..3328d4bb103 100644
--- a/prov/cxi/test/rocr.c
+++ b/prov/cxi/test/rocr.c
@@ -761,3 +761,85 @@ Test(hsa, verify_hmemDevReg_fine)
 	verify_dev_reg_handle(true, FINE);
 }
+
+Test(hsa, dmabuf_offset)
+{
+	hsa_status_t hsa_ret;
+	void *bufs[2];
+	int ret;
+	int i;
+	struct fid_mr *mrs[2];
+	size_t size = 1024 * 1024;
+
+	ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 1);
+	cr_assert_eq(ret, 0, "setenv failed: %d", -errno);
+
+	ret = setenv("FI_MR_ROCR_CACHE_MONITOR_ENABLED", "0", 1);
+	cr_assert_eq(ret, 0, "setenv failed: %d", -errno);
+
+	cxit_setup_msg();
+
+	hsa_ret = hsa_memory_allocate(coarse_grain, size, &bufs[0]);
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d",
+		     hsa_ret);
+
+	ret = fi_mr_reg(cxit_domain, bufs[0], size, FI_READ | FI_WRITE, 0, 0, 0,
+			&mrs[0], NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret);
+
+	hsa_ret = hsa_memory_allocate(coarse_grain, size, &bufs[1]);
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d",
+		     hsa_ret);
+
+	ret = fi_mr_reg(cxit_domain, bufs[1], size, FI_READ | FI_WRITE, 0, 0, 0,
+			&mrs[1], NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret);
+
+	for (i = 0; i < 2; i++) {
+		ret = fi_close(&(mrs[i]->fid));
+		cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret);
+
+
hsa_ret = hsa_memory_free(bufs[i]); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + } + + cxit_teardown_msg(); +} + +Test(hsa, dmabuf_stress) +{ + hsa_status_t hsa_ret; + int ret; + int i; + void *buf; + size_t size = 1024 * 1024; + struct fid_mr *mr; + + ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + ret = setenv("FI_MR_ROCR_CACHE_MONITOR_ENABLED", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + hsa_ret = hsa_memory_allocate(coarse_grain, size, &buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", + hsa_ret); + + cxit_setup_msg(); + + for (i = 0; i < 2048; i++) { + ret = fi_mr_reg(cxit_domain, buf, size, FI_READ | FI_WRITE, + 0, 0, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_close(&mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + } + + cxit_teardown_msg(); + + hsa_ret = hsa_memory_free(buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); +} diff --git a/prov/cxi/test/startvm-setup.sh b/prov/cxi/test/startvm-setup.sh index 0e79f611a86..ea660279f03 100755 --- a/prov/cxi/test/startvm-setup.sh +++ b/prov/cxi/test/startvm-setup.sh @@ -17,7 +17,7 @@ modprobe ptp modprobe iommu_v2 || modprobe amd_iommu_v2 insmod $DBS_DIR/slingshot_base_link/cxi-sbl.ko insmod $DBS_DIR/sl-driver/knl/cxi-sl.ko -insmod $DBS_DIR/cxi-driver/cxi/cxi-core.ko disable_default_svc=0 +insmod $DBS_DIR/cxi-driver/cxi/cxi-ss1.ko disable_default_svc=0 insmod $DBS_DIR/cxi-driver/cxi/cxi-user.ko insmod $DBS_DIR/cxi-driver/cxi/cxi-eth.ko insmod $DBS_DIR/kdreg2/kdreg2.ko diff --git a/prov/cxi/test/startvm.sh b/prov/cxi/test/startvm.sh index 933bd082fed..97271732d4d 100755 --- a/prov/cxi/test/startvm.sh +++ b/prov/cxi/test/startvm.sh @@ -67,7 +67,7 @@ else DEVICE=$(cat /sys/class/cxi/cxi0/device/virtfn0/device) # Unbind VF from cxi core driver. 
cxi1 no longer exists
-	echo $PCIFN > /sys/bus/pci/drivers/cxi_core/unbind
+	echo $PCIFN > /sys/bus/pci/drivers/cxi_ss1/unbind
 
 	# Bind the VF to vfio driver
 	modprobe vfio_pci
diff --git a/prov/cxi/test/tagged.c b/prov/cxi/test/tagged.c
index e711767f308..b486a340f60 100644
--- a/prov/cxi/test/tagged.c
+++ b/prov/cxi/test/tagged.c
@@ -5475,7 +5475,262 @@ Test(tagged_src_err, addr)
 
 TestSuite(tagged_cq_wait, .init = cxit_setup_rma_fd,
 	  .fini = cxit_teardown_rma_fd,
-	  .timeout = CXIT_DEFAULT_TIMEOUT);
+	  .timeout = 20);
+
+Test(tagged_cq_wait, timeout_poll)
+{
+	struct fid *fids[1];
+	int cq_fd;
+	int ret;
+	struct pollfd fds;
+	int timeout = 100;
+	uint64_t end_ms;
+	uint64_t start_ms;
+
+	sleep(1);
+
+	ret = fi_control(&cxit_rx_cq->fid, FI_GETWAIT, &cq_fd);
+	cr_assert_eq(ret, FI_SUCCESS, "Get RX CQ wait FD %d", ret);
+
+	fids[0] = &cxit_rx_cq->fid;
+	ret = fi_trywait(cxit_fabric, fids, 1);
+	cr_assert_eq(ret, FI_SUCCESS, "Unexpected fi_trywait return %d\n",
+		     ret);
+
+	fds.fd = cq_fd;
+	fds.events = POLLIN;
+	start_ms = ofi_gettime_ms();
+	ret = poll(&fds, 1, timeout);
+	cr_assert_eq(ret, 0, "Poll did not time out, %d", ret);
+	end_ms = ofi_gettime_ms();
+	cr_assert(end_ms >= start_ms + timeout,
+		  "Timeout too short %ld ms asked for %d ms",
+		  end_ms - start_ms, timeout);
+}
+
+Test(tagged_cq_wait, timeout_epoll)
+{
+	struct epoll_event ev = {
+		.events = EPOLLIN,
+		.data.u32 = 0,
+	};
+	int ret;
+	int epfd;
+	int waitfd;
+	struct fid *fids[1];
+	int timeout = 100;
+	uint64_t end_ms;
+	uint64_t start_ms;
+
+	sleep(1);
+
+	epfd = epoll_create1(0);
+	cr_assert(epfd >= 0, "epoll_create1() failed %s\n",
+		  strerror(errno));
+
+	ret = fi_control(&cxit_tx_cq->fid, FI_GETWAIT, &waitfd);
+	cr_assert(ret == FI_SUCCESS, "get FD for wait object failed %s\n",
+		  strerror(errno));
+
+	ret = epoll_ctl(epfd, EPOLL_CTL_ADD, waitfd, &ev);
+	cr_assert(ret == 0, "epoll_ctl failed %s\n", strerror(errno));
+
+	fids[0] = &cxit_tx_cq->fid;
+	ret = fi_trywait(cxit_fabric, fids, 1);
+	cr_assert(ret == FI_SUCCESS, "fi_trywait failed %s\n",
+		  fi_strerror(-ret));
+
+	/* Ensure timeout since events should not be outstanding */
+	memset(&ev, 0, sizeof(ev));
+	start_ms = ofi_gettime_ms();
+	ret = epoll_wait(epfd, &ev, 1, timeout);
+	cr_assert(ret == 0, "epoll_wait did not time out\n");
+	end_ms = ofi_gettime_ms();
+	cr_assert(end_ms >= start_ms + timeout,
+		  "Timeout too short %ld ms asked for %d ms",
+		  end_ms - start_ms, timeout);
+
+	close(epfd);
+}
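For reference, the canonical blocking flow these wait tests exercise, condensed (a minimal sketch; cq, fabric, entry and timeout_ms are assumed names, and error handling is elided): fetch the CQ's wait FD once with FI_GETWAIT, then only block when fi_trywait() says it is safe. A -FI_EAGAIN return from fi_trywait() means events are already pending, so the CQ should be read instead of sleeping.

	int fd;
	struct fid *fids[1] = { &cq->fid };

	fi_control(&cq->fid, FI_GETWAIT, &fd);		/* one-time: get the wait FD */
	while (fi_cq_read(cq, &entry, 1) == -FI_EAGAIN) {
		if (fi_trywait(fabric, fids, 1) == FI_SUCCESS) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, timeout_ms);	/* sleep until the FD signals */
		}
	}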
+Test(tagged_cq_wait, timeout_sread)
+{
+	int ret;
+	int timeout = 100;
+	struct fi_cq_tagged_entry rx_cqe;
+	uint64_t end_ms;
+	uint64_t start_ms = ofi_gettime_ms();
+
+	/* No events should be available. Timeout returns -FI_EAGAIN.
+	 */
+	ret = fi_cq_sread(cxit_rx_cq, &rx_cqe, 1, NULL, timeout);
+	cr_assert_eq(ret, -FI_EAGAIN, "Poll did not time out, %s",
+		     fi_strerror(ret));
+	end_ms = ofi_gettime_ms();
+	cr_assert(end_ms >= start_ms + timeout,
+		  "Timeout too short %ld ms asked for %d ms",
+		  end_ms - start_ms, timeout);
+}
+
+struct simple_rx_wait {
+	bool epoll;
+	bool ux_msg;
+};
+
+static void *simple_rx_worker(void *data)
+{
+	struct simple_rx_wait *arg = (struct simple_rx_wait *) data;
+	struct fid *fids[1];
+	int ret;
+	int recv_len = 64;
+	uint8_t *recv_buf;
+	struct fi_cq_tagged_entry rx_cqe;
+	fi_addr_t from;
+	int cq_fd;
+	struct epoll_event ev = {
+		.events = EPOLLIN,
+		.data.u32 = 0,
+	};
+	int epfd;
+	struct pollfd fds;
+	int tries = 0;
+
+	recv_buf = aligned_alloc(s_page_size, recv_len);
+	cr_assert(recv_buf);
+	memset(recv_buf, 0, recv_len);
+
+	ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL,
+		      FI_ADDR_UNSPEC, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret);
+
+	ret = fi_control(&cxit_rx_cq->fid, FI_GETWAIT, &cq_fd);
+	cr_assert_eq(ret, FI_SUCCESS, "Get CQ wait FD %d", ret);
+
+	fids[0] = &cxit_rx_cq->fid;
+
+	/* We want to block waiting for the recv event */
+	if (arg->epoll) {
+		epfd = epoll_create1(0);
+		cr_assert(epfd >= 0, "epoll_create1() failed %s",
+			  strerror(errno));
+
+		ev.data.fd = cq_fd;
+		ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cq_fd, &ev);
+		cr_assert_eq(ret, 0, "epoll_ctl() failed %s", strerror(errno));
+	}
+
+	/* For UX message tests, trywait should return -FI_EAGAIN */
+cqe_not_ready:
+	ret = fi_trywait(cxit_fabric, fids, 1);
+	if (arg->ux_msg) {
+		cr_assert_eq(ret, -FI_EAGAIN, "UX event not ready, ret %s\n",
+			     fi_strerror(-ret));
+		do {
+			ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from);
+		} while (ret == -FI_EAGAIN);
+		cr_assert_eq(ret, 1, "UX message not received\n");
+		goto done;
+	}
+
+	/* No event should be pending, nothing sent yet */
+	if (tries == 0)
+		cr_assert_eq(ret, FI_SUCCESS, "RX CQ event pending ret %d", ret);
+
+	/* Wait for message */
+	if (ret == FI_SUCCESS) {
+		if (arg->epoll) {
+			struct epoll_event evs[1] = {};
+
+			ret = epoll_wait(epfd, evs, 1, 5000);
+		} else {
+			fds.fd = cq_fd;
+			fds.events = POLLIN;
+			ret = poll(&fds, 1, 5000);
+		}
+		cr_assert(ret != 0, "RX poll timed out, ret %d\n", ret);
+		cr_assert(ret > 0, "Unexpected poll error %d\n", ret);
+	}
+
+	/* We can get woken up for the send event, so -FI_EAGAIN
+	 * is possible. Make sure no more than two wakeups occur.
+ */ + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == -FI_EAGAIN && ++tries < 2) + goto cqe_not_ready; + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + +done: + free(recv_buf); + pthread_exit(NULL); +} + +void simple_rx_wait(bool epoll, bool ux_msg) +{ + pthread_t rx_thread; + pthread_attr_t attr = {}; + int ret; + int i; + int send_len = 64; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct simple_rx_wait arg = { + .epoll = epoll, + .ux_msg = ux_msg, + }; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + if (!arg.ux_msg) { + /* Start processing receives */ + ret = pthread_create(&rx_thread, &attr, simple_rx_worker, &arg); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Make sure receive is posted and thread is polling */ + sleep(1); + } + + /* Send 64 byte message to self */ + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + if (arg.ux_msg) { + /* Start processing receives */ + ret = pthread_create(&rx_thread, &attr, simple_rx_worker, &arg); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + } + + ret = pthread_join(rx_thread, NULL); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + free(send_buf); +} + +Test(tagged_cq_wait, simple_rx_epoll) +{ + simple_rx_wait(true, false); +} + +Test(tagged_cq_wait, simple_rx_epoll_ux) +{ + simple_rx_wait(true, true); +} + +Test(tagged_cq_wait, simple_rx_poll) +{ + simple_rx_wait(false, false); +} + +Test(tagged_cq_wait, simple_rx_poll_ux) +{ + simple_rx_wait(false, true); +} struct fd_params { size_t length; @@ -5500,36 +5755,54 @@ static void *tagged_cq_wait_evt_worker(void *data) struct fid *fids[1]; int cq_fd; size_t completions = 0; + struct epoll_event ev = { + .events = EPOLLIN, + .data.u32 = 0, + }; + int epfd; args = (struct tagged_cq_wait_event_args *)data; if (args->poll) { + epfd = epoll_create1(0); + cr_assert(epfd >= 0, "epoll_create1() failed %s", + strerror(errno)); + ret = fi_control(&args->cq->fid, FI_GETWAIT, &cq_fd); cr_assert_eq(ret, FI_SUCCESS, "Get CQ wait FD %d", ret); - fids[0] = &args->cq->fid; + + ev.data.fd = cq_fd; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cq_fd, &ev); + cr_assert_eq(ret, 0, "epoll_ctl() failed %s", + strerror(errno)); } while (completions < args->io_num) { if (args->poll) { + fids[0] = &args->cq->fid; ret = fi_trywait(cxit_fabric, fids, 1); if (ret == FI_SUCCESS) { - struct pollfd fds; - - fds.fd = cq_fd; - fds.events = POLLIN; + struct epoll_event evs[1] = {}; - ret = poll(&fds, 1, args->timeout); - cr_assert_neq(ret, 0, "Poll timed out"); + ret = epoll_wait(epfd, evs, 1, args->timeout); + cr_assert_neq(ret, 0, "%s CQ poll timed out", + args->cq == cxit_tx_cq ? + "TX" : "RX"); cr_assert_eq(ret, 1, "Poll error"); } + ret = fi_cq_read(args->cq, &args->cqe[completions], 1); if (ret == 1) completions++; + + sched_yield(); } else { ret = fi_cq_sread(args->cq, &args->cqe[completions], 1, NULL, args->timeout); - cr_assert_eq(ret, 1, "Completion not received\n"); + cr_assert_eq(ret, 1, + "%s completion not received ret %d\n", + args->cq == cxit_tx_cq ? 
"TX" : "RX", ret); completions++; } } @@ -5577,7 +5850,7 @@ void do_cq_wait(struct fd_params *param) struct tagged_thread_args *rx_args; pthread_t tx_thread; pthread_t rx_thread; - pthread_attr_t attr; + pthread_attr_t attr = {}; struct tagged_cq_wait_event_args tx_evt_args = { .cq = cxit_tx_cq, .io_num = param->num_ios, @@ -5650,14 +5923,14 @@ void do_cq_wait(struct fd_params *param) /* Sends last for expected messaging */ if (!param->ux_msg) { - /* Make sure receive has blocked */ + /* Make RX process first */ sleep(1); - cq_wait_post_sends(tx_args, param); /* Start processing Send events */ ret = pthread_create(&tx_thread, &attr, tagged_cq_wait_evt_worker, (void *)&tx_evt_args); + cq_wait_post_sends(tx_args, param); } /* Wait for the RX/TX event threads to complete */ @@ -5689,11 +5962,13 @@ void do_cq_wait(struct fd_params *param) free(rx_args); } +/* Test multiple threads using poll or sread on both CQ */ ParameterizedTestParameters(tagged_cq_wait, wait_fd) { size_t param_sz; static struct fd_params params[] = { + /* Test direct FI_WAIT_FD polling */ {.length = 1024, .num_ios = 4, .timeout = 5000, @@ -5702,6 +5977,7 @@ ParameterizedTestParameters(tagged_cq_wait, wait_fd) .num_ios = 4, .timeout = 5000, .poll = true}, + /* Test indirect FI_WAIT_FD polling via fi_cq_sread */ {.length = 1024, .num_ios = 4, .timeout = 5000, diff --git a/prov/cxi/test/test.sh b/prov/cxi/test/test.sh old mode 100644 new mode 100755 index ea6a913703f..21914365f87 --- a/prov/cxi/test/test.sh +++ b/prov/cxi/test/test.sh @@ -149,6 +149,8 @@ fork_safe_kdreg2_test=( unlimited_triggered_ops_test=( "FI_CXI_ENABLE_TRIG_OP_LIMIT=0 ./cxitest -j 1 --verbose --filter=\"deferred_work_trig_op_limit/*\" --tap=cxitest-disable-trig-op-limit.tap") +mr_cache_test=("./cxitest --verbose --tap=cxitest-mr_cache_test.tap --filter=\"mr_cache/*\" -j 1") + long_test_suite=( "basic_test" "swget_test" @@ -174,6 +176,7 @@ long_test_suite=( "fork_safe_memhooks_test" "fork_safe_kdreg2_test" "unlimited_triggered_ops_test" + "mr_cache_test" ) # ################################################################ diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 4963fe404e3..a5c2842d389 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -47,10 +47,10 @@ _efa_files = \ prov/efa/src/efa_prov.c \ prov/efa/src/efa_env.c \ prov/efa/src/efa_cntr.c \ - prov/efa/src/dgram/efa_dgram_ep.c \ - prov/efa/src/dgram/efa_dgram_cq.c \ - prov/efa/src/dgram/efa_dgram_msg.c \ - prov/efa/src/dgram/efa_dgram_rma.c \ + prov/efa/src/efa_msg.c \ + prov/efa/src/efa_rma.c \ + prov/efa/src/efa_cq.c \ + prov/efa/src/efa_ep.c \ prov/efa/src/rdm/efa_rdm_peer.c \ prov/efa/src/rdm/efa_rdm_cq.c \ prov/efa/src/rdm/efa_rdm_ep_utils.c \ @@ -94,8 +94,6 @@ _efa_headers = \ prov/efa/src/efa_prov.h \ prov/efa/src/efa_env.h \ prov/efa/src/fi_ext_efa.h \ - prov/efa/src/dgram/efa_dgram_ep.h \ - prov/efa/src/dgram/efa_dgram_cq.h \ prov/efa/src/rdm/efa_rdm_peer.h \ prov/efa/src/rdm/efa_rdm_cq.h \ prov/efa/src/rdm/efa_rdm_ep.h \ @@ -148,7 +146,9 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \ prov/efa/test/efa_unit_test_runt.c \ prov/efa/test/efa_unit_test_mr.c \ prov/efa/test/efa_unit_test_rdm_peer.c \ - prov/efa/test/efa_unit_test_pke.c + prov/efa/test/efa_unit_test_pke.c \ + prov/efa/test/efa_unit_test_msg.c \ + prov/efa/test/efa_unit_test_rma.c efa_CPPFLAGS += -I$(top_srcdir)/include -I$(top_srcdir)/prov/efa/test $(cmocka_CPPFLAGS) @@ -161,7 +161,8 @@ prov_efa_test_efa_unit_test_LDFLAGS = $(cmocka_rpath) $(efa_LDFLAGS) $(cmocka_LD 
FLAGS) \
	-Wl,--wrap=efadv_query_device \
	-Wl,--wrap=ofi_cudaMalloc \
	-Wl,--wrap=ofi_copy_from_hmem_iov \
-	-Wl,--wrap=efa_rdm_pke_read
+	-Wl,--wrap=efa_rdm_pke_read \
+	-Wl,--wrap=efa_device_support_unsolicited_write_recv
 
 if HAVE_EFADV_CQ_EX
 prov_efa_test_efa_unit_test_LDFLAGS += -Wl,--wrap=efadv_create_cq
@@ -185,7 +186,6 @@ endif ENABLE_EFA_UNIT_TEST
 efa_CPPFLAGS += \
 	-I$(top_srcdir)/prov/efa/src/ \
-	-I$(top_srcdir)/prov/efa/src/dgram/ \
 	-I$(top_srcdir)/prov/efa/src/rdm/
 
 rdmainclude_HEADERS += \
diff --git a/prov/efa/configure.m4 b/prov/efa/configure.m4
index f807ce9bc51..71152f72ed4 100644
--- a/prov/efa/configure.m4
+++ b/prov/efa/configure.m4
@@ -77,6 +77,7 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
 	efadv_support_extended_cq=0
 	have_efa_dmabuf_mr=0
 	have_efadv_query_mr=0
+	have_efadv_sl=0
 
 	dnl $have_neuron is defined at top-level configure.ac
 	AM_CONDITIONAL([HAVE_NEURON], [ test x"$have_neuron" = x1 ])
@@ -159,6 +160,11 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
 			[], [have_efadv_query_mr=0],
 			[[#include ]])
+
+		AC_CHECK_MEMBER(struct efadv_qp_init_attr.sl,
+			[have_efadv_sl=1],
+			[have_efadv_sl=0],
+			[[#include ]])
 	])
 
 	AC_DEFINE_UNQUOTED([HAVE_RDMA_SIZE],
@@ -188,6 +194,9 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
 	AC_DEFINE_UNQUOTED([HAVE_EFADV_QUERY_MR],
 		[$have_efadv_query_mr],
 		[Indicates if efadv_query_mr verbs is available])
+	AC_DEFINE_UNQUOTED([HAVE_EFADV_SL],
+		[$have_efadv_sl],
+		[Indicates if efadv_qp_init_attr has sl])
 
 	CPPFLAGS=$save_CPPFLAGS
diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md
index 9f0b457a1bf..1877156779b 100644
--- a/prov/efa/docs/efa_rdm_protocol_v4.md
+++ b/prov/efa/docs/efa_rdm_protocol_v4.md
@@ -68,6 +68,12 @@ Chapter 4 "extra features/requests" describes the extra features/requests define
 
 * Section 4.6 describe the extra feature: RDMA-Write based message transfer.
 
+* Section 4.7 describes the extra feature: Long read and runting read nack protocol.
+
+* Section 4.8 describes the extra feature: User receive QP.
+
+* Section 4.9 describes the extra feature: Unsolicited write recv.
+
 Chapter 5 "What's not covered?" describes the contents that are intentionally
 left out of this document because they are considered "implementation details".
 
@@ -323,6 +329,7 @@ Table: 2.1 a list of extra features/requests
 | 5 | RDMA-Write based data transfer | extra feature | libfabric 1.18.0 | Section 4.6 |
 | 6 | Read nack packets | extra feature | libfabric 1.20.0 | Section 4.7 |
 | 7 | User recv QP | extra feature & request| libfabric 1.22.0 | Section 4.8 |
+| 8 | Unsolicited write recv | extra feature | libfabric 1.22.0 | Section 4.9 |
 
 How does protocol v4 maintain backward compatibility when extra features/requests are introduced?
 
@@ -414,7 +421,7 @@ Note, the field `extra_info` was named `features` when protocol v4 was initially
 only planned for extra features. Later, we discovered that the handshake subprotocol
 can also be used to pass additional request information, thus introduced the
 concept of "extra request" and renamed this field `extra_info`.
 
-`nextra_p3` is number of `extra_info` flags of the endpoint plus 3. The "plus 3" is for historical reasons.
+`nextra_p3` is the number of 64-bit `extra_info` elements of the endpoint plus 3. The "plus 3" is for historical reasons.
 When protocol v4 was initially introduced, this field is named `maxproto`.
 The original plan was that protocol v4 can only have 64 extra features/requests.
 If the number of extra feature/request ever exceeds 64, the next feature/request
 will be defined as version 5 feature/request, (version 6 if the number exceeds 128, so on so
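A worked example of that arithmetic (hypothetical snippet, with max_id standing for the endpoint's highest extra feature/request ID; not part of the handshake code): any ID below 64 fits in a single 64-bit extra_info element, giving nextra_p3 = 1 + 3 = 4.

	/* 64-bit words needed to carry feature/request bits 0..max_id */
	int nextra = max_id / 64 + 1;
	int nextra_p3 = nextra + 3;	/* the historical "plus 3" */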
@@ -1505,8 +1512,9 @@ in order to support CQ entry generation in case the sender uses
 
 ### 4.7 Long read and runting read nack protocol
 
 Long read and runting read protocols in Libfabric 1.20 and above use a nack protocol
-when the receiver is unable to register a memory region for the RDMA read operation.
-Failure to register the memory region is typically because of a hardware limitation.
+when the receiver is unable to register a memory region for the RDMA read operation
+or P2P support is unavailable for the RDMA read operation, typically because of a
+hardware limitation.
 
 Table: 4.2 Format of the READ_NACK packet
 
@@ -1521,12 +1529,14 @@ Table: 4.2 Format of the READ_NACK packet
 
 The nack protocols work as follows
 
 * Sender has decided to use the long read or runting read protocol
-* The receiver receives the RTM packet(s)
+* The receiver receives the RTM packet(s) or RTW packet
   - One LONGREAD_RTM packet in case of long read protocol
   - Multiple RUNTREAD_RTM packets in case of runting read protocol
-* The receiver attempts to register a memory region for the RDMA operation but fails
-* After all RTM packets have been processed, the receiver sends a READ_NACK packet to the sender
-* The sender then switches to the long CTS protocol and sends a LONGCTS_RTM packet
+  - One LONGREAD_RTW packet in case of emulated long-read write protocol
+* The receiver attempts to register a memory region for the RDMA operation but fails,
+or P2P is unavailable for the RDMA operation
+* After all RTM/RTW packets have been processed, the receiver sends a READ_NACK packet to the sender
+* The sender then switches to the long CTS protocol and sends a LONGCTS_RTM/LONGCTS_RTW packet
 * The receiver sends a CTS packet and the data transfer continues as in the long CTS protocol
 
 The LONGCTS_RTM packet sent in the nack protocol does not contain any application data.
@@ -1608,6 +1618,17 @@ zero-copy receive mode. If a receiver gets RTM packets delivered to its default
 QP, it raises an error because it requests all RTM packets must be delivered
 to its user recv QP.
 
+### 4.9 Unsolicited write recv
+
+"Unsolicited write recv" is an extra feature introduced in libfabric 1.22.0.
+When this feature is on, rdma-write with immediate data will not consume an
+rx buffer on the responder side. It is defined as an extra feature because a
+set of requirements (firmware, EFA kernel module and rdma-core) must be met
+before an endpoint can use the unsolicited write recv capability, therefore
+an endpoint cannot assume the other party supports it. Rdma-write with
+immediate data cannot be issued if the local endpoint and its peer disagree
+on this feature.
+
 ## 5. What's not covered?
 
 The purpose of this document is to define the communication protocol. Therefore, it is intentionally written
diff --git a/prov/efa/src/dgram/efa_dgram_cq.c b/prov/efa/src/dgram/efa_dgram_cq.c
deleted file mode 100644
index d046549bd66..00000000000
--- a/prov/efa/src/dgram/efa_dgram_cq.c
+++ /dev/null
@@ -1,339 +0,0 @@
-/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
-/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */
-/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved.
*/ - -#include -#include -#include "config.h" -#include -#include "dgram/efa_dgram_ep.h" -#include "efa.h" -#include "efa_cq.h" -#include "efa_av.h" -#include "efa_dgram_cq.h" -#include - -struct efa_wc { - struct ibv_wc ibv_wc; - /* Source address */ - uint16_t efa_ah; -}; - -struct efa_wce { - struct slist_entry entry; - struct efa_wc wc; -}; - -#define EFA_WCE_CNT 1024 - -static inline uint64_t efa_dgram_cq_opcode_to_fi_flags(enum ibv_wc_opcode opcode) { - switch (opcode) { - case IBV_WC_SEND: - return FI_SEND | FI_MSG; - case IBV_WC_RECV: - return FI_RECV | FI_MSG; - default: - assert(0); - return 0; - } -} - -static inline uint32_t efa_dgram_cq_api_version(struct efa_dgram_cq *cq) { - return cq->domain->fabric->util_fabric.fabric_fid.api_version; -} - -ssize_t efa_dgram_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, - uint64_t flags) -{ - struct efa_dgram_cq *cq; - uint32_t api_version; - - cq = container_of(cq_fid, struct efa_dgram_cq, util_cq.cq_fid); - - ofi_spin_lock(&cq->lock); - - if (!cq->ibv_cq_ex->status) - goto err; - - api_version = efa_dgram_cq_api_version(cq); - - entry->op_context = (void *)(uintptr_t)cq->ibv_cq_ex->wr_id; - entry->flags = efa_dgram_cq_opcode_to_fi_flags(ibv_wc_read_opcode(cq->ibv_cq_ex)); - entry->err = FI_EIO; - entry->prov_errno = ibv_wc_read_vendor_err(cq->ibv_cq_ex); - EFA_WARN(FI_LOG_CQ, "Work completion status: %s\n", efa_strerror(entry->prov_errno)); - - ofi_spin_unlock(&cq->lock); - - /* We currently don't have err_data to give back to the user. */ - if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) - entry->err_data_size = 0; - - return sizeof(*entry); -err: - ofi_spin_unlock(&cq->lock); - return -FI_EAGAIN; -} - -static void efa_dgram_cq_read_context_entry(struct ibv_cq_ex *ibv_cqx, int i, void *buf) -{ - struct fi_cq_entry *entry = buf; - - entry[i].op_context = (void *)ibv_cqx->wr_id; -} - -static void efa_dgram_cq_read_msg_entry(struct ibv_cq_ex *ibv_cqx, int i, void *buf) -{ - struct fi_cq_msg_entry *entry = buf; - - entry[i].op_context = (void *)(uintptr_t)ibv_cqx->wr_id; - entry[i].flags = efa_dgram_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); - entry[i].len = ibv_wc_read_byte_len(ibv_cqx); -} - -static void efa_dgram_cq_read_data_entry(struct ibv_cq_ex *ibv_cqx, int i, void *buf) -{ - struct fi_cq_data_entry *entry = buf; - - entry[i].op_context = (void *)ibv_cqx->wr_id; - entry[i].flags = efa_dgram_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); - entry[i].data = 0; - entry[i].len = ibv_wc_read_byte_len(ibv_cqx); -} - -/** - * @brief Convert an error code from CQ poll API, e.g. `ibv_start_poll`, `ibv_end_poll`. - * The returned error code must be 0 (success) or negative (error). - * As a special case, if input error code is ENOENT (there was no item on CQ), we should return -FI_EAGAIN. - * @param[in] err Return value from `ibv_start_poll` or `ibv_end_poll` - * @returns Converted error code - */ -static inline ssize_t efa_dgram_cq_ibv_poll_error_to_fi_error(ssize_t err) { - if (err == ENOENT) { - return -FI_EAGAIN; - } - - if (err > 0) { - return -err; - } - - return err; -} - -ssize_t efa_dgram_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, - fi_addr_t *src_addr) -{ - bool should_end_poll = false; - struct efa_dgram_cq *cq; - struct efa_av *av; - ssize_t err = 0; - size_t num_cqe = 0; /* Count of read entries */ - uint32_t qp_num, src_qp, slid; - - /* Initialize an empty ibv_poll_cq_attr struct for ibv_start_poll. - * EFA expects .comp_mask = 0, or otherwise returns EINVAL. 
- */ - struct ibv_poll_cq_attr poll_cq_attr = {.comp_mask = 0}; - - cq = container_of(cq_fid, struct efa_dgram_cq, util_cq.cq_fid); - - ofi_spin_lock(&cq->lock); - - /* Call ibv_start_poll only once regardless of count == 0 */ - err = ibv_start_poll(cq->ibv_cq_ex, &poll_cq_attr); - should_end_poll = !err; - - while (!err && num_cqe < count) { - if (cq->ibv_cq_ex->status) { - err = -FI_EAVAIL; - break; - } - - if (src_addr) { - qp_num = ibv_wc_read_qp_num(cq->ibv_cq_ex); - src_qp = ibv_wc_read_src_qp(cq->ibv_cq_ex); - slid = ibv_wc_read_slid(cq->ibv_cq_ex); - av = cq->domain->qp_table[qp_num & cq->domain->qp_table_sz_m1]->base_ep->av; - - src_addr[num_cqe] = efa_av_reverse_lookup_dgram(av, slid, src_qp); - } - - cq->read_entry(cq->ibv_cq_ex, num_cqe, buf); - num_cqe++; - - err = ibv_next_poll(cq->ibv_cq_ex); - } - - err = efa_dgram_cq_ibv_poll_error_to_fi_error(err); - - if (should_end_poll) - ibv_end_poll(cq->ibv_cq_ex); - - ofi_spin_unlock(&cq->lock); - - return num_cqe ? num_cqe : err; -} - -static const char *efa_dgram_cq_strerror(struct fid_cq *cq_fid, - int prov_errno, - const void *err_data, - char *buf, size_t len) -{ - return err_data - ? (const char *) err_data - : efa_strerror(prov_errno); -} - -static struct fi_ops_cq efa_dgram_cq_ops = { - .size = sizeof(struct fi_ops_cq), - .read = ofi_cq_read, - .readfrom = ofi_cq_readfrom, - .readerr = ofi_cq_readerr, - .sread = fi_no_cq_sread, - .sreadfrom = fi_no_cq_sreadfrom, - .signal = fi_no_cq_signal, - .strerror = efa_dgram_cq_strerror -}; - -static int efa_dgram_cq_control(fid_t fid, int command, void *arg) -{ - int ret = 0; - - switch (command) { - default: - ret = -FI_ENOSYS; - break; - } - - return ret; -} - -static int efa_dgram_cq_close(fid_t fid) -{ - struct efa_dgram_cq *cq; - int ret; - - cq = container_of(fid, struct efa_dgram_cq, util_cq.cq_fid.fid); - - ofi_bufpool_destroy(cq->wce_pool); - - ofi_spin_destroy(&cq->lock); - - ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq_ex)); - if (ret) - return ret; - - ret = ofi_cq_cleanup(&cq->util_cq); - if (ret) - return ret; - - free(cq); - - return 0; -} - -static struct fi_ops efa_dgram_cq_fi_ops = { - .size = sizeof(struct fi_ops), - .close = efa_dgram_cq_close, - .bind = fi_no_bind, - .control = efa_dgram_cq_control, - .ops_open = fi_no_ops_open, -}; - -/** - * @brief Create and set cq->ibv_cq_ex - * - * @param[in] cq Pointer to the efa_dgram_cq. cq->ibv_cq_ex must be NULL. - * @param[in] attr Pointer to fi_cq_attr. - * @param[out] Return code = 0 if successful, or negative otherwise. 
- */ -static inline int efa_dgram_cq_set_ibv_cq_ex(struct efa_dgram_cq *cq, struct fi_cq_attr *attr) -{ - enum ibv_cq_ex_type ibv_cq_ex_type; - - if (cq->ibv_cq_ex) { - EFA_WARN(FI_LOG_CQ, "CQ already has attached ibv_cq_ex\n"); - return -FI_EALREADY; - } - - return efa_cq_ibv_cq_ex_open(attr, cq->domain->device->ibv_ctx, - &cq->ibv_cq_ex, &ibv_cq_ex_type); -} - -int efa_dgram_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, - struct fid_cq **cq_fid, void *context) -{ - struct efa_dgram_cq *cq; - int err; - - if (attr->wait_obj != FI_WAIT_NONE) - return -FI_ENOSYS; - - cq = calloc(1, sizeof(*cq)); - if (!cq) - return -FI_ENOMEM; - - err = ofi_cq_init(&efa_prov, domain_fid, attr, &cq->util_cq, - &ofi_cq_progress, context); - if (err) { - EFA_WARN(FI_LOG_CQ, "Unable to create UTIL_CQ\n"); - goto err_free_cq; - } - - cq->domain = container_of(domain_fid, struct efa_domain, - util_domain.domain_fid); - - err = efa_dgram_cq_set_ibv_cq_ex(cq, attr); - if (err) { - EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ\n"); - err = -FI_EINVAL; - goto err_free_util_cq; - } - - err = ofi_bufpool_create(&cq->wce_pool, sizeof(struct efa_wce), 16, 0, - EFA_WCE_CNT, 0); - if (err) { - EFA_WARN(FI_LOG_CQ, "Failed to create wce_pool\n"); - goto err_destroy_cq; - } - - switch (attr->format) { - case FI_CQ_FORMAT_UNSPEC: - case FI_CQ_FORMAT_CONTEXT: - cq->read_entry = efa_dgram_cq_read_context_entry; - cq->entry_size = sizeof(struct fi_cq_entry); - break; - case FI_CQ_FORMAT_MSG: - cq->read_entry = efa_dgram_cq_read_msg_entry; - cq->entry_size = sizeof(struct fi_cq_msg_entry); - break; - case FI_CQ_FORMAT_DATA: - cq->read_entry = efa_dgram_cq_read_data_entry; - cq->entry_size = sizeof(struct fi_cq_data_entry); - break; - case FI_CQ_FORMAT_TAGGED: - default: - err = -FI_ENOSYS; - goto err_destroy_pool; - } - - ofi_spin_init(&cq->lock); - - *cq_fid = &cq->util_cq.cq_fid; - (*cq_fid)->fid.fclass = FI_CLASS_CQ; - (*cq_fid)->fid.context = context; - (*cq_fid)->fid.ops = &efa_dgram_cq_fi_ops; - (*cq_fid)->ops = &efa_dgram_cq_ops; - - return 0; - -err_destroy_pool: - ofi_bufpool_destroy(cq->wce_pool); -err_destroy_cq: - ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq_ex)); -err_free_util_cq: - ofi_cq_cleanup(&cq->util_cq); -err_free_cq: - free(cq); - return err; -} diff --git a/prov/efa/src/dgram/efa_dgram_cq.h b/prov/efa/src/dgram/efa_dgram_cq.h deleted file mode 100644 index fbb986d3f72..00000000000 --- a/prov/efa/src/dgram/efa_dgram_cq.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ - -#ifndef EFA_DGRAM_CQ_H -#define EFA_DGRAM_CQ_H - -typedef void (*efa_dgram_cq_read_entry)(struct ibv_cq_ex *ibv_cqx, int index, void *buf); - -struct efa_dgram_cq { - struct util_cq util_cq; - struct efa_domain *domain; - size_t entry_size; - efa_dgram_cq_read_entry read_entry; - ofi_spin_t lock; - struct ofi_bufpool *wce_pool; - uint32_t flags; /* User defined capability mask */ - - struct ibv_cq_ex *ibv_cq_ex; -}; - -int efa_dgram_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, - struct fid_cq **cq_fid, void *context); - -ssize_t efa_dgram_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_t *src_addr); - -ssize_t efa_dgram_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, uint64_t flags); - -#endif \ No newline at end of file diff --git a/prov/efa/src/dgram/efa_dgram_ep.c b/prov/efa/src/dgram/efa_dgram_ep.c deleted file mode 100644 index 4f43807035a..00000000000 --- a/prov/efa/src/dgram/efa_dgram_ep.c +++ /dev/null @@ -1,476 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include "config.h" -#include "efa_dgram_ep.h" -#include "efa_dgram_cq.h" -#include "efa.h" -#include "efa_av.h" - -#include -#define efa_dgram_cq_PROGRESS_ENTRIES 500 - -static int efa_dgram_ep_getopt(fid_t fid, int level, int optname, - void *optval, size_t *optlen) -{ - switch (level) { - case FI_OPT_ENDPOINT: - return -FI_ENOPROTOOPT; - default: - return -FI_ENOPROTOOPT; - } - return 0; -} - -static int efa_dgram_ep_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) -{ - switch (level) { - case FI_OPT_ENDPOINT: - return -FI_ENOPROTOOPT; - default: - return -FI_ENOPROTOOPT; - } - return 0; -} - -static struct fi_ops_ep efa_dgram_ep_base_ops = { - .size = sizeof(struct fi_ops_ep), - .cancel = fi_no_cancel, - .getopt = efa_dgram_ep_getopt, - .setopt = efa_dgram_ep_setopt, - .tx_ctx = fi_no_tx_ctx, - .rx_ctx = fi_no_rx_ctx, - .rx_size_left = fi_no_rx_size_left, - .tx_size_left = fi_no_tx_size_left, -}; - -static void efa_dgram_ep_destroy(struct efa_dgram_ep *ep) -{ - int ret; - - ret = efa_base_ep_destruct(&ep->base_ep); - if (ret) { - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close base endpoint\n"); - } - - free(ep); -} - -static int efa_dgram_ep_close(fid_t fid) -{ - struct efa_dgram_ep *ep; - - ep = container_of(fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid.fid); - - ofi_bufpool_destroy(ep->recv_wr_pool); - ofi_bufpool_destroy(ep->send_wr_pool); - efa_dgram_ep_destroy(ep); - - return 0; -} - -static int efa_dgram_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - struct efa_dgram_ep *ep; - struct efa_dgram_cq *cq; - struct efa_av *av; - struct util_eq *eq; - struct util_cntr *cntr; - int ret; - - ep = container_of(fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid.fid); - ret = ofi_ep_bind_valid(&efa_prov, bfid, flags); - if (ret) - return ret; - - switch (bfid->fclass) { - case FI_CLASS_CQ: - if (flags & FI_SELECTIVE_COMPLETION) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint cannot be bound with selective completion.\n"); - return -FI_EBADFLAGS; - } - - /* Must bind a CQ to either RECV or SEND completions */ - if (!(flags & (FI_RECV | FI_TRANSMIT))) - return -FI_EBADFLAGS; - - cq = container_of(bfid, struct efa_dgram_cq, util_cq.cq_fid); - if (ep->base_ep.domain != cq->domain) - return -FI_EINVAL; - - ret = 
ofi_ep_bind_cq(&ep->base_ep.util_ep, &cq->util_cq, flags); - if (ret) - return ret; - - if (flags & FI_RECV) { - if (ep->rcq) - return -EINVAL; - ep->rcq = cq; - } - if (flags & FI_TRANSMIT) { - if (ep->scq) - return -EINVAL; - ep->scq = cq; - } - break; - case FI_CLASS_AV: - av = container_of(bfid, struct efa_av, util_av.av_fid.fid); - ret = efa_base_ep_bind_av(&ep->base_ep, av); - if (ret) - return ret; - break; - case FI_CLASS_CNTR: - cntr = container_of(bfid, struct util_cntr, cntr_fid.fid); - - ret = ofi_ep_bind_cntr(&ep->base_ep.util_ep, cntr, flags); - if (ret) - return ret; - break; - case FI_CLASS_EQ: - eq = container_of(bfid, struct util_eq, eq_fid.fid); - - ret = ofi_ep_bind_eq(&ep->base_ep.util_ep, eq); - if (ret) - return ret; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int efa_dgram_ep_getflags(struct fid_ep *ep_fid, uint64_t *flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_tx_attr *tx_attr = ep->base_ep.info->tx_attr; - struct fi_rx_attr *rx_attr = ep->base_ep.info->rx_attr; - - if ((*flags & FI_TRANSMIT) && (*flags & FI_RECV)) { - EFA_WARN(FI_LOG_EP_CTRL, "Both Tx/Rx flags cannot be specified\n"); - return -FI_EINVAL; - } else if (tx_attr && (*flags & FI_TRANSMIT)) { - *flags = tx_attr->op_flags; - } else if (rx_attr && (*flags & FI_RECV)) { - *flags = rx_attr->op_flags; - } else { - EFA_WARN(FI_LOG_EP_CTRL, "Tx/Rx flags not specified\n"); - return -FI_EINVAL; - } - return 0; -} - -static int efa_dgram_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_tx_attr *tx_attr = ep->base_ep.info->tx_attr; - struct fi_rx_attr *rx_attr = ep->base_ep.info->rx_attr; - - if ((flags & FI_TRANSMIT) && (flags & FI_RECV)) { - EFA_WARN(FI_LOG_EP_CTRL, "Both Tx/Rx flags cannot be specified.\n"); - return -FI_EINVAL; - } else if (tx_attr && (flags & FI_TRANSMIT)) { - tx_attr->op_flags = flags; - tx_attr->op_flags &= ~FI_TRANSMIT; - } else if (rx_attr && (flags & FI_RECV)) { - rx_attr->op_flags = flags; - rx_attr->op_flags &= ~FI_RECV; - } else { - EFA_WARN(FI_LOG_EP_CTRL, "Tx/Rx flags not specified\n"); - return -FI_EINVAL; - } - - return 0; -} - -static int efa_dgram_ep_enable(struct fid_ep *ep_fid) -{ - struct ibv_qp_init_attr_ex attr_ex = { 0 }; - struct ibv_pd *ibv_pd; - struct efa_dgram_ep *ep; - int err; - - ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - - if (!ep->scq && !ep->rcq) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send or receive completion queue\n"); - return -FI_ENOCQ; - } - - if (!ep->scq && ofi_send_allowed(ep->base_ep.info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); - return -FI_ENOCQ; - } - - if (!ep->rcq && ofi_recv_allowed(ep->base_ep.info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled. 
(FI_RECV)\n"); - return -FI_ENOCQ; - } - - if (ep->scq) { - attr_ex.cap.max_send_wr = ep->base_ep.info->tx_attr->size; - attr_ex.cap.max_send_sge = ep->base_ep.info->tx_attr->iov_limit; - attr_ex.send_cq = ibv_cq_ex_to_cq(ep->scq->ibv_cq_ex); - ibv_pd = ep->scq->domain->ibv_pd; - } else { - attr_ex.send_cq = ibv_cq_ex_to_cq(ep->rcq->ibv_cq_ex); - ibv_pd = ep->rcq->domain->ibv_pd; - } - - if (ep->rcq) { - attr_ex.cap.max_recv_wr = ep->base_ep.info->rx_attr->size; - attr_ex.cap.max_recv_sge = ep->base_ep.info->rx_attr->iov_limit; - attr_ex.recv_cq = ibv_cq_ex_to_cq(ep->rcq->ibv_cq_ex); - } else { - attr_ex.recv_cq = ibv_cq_ex_to_cq(ep->scq->ibv_cq_ex); - } - - attr_ex.cap.max_inline_data = - ep->base_ep.domain->device->efa_attr.inline_buf_size; - - assert(EFA_EP_TYPE_IS_DGRAM(ep->base_ep.domain->info)); - attr_ex.qp_type = IBV_QPT_UD; - attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; - attr_ex.pd = ibv_pd; - - attr_ex.qp_context = ep; - attr_ex.sq_sig_all = 1; - - err = efa_base_ep_create_qp(&ep->base_ep, &attr_ex); - if (err) - return err; - - return efa_base_ep_enable(&ep->base_ep); -} - -static int efa_dgram_ep_control(struct fid *fid, int command, void *arg) -{ - struct fid_ep *ep_fid; - - switch (fid->fclass) { - case FI_CLASS_EP: - ep_fid = container_of(fid, struct fid_ep, fid); - switch (command) { - case FI_GETOPSFLAG: - return efa_dgram_ep_getflags(ep_fid, (uint64_t *)arg); - case FI_SETOPSFLAG: - return efa_dgram_ep_setflags(ep_fid, *(uint64_t *)arg); - case FI_ENABLE: - return efa_dgram_ep_enable(ep_fid); - default: - return -FI_ENOSYS; - } - break; - default: - return -FI_ENOSYS; - } -} - -static struct fi_ops efa_dgram_ep_ops = { - .size = sizeof(struct fi_ops), - .close = efa_dgram_ep_close, - .bind = efa_dgram_ep_bind, - .control = efa_dgram_ep_control, - .ops_open = fi_no_ops_open, -}; - -static void efa_dgram_ep_progress_internal(struct efa_dgram_ep *ep, struct efa_dgram_cq *efa_dgram_cq) -{ - struct util_cq *cq; - struct fi_cq_tagged_entry cq_entry[efa_dgram_cq_PROGRESS_ENTRIES] = {0}; - struct fi_cq_tagged_entry *temp_cq_entry; - struct fi_cq_err_entry cq_err_entry = {0}; - fi_addr_t src_addr[efa_dgram_cq_PROGRESS_ENTRIES]; - uint64_t flags; - int i; - ssize_t ret, err; - - cq = &efa_dgram_cq->util_cq; - flags = ep->base_ep.util_ep.caps; - - VALGRIND_MAKE_MEM_DEFINED(&cq_entry, sizeof(cq_entry)); - - ret = efa_dgram_cq_readfrom(&cq->cq_fid, cq_entry, efa_dgram_cq_PROGRESS_ENTRIES, - (flags & FI_SOURCE) ? src_addr : NULL); - if (ret == -FI_EAGAIN) - return; - - if (OFI_UNLIKELY(ret < 0)) { - if (OFI_UNLIKELY(ret != -FI_EAVAIL)) { - EFA_WARN(FI_LOG_CQ, "no error available errno: %ld\n", ret); - efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_DGRAM_CQ_READ); - return; - } - - err = efa_dgram_cq_readerr(&cq->cq_fid, &cq_err_entry, flags); - if (OFI_UNLIKELY(err < 0)) { - EFA_WARN(FI_LOG_CQ, "unable to read error entry errno: %ld\n", err); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, cq_err_entry.prov_errno); - return; - } - - ofi_cq_write_error(cq, &cq_err_entry); - return; - } - - temp_cq_entry = (struct fi_cq_tagged_entry *)cq_entry; - for (i = 0; i < ret; i++) { - (flags & FI_SOURCE) ? 
- ofi_cq_write_src(cq, temp_cq_entry->op_context, - temp_cq_entry->flags, - temp_cq_entry->len, - temp_cq_entry->buf, - temp_cq_entry->data, - temp_cq_entry->tag, - src_addr[i]) : - ofi_cq_write(cq, temp_cq_entry->op_context, - temp_cq_entry->flags, - temp_cq_entry->len, - temp_cq_entry->buf, - temp_cq_entry->data, - temp_cq_entry->tag); - - temp_cq_entry = (struct fi_cq_tagged_entry *) - ((uint8_t *)temp_cq_entry + efa_dgram_cq->entry_size); - } - return; -} - -void efa_dgram_ep_progress(struct util_ep *ep) -{ - struct efa_dgram_ep *efa_dgram_ep; - struct efa_dgram_cq *rcq; - struct efa_dgram_cq *scq; - - efa_dgram_ep = container_of(ep, struct efa_dgram_ep, base_ep.util_ep); - rcq = efa_dgram_ep->rcq; - scq = efa_dgram_ep->scq; - - ofi_genlock_lock(&ep->lock); - - if (rcq) - efa_dgram_ep_progress_internal(efa_dgram_ep, rcq); - - if (scq && scq != rcq) - efa_dgram_ep_progress_internal(efa_dgram_ep, scq); - - ofi_genlock_unlock(&ep->lock); -} - -static struct fi_ops_atomic efa_dgram_ep_atomic_ops = { - .size = sizeof(struct fi_ops_atomic), - .write = fi_no_atomic_write, - .writev = fi_no_atomic_writev, - .writemsg = fi_no_atomic_writemsg, - .inject = fi_no_atomic_inject, - .readwrite = fi_no_atomic_readwrite, - .readwritev = fi_no_atomic_readwritev, - .readwritemsg = fi_no_atomic_readwritemsg, - .compwrite = fi_no_atomic_compwrite, - .compwritev = fi_no_atomic_compwritev, - .compwritemsg = fi_no_atomic_compwritemsg, - .writevalid = fi_no_atomic_writevalid, - .readwritevalid = fi_no_atomic_readwritevalid, - .compwritevalid = fi_no_atomic_compwritevalid, -}; - -struct fi_ops_cm efa_dgram_ep_cm_ops = { - .size = sizeof(struct fi_ops_cm), - .setname = fi_no_setname, - .getname = efa_base_ep_getname, - .getpeer = fi_no_getpeer, - .connect = fi_no_connect, - .listen = fi_no_listen, - .accept = fi_no_accept, - .reject = fi_no_reject, - .shutdown = fi_no_shutdown, - .join = fi_no_join, -}; - -int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, - struct fid_ep **ep_fid, void *context) -{ - struct efa_domain *domain; - const struct fi_info *prov_info; - struct efa_dgram_ep *ep; - int ret; - - domain = container_of(domain_fid, struct efa_domain, - util_domain.domain_fid); - - if (!user_info || !user_info->ep_attr || !user_info->domain_attr || - strncmp(domain->device->ibv_ctx->device->name, user_info->domain_attr->name, - strlen(domain->device->ibv_ctx->device->name))) { - EFA_INFO(FI_LOG_DOMAIN, "Invalid info->domain_attr->name\n"); - return -FI_EINVAL; - } - - prov_info = efa_domain_get_prov_info(domain, user_info->ep_attr->type); - assert(prov_info); - - assert(user_info->ep_attr); - ret = ofi_check_ep_attr(&efa_util_prov, user_info->fabric_attr->api_version, prov_info, user_info); - if (ret) - return ret; - - if (user_info->tx_attr) { - ret = ofi_check_tx_attr(&efa_prov, prov_info->tx_attr, - user_info->tx_attr, user_info->mode); - if (ret) - return ret; - } - - if (user_info->rx_attr) { - ret = ofi_check_rx_attr(&efa_prov, prov_info, user_info->rx_attr, user_info->mode); - if (ret) - return ret; - } - - ep = calloc(1, sizeof(*ep)); - if (!ep) - return -FI_ENOMEM; - - ret = efa_base_ep_construct(&ep->base_ep, domain_fid, user_info, efa_dgram_ep_progress, context); - if (ret) - goto err_ep_destroy; - - /* struct efa_send_wr and efa_recv_wr allocates memory for 2 IOV - * So check with an assert statement that iov_limit is 2 or less - */ - assert(user_info->tx_attr->iov_limit <= 2); - - ret = ofi_bufpool_create(&ep->send_wr_pool, - sizeof(struct efa_send_wr), 16, 0, 1024, 
0); - if (ret) - goto err_ep_destroy; - - ret = ofi_bufpool_create(&ep->recv_wr_pool, - sizeof(struct efa_recv_wr), 16, 0, 1024, 0); - if (ret) - goto err_send_wr_destroy; - - ep->base_ep.domain = domain; - - *ep_fid = &ep->base_ep.util_ep.ep_fid; - (*ep_fid)->fid.fclass = FI_CLASS_EP; - (*ep_fid)->fid.context = context; - (*ep_fid)->fid.ops = &efa_dgram_ep_ops; - (*ep_fid)->ops = &efa_dgram_ep_base_ops; - (*ep_fid)->msg = &efa_dgram_ep_msg_ops; - (*ep_fid)->cm = &efa_dgram_ep_cm_ops; - (*ep_fid)->rma = &efa_dgram_ep_rma_ops; - (*ep_fid)->atomic = &efa_dgram_ep_atomic_ops; - - return 0; - -err_send_wr_destroy: - ofi_bufpool_destroy(ep->send_wr_pool); -err_ep_destroy: - efa_dgram_ep_destroy(ep); - return ret; -} diff --git a/prov/efa/src/dgram/efa_dgram_ep.h b/prov/efa/src/dgram/efa_dgram_ep.h deleted file mode 100644 index ecc8f1772dd..00000000000 --- a/prov/efa/src/dgram/efa_dgram_ep.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include "efa_base_ep.h" - -#ifndef EFA_DGRAM_H -#define EFA_DGRAM_H - -struct efa_dgram_ep { - struct efa_base_ep base_ep; - - struct efa_dgram_cq *rcq; - struct efa_dgram_cq *scq; - - struct ofi_bufpool *send_wr_pool; - struct ofi_bufpool *recv_wr_pool; -}; - -struct efa_send_wr { - /** @brief Work request struct used by rdma-core */ - struct ibv_send_wr wr; - - /** @brief Scatter gather element array - * - * @details - * EFA device supports a maximum of 2 iov/SGE - */ - struct ibv_sge sge[2]; -}; - - -int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *info, - struct fid_ep **ep_fid, void *context); - -extern struct fi_ops_msg efa_dgram_ep_msg_ops; -extern struct fi_ops_rma efa_dgram_ep_rma_ops; -#endif diff --git a/prov/efa/src/dgram/efa_dgram_msg.c b/prov/efa/src/dgram/efa_dgram_msg.c deleted file mode 100644 index f8a5010daf9..00000000000 --- a/prov/efa/src/dgram/efa_dgram_msg.c +++ /dev/null @@ -1,445 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include "config.h" - - -#include "ofi.h" -#include "ofi_enosys.h" -#include "ofi_iov.h" - -#include "efa_dgram_ep.h" -#include "efa.h" -#include "efa_av.h" - -#include "efa_tp.h" - -#define EFA_SETUP_IOV(iov, buf, len) \ - do { \ - iov.iov_base = (void *)buf; \ - iov.iov_len = (size_t)len; \ - } while (0) - -#define EFA_SETUP_MSG(msg, iov, _desc, count, _addr, _context, _data) \ - do { \ - msg.msg_iov = (const struct iovec *)iov; \ - msg.desc = (void **)_desc; \ - msg.iov_count = (size_t)count; \ - msg.addr = (fi_addr_t)_addr; \ - msg.context = (void *)_context; \ - msg.data = (uint32_t)_data; \ - } while (0) - -#ifndef EFA_MSG_DUMP -static inline void dump_msg(const struct fi_msg *msg, const char *context) {} -#else -#define DUMP_IOV(i, iov, desc) \ - EFA_DBG(FI_LOG_EP_DATA, \ - "\t{ iov[%d] = { base = %p, buff = \"%s\", len = %zu }, desc = %p },\n", \ - i, iov.iov_base, (char *)iov.iov_base, iov.iov_len, (desc ? 
desc[i] : NULL)) - -static inline void dump_msg(const struct fi_msg *msg, const char *context) -{ - int i; - - EFA_DBG(FI_LOG_EP_DATA, "%s: { data = %u, addr = %" PRIu64 ", iov_count = %zu, [\n", - context, (unsigned)msg->data, msg->addr, msg->iov_count); - for (i = 0; i < msg->iov_count; ++i) - DUMP_IOV(i, msg->msg_iov[i], msg->desc); - EFA_DBG(FI_LOG_EP_DATA, " ] }\n"); -} -#endif /* EFA_MSG_DUMP */ - -static void free_send_wr_list(struct ibv_send_wr *head) -{ - struct ibv_send_wr *wr = head; - struct ibv_send_wr *tmp; - - while (wr) { - tmp = wr->next; - ofi_buf_free(container_of(wr, struct efa_send_wr, wr)); - wr = tmp; - } -} - -static void free_recv_wr_list(struct ibv_recv_wr *head) -{ - struct ibv_recv_wr *wr = head; - struct ibv_recv_wr *tmp; - - while (wr) { - tmp = wr->next; - ofi_buf_free(container_of(wr, struct efa_recv_wr, wr)); - wr = tmp; - } -} - -static ssize_t efa_dgram_post_recv_validate(struct efa_dgram_ep *ep, const struct fi_msg *msg) -{ - if (OFI_UNLIKELY(!ep->rcq)) { - EFA_WARN(FI_LOG_EP_DATA, "No receive cq was bound to ep.\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->iov_count > ep->base_ep.info->rx_attr->iov_limit)) { - EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%zu]!\n", - msg->iov_count, ep->base_ep.info->tx_attr->iov_limit); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->msg_iov[0].iov_len < - ep->base_ep.info->ep_attr->msg_prefix_size)) { - EFA_WARN(FI_LOG_EP_DATA, "prefix not present on first iov, iov_len[%zu]\n", - msg->msg_iov[0].iov_len); - return -EINVAL; - } - - return 0; -} - -/** - * @brief post receive buffer to EFA device via ibv_post_recv - * - * @param[in] ep endpoint - * @param[in] msg libfabric message - * @param[in] flags libfabric flags, currently only FI_MORE is supported. - * @reutrn On Success, return 0 - * On failure, return negative libfabric error code - */ -static ssize_t efa_dgram_post_recv(struct efa_dgram_ep *ep, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_mr *efa_mr; - struct efa_qp *qp = ep->base_ep.qp; - struct ibv_recv_wr *bad_wr; - struct efa_recv_wr *ewr; - struct ibv_recv_wr *wr; - uintptr_t addr; - ssize_t err, post_recv_err; - size_t i; - - ewr = ofi_buf_alloc(ep->recv_wr_pool); - if (OFI_UNLIKELY(!ewr)) - return -FI_ENOMEM; - - memset(ewr, 0, sizeof(*ewr) + sizeof(*ewr->sge) * msg->iov_count); - wr = &ewr->wr; - dump_msg(msg, "recv"); - - err = efa_dgram_post_recv_validate(ep, msg); - if (OFI_UNLIKELY(err)) { - ofi_buf_free(ewr); - goto out_err; - } - - wr->wr_id = (uintptr_t)msg->context; - wr->num_sge = msg->iov_count; - wr->sg_list = ewr->sge; - - for (i = 0; i < msg->iov_count; i++) { - addr = (uintptr_t)msg->msg_iov[i].iov_base; - - /* Set RX buffer desc from SGE */ - wr->sg_list[i].length = msg->msg_iov[i].iov_len; - assert(msg->desc[i]); - efa_mr = (struct efa_mr *)msg->desc[i]; - wr->sg_list[i].lkey = efa_mr->ibv_mr->lkey; - wr->sg_list[i].addr = addr; - } - - ep->base_ep.recv_more_wr_tail->next = wr; - ep->base_ep.recv_more_wr_tail = wr; - - if (flags & FI_MORE) - return 0; - -#if HAVE_LTTNG - struct ibv_recv_wr *head = ep->base_ep.recv_more_wr_head.next; - while (head) { - efa_tracepoint_wr_id_post_recv((void *) head->wr_id); - head = head->next; - } -#endif - - err = ibv_post_recv(qp->ibv_qp, ep->base_ep.recv_more_wr_head.next, &bad_wr); - if (OFI_UNLIKELY(err)) { - /* On failure, ibv_post_recv() return positive errno. - * Meanwhile, this function return a negative errno. - * So, we do the conversion here. - */ - err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; - } - - free_recv_wr_list(ep->base_ep.recv_more_wr_head.next); - ep->base_ep.recv_more_wr_tail = &ep->base_ep.recv_more_wr_head; - - return err; - -out_err: - if (ep->base_ep.recv_more_wr_head.next) { - post_recv_err = ibv_post_recv(qp->ibv_qp, ep->base_ep.recv_more_wr_head.next, &bad_wr); - if (post_recv_err) { - EFA_WARN(FI_LOG_EP_DATA, - "Encountered error %ld when ibv_post_recv on error handling path\n", - post_recv_err); - } - } - - free_recv_wr_list(ep->base_ep.recv_more_wr_head.next); - ep->base_ep.recv_more_wr_tail = &ep->base_ep.recv_more_wr_head; - - return err; -} - -static ssize_t efa_dgram_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - - return efa_dgram_post_recv(ep, msg, flags); -} - -static ssize_t efa_dgram_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct iovec iov; - struct fi_msg msg; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_MSG(msg, &iov, &desc, 1, src_addr, context, 0); - - return efa_dgram_post_recv(ep, &msg, 0); -} - -static ssize_t efa_dgram_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_msg msg; - - EFA_SETUP_MSG(msg, iov, desc, count, src_addr, context, 0); - - return efa_dgram_post_recv(ep, &msg, 0); -} - -static ssize_t efa_dgram_post_send_validate(struct efa_dgram_ep *ep, const struct fi_msg *msg, - struct efa_conn *conn, uint64_t flags, size_t *len) -{ - if (OFI_UNLIKELY(!ep->scq)) { - EFA_WARN(FI_LOG_EP_DATA, "No send cq was bound to ep.\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->iov_count > ep->base_ep.info->tx_attr->iov_limit)) { - EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%zu]!\n", - msg->iov_count, ep->base_ep.info->tx_attr->iov_limit); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->msg_iov[0].iov_len < - ep->base_ep.info->ep_attr->msg_prefix_size)) { - EFA_WARN(FI_LOG_EP_DATA, "prefix not present on first iov, iov_len[%zu]\n", - msg->msg_iov[0].iov_len); - return -EINVAL; - } - - *len = ofi_total_iov_len(msg->msg_iov, msg->iov_count) - ep->base_ep.info->ep_attr->msg_prefix_size; - if (OFI_UNLIKELY(*len > ep->base_ep.info->ep_attr->max_msg_size)) { - EFA_WARN(FI_LOG_EP_DATA, "requested size[%zu] is greater than max[%zu]!\n", - *len, ep->base_ep.info->ep_attr->max_msg_size); - return -FI_EINVAL; - } - - return 0; -} - -static void efa_dgram_post_send_sgl(struct efa_dgram_ep *ep, const struct fi_msg *msg, - struct efa_send_wr *ewr) -{ - struct efa_mr *efa_mr; - struct ibv_send_wr *wr = &ewr->wr; - struct ibv_sge *sge; - uint32_t length; - uintptr_t addr; - size_t i; - - wr->num_sge = msg->iov_count; - wr->sg_list = ewr->sge; - - for (i = 0; i < msg->iov_count; i++) { - sge = &wr->sg_list[i]; - addr = (uintptr_t)msg->msg_iov[i].iov_base; - length = msg->msg_iov[i].iov_len; - - /* Whole prefix must be on the first sgl */ - if (!i) { - /* Check if payload exists */ - if (length <= ep->base_ep.info->ep_attr->msg_prefix_size) - continue; - - addr += ep->base_ep.info->ep_attr->msg_prefix_size; - length -= ep->base_ep.info->ep_attr->msg_prefix_size; - } - - /* Set TX buffer desc from SGE */ - sge->length = length; - assert 
(msg->desc && msg->desc[i]); - efa_mr = (struct efa_mr *)msg->desc[i]; - sge->lkey = efa_mr->ibv_mr->lkey; - sge->addr = addr; - } -} - -ssize_t efa_dgram_post_flush(struct efa_dgram_ep *ep, struct ibv_send_wr **bad_wr, bool free) -{ - ssize_t ret; - -#if HAVE_LTTNG - struct ibv_send_wr *head = ep->base_ep.xmit_more_wr_head.next; - while (head) { - efa_tracepoint_wr_id_post_send((void *) head->wr_id); - head = head->next; - } -#endif - - ret = ibv_post_send(ep->base_ep.qp->ibv_qp, ep->base_ep.xmit_more_wr_head.next, bad_wr); - if (free) - free_send_wr_list(ep->base_ep.xmit_more_wr_head.next); - else - ep->base_ep.xmit_more_wr_head.next = NULL; - ep->base_ep.xmit_more_wr_tail = &ep->base_ep.xmit_more_wr_head; - return ret; -} - -static bool efa_msg_has_hmem_mr(const struct fi_msg *msg) -{ - /* the device only support send up 2 iov, so iov_count cannot be > 2 */ - assert(msg->iov_count == 1 || msg->iov_count == 2); - /* first iov is always on host memory, because it must contain packet header */ - assert(!efa_mr_is_hmem(msg->desc[0])); - return (msg->iov_count == 2) && efa_mr_is_hmem(msg->desc[1]); -} - -static ssize_t efa_dgram_post_send(struct efa_dgram_ep *ep, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_qp *qp = ep->base_ep.qp; - struct ibv_send_wr *bad_wr; - struct efa_send_wr *ewr; - struct ibv_send_wr *wr; - struct efa_conn *conn; - size_t len; - int ret; - - dump_msg(msg, "send"); - - ewr = ofi_buf_alloc(ep->send_wr_pool); - if (OFI_UNLIKELY(!ewr)) - return -FI_ENOMEM; - - memset(ewr, 0, sizeof(*ewr) + sizeof(*ewr->sge) * msg->iov_count); - wr = &ewr->wr; - conn = efa_av_addr_to_conn(ep->base_ep.av, msg->addr); - assert(conn && conn->ep_addr); - - ret = efa_dgram_post_send_validate(ep, msg, conn, flags, &len); - if (OFI_UNLIKELY(ret)) { - ofi_buf_free(ewr); - goto out_err; - } - - efa_dgram_post_send_sgl(ep, msg, ewr); - - if (len <= ep->base_ep.domain->device->efa_attr.inline_buf_size && - !efa_msg_has_hmem_mr(msg)) - wr->send_flags |= IBV_SEND_INLINE; - - wr->opcode = IBV_WR_SEND; - wr->wr_id = (uintptr_t)msg->context; - wr->wr.ud.ah = conn->ah->ibv_ah; - wr->wr.ud.remote_qpn = conn->ep_addr->qpn; - wr->wr.ud.remote_qkey = conn->ep_addr->qkey; - - ep->base_ep.xmit_more_wr_tail->next = wr; - ep->base_ep.xmit_more_wr_tail = wr; - - if (flags & FI_MORE) - return 0; - - ret = efa_dgram_post_flush(ep, &bad_wr, true /* free ibv_send_wr */); - - return ret; - -out_err: - if (ep->base_ep.xmit_more_wr_head.next) - ibv_post_send(qp->ibv_qp, ep->base_ep.xmit_more_wr_head.next, &bad_wr); - - free_send_wr_list(ep->base_ep.xmit_more_wr_head.next); - ep->base_ep.xmit_more_wr_tail = &ep->base_ep.xmit_more_wr_head; - - return ret; -} - -static ssize_t efa_dgram_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - - return efa_dgram_post_send(ep, msg, flags); -} - -static ssize_t efa_dgram_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_msg msg; - struct iovec iov; - uint64_t flags; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, 0); - flags = ep->base_ep.info->tx_attr->op_flags; - - return efa_dgram_post_send(ep, &msg, flags); -} - -static ssize_t efa_dgram_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, uint64_t data, 
fi_addr_t dest_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_msg msg; - struct iovec iov; - uint64_t flags; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, data); - - flags = ep->base_ep.info->tx_attr->op_flags | FI_REMOTE_CQ_DATA; - - return efa_dgram_post_send(ep, &msg, flags); -} - -static ssize_t efa_dgram_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, - size_t count, fi_addr_t dest_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_msg msg; - uint64_t flags; - - EFA_SETUP_MSG(msg, iov, desc, count, dest_addr, context, 0); - - flags = ep->base_ep.info->tx_attr->op_flags; - - return efa_dgram_post_send(ep, &msg, flags); -} - -struct fi_ops_msg efa_dgram_ep_msg_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = efa_dgram_ep_recv, - .recvv = efa_dgram_ep_recvv, - .recvmsg = efa_dgram_ep_recvmsg, - .send = efa_dgram_ep_send, - .sendv = efa_dgram_ep_sendv, - .sendmsg = efa_dgram_ep_sendmsg, - .inject = fi_no_msg_inject, - .senddata = efa_dgram_ep_senddata, - .injectdata = fi_no_msg_injectdata, -}; diff --git a/prov/efa/src/dgram/efa_dgram_rma.c b/prov/efa/src/dgram/efa_dgram_rma.c deleted file mode 100644 index 99f4c1a2929..00000000000 --- a/prov/efa/src/dgram/efa_dgram_rma.c +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include -#include -#include -#include -#include "efa_dgram_ep.h" -#include "efa.h" -#include "efa_av.h" - - -/* - * efa_dgram_rma_post_read() will post a read request. - * - * Input: - * ep: endpoint - * msg: read operation information - * flags: currently no flags is taken - * self_comm: indicate whether the read is toward - * the end point itself. If self_comm is true, - * caller must set msg->addr to FI_ADDR_NOTAVAIL. - * - * On success return 0, - * If read iov and rma_iov count out of device limit, return -FI_EINVAL - * If read failed, return the error of read operation - */ -ssize_t efa_dgram_rma_post_read(struct efa_dgram_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags, bool self_comm) -{ - struct efa_qp *qp; - struct efa_mr *efa_mr; - struct efa_conn *conn; -#ifndef _WIN32 - struct ibv_sge sge_list[msg->iov_count]; -#else - /* MSVC compiler does not support array declarations with runtime size, so hardcode - * the expected iov_limit/max_sq_sge from the lower-level efa provider. 
- */ - struct ibv_sge sge_list[EFA_DEV_ATTR_MAX_WR_SGE]; -#endif - int i; - - if (OFI_UNLIKELY(msg->iov_count > ep->base_ep.domain->device->ibv_attr.max_sge_rd)) { - EFA_WARN(FI_LOG_CQ, "invalid iov_count!\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->rma_iov_count > ep->base_ep.domain->info->tx_attr->rma_iov_limit)) { - EFA_WARN(FI_LOG_CQ, "invalid rma_iov_count!\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(ofi_total_iov_len(msg->msg_iov, msg->iov_count) - > ep->base_ep.domain->device->max_rdma_size)) { - EFA_WARN(FI_LOG_CQ, "maximum rdma_size exceeded!\n"); - return -FI_EINVAL; - } - - /* caller must provide desc because EFA require FI_MR_LOCAL */ - assert(msg->desc); - - /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ - qp = ep->base_ep.qp; - ibv_wr_start(qp->ibv_qp_ex); - qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; - ibv_wr_rdma_read(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); - - for (i = 0; i < msg->iov_count; ++i) { - sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; - sge_list[i].length = msg->msg_iov[i].iov_len; - assert(msg->desc[i]); - efa_mr = (struct efa_mr *)msg->desc[i]; - sge_list[i].lkey = efa_mr->ibv_mr->lkey; - } - - ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); - if (self_comm) { - assert(msg->addr == FI_ADDR_NOTAVAIL); - ibv_wr_set_ud_addr(qp->ibv_qp_ex, ep->base_ep.self_ah, - qp->qp_num, qp->qkey); - } else { - conn = efa_av_addr_to_conn(ep->base_ep.av, msg->addr); - assert(conn && conn->ep_addr); - ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, - conn->ep_addr->qpn, conn->ep_addr->qkey); - } - - return ibv_wr_complete(qp->ibv_qp_ex); -} - -static -ssize_t efa_dgram_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - - return efa_dgram_rma_post_read(ep, msg, flags, false); -} - -static -ssize_t efa_dgram_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t iov_count, fi_addr_t src_addr, uint64_t addr, - uint64_t key, void *context) -{ - struct fi_rma_iov rma_iov; - struct fi_msg_rma msg; - - rma_iov.addr = addr; - rma_iov.len = ofi_total_iov_len(iov, iov_count); - rma_iov.key = key; - - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = iov_count; - msg.addr = src_addr; - msg.context = context; - msg.rma_iov = &rma_iov; - msg.rma_iov_count = 1; - - return efa_dgram_rma_readmsg(ep, &msg, 0); -} - -static -ssize_t efa_dgram_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, uint64_t addr, uint64_t key, - void *context) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - return efa_dgram_rma_readv(ep, &iov, &desc, 1, src_addr, addr, key, context); -} - -struct fi_ops_rma efa_dgram_ep_rma_ops = { - .size = sizeof(struct fi_ops_rma), - .read = efa_dgram_rma_read, - .readv = efa_dgram_rma_readv, - .readmsg = efa_dgram_rma_readmsg, - .write = fi_no_rma_write, - .writev = fi_no_rma_writev, - .writemsg = fi_no_rma_writemsg, - .inject = fi_no_rma_inject, - .writedata = fi_no_rma_writedata, - .injectdata = fi_no_rma_injectdata, -}; - diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index e8325330406..5f1cf162c2b 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -107,6 +107,41 @@ struct efa_fabric { #endif }; +struct efa_context { + uint64_t completion_flags; + fi_addr_t addr; +}; + +#if defined(static_assert) +static_assert(sizeof(struct efa_context) 
<= sizeof(struct fi_context2), + "efa_context must not be larger than fi_context2"); +#endif + +/** + * Prepare and return a pointer to an EFA context structure. + * + * @param context Pointer to the msg context. + * @param addr Peer address associated with the operation. + * @param flags Operation flags (e.g., FI_COMPLETION). + * @param completion_flags Completion flags reported in the cq entry. + * @return A pointer to an initialized EFA context structure, + * or NULL if context is invalid or FI_COMPLETION is not set. + */ +static inline struct efa_context *efa_fill_context(const void *context, + fi_addr_t addr, + uint64_t flags, + uint64_t completion_flags) +{ + if (!context || !(flags & FI_COMPLETION)) + return NULL; + + struct efa_context *efa_context = (struct efa_context *) context; + efa_context->completion_flags = completion_flags; + efa_context->addr = addr; + + return efa_context; +} + static inline int efa_str_to_ep_addr(const char *node, const char *service, struct efa_ep_addr *addr) { @@ -221,4 +256,27 @@ static inline void efa_perfset_end(struct efa_rdm_ep *ep, size_t index) #define efa_perfset_end(ep, index) do {} while (0) #endif +static inline +bool efa_use_unsolicited_write_recv() +{ + return efa_env.use_unsolicited_write_recv && efa_device_support_unsolicited_write_recv(); +} + +/** + * Convenience macro for setopt with an enforced threshold + */ +#define EFA_EP_SETOPT_THRESHOLD(opt, field, threshold) { \ + size_t _val = *(size_t *) optval; \ + if (optlen != sizeof field) \ + return -FI_EINVAL; \ + if (_val > threshold) { \ + EFA_WARN(FI_LOG_EP_CTRL, \ + "Requested size of %zu for FI_OPT_" #opt " " \ + "exceeds the maximum (%zu)\n", \ + _val, threshold); \ + return -FI_EINVAL; \ + } \ + field = _val; \ +} + #endif /* EFA_H */ diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 0b692ed21a8..9c574c54121 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -53,13 +53,9 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) struct util_av_entry *util_av_entry; struct efa_av_entry *efa_av_entry; - if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC)) + if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL)) return NULL; - if (av->type == FI_AV_MAP) { - return (struct efa_conn *)fi_addr; - } - assert(av->type == FI_AV_TABLE); util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr); if (!util_av_entry) @@ -70,7 +66,7 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) } /** - * @brief find fi_addr for dgram endpoint + * @brief find fi_addr for efa endpoint * * @param[in] av address vector * @param[in] ahn address handle number @@ -78,7 +74,7 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) * @return On success, return fi_addr to the peer who send the packet * If no such peer exist, return FI_ADDR_NOTAVAIL */ -fi_addr_t efa_av_reverse_lookup_dgram(struct efa_av *av, uint16_t ahn, uint16_t qpn) +fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) { struct efa_cur_reverse_av *cur_entry; struct efa_cur_reverse_av_key cur_key; @@ -448,7 +444,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, struct util_av_entry *util_av_entry = NULL; struct efa_av_entry *efa_av_entry = NULL; struct efa_conn *conn; - fi_addr_t util_av_fi_addr; + fi_addr_t fi_addr; int err; if (flags & FI_SYNC_ERR) @@ -460,7 +456,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, return 
NULL; } - err = ofi_av_insert_addr(&av->util_av, raw_addr, &util_av_fi_addr); + err = ofi_av_insert_addr(&av->util_av, raw_addr, &fi_addr); if (err) { EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n", fi_strerror(err)); @@ -468,16 +464,15 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, } util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, - util_av_fi_addr); + fi_addr); efa_av_entry = (struct efa_av_entry *)util_av_entry->data; assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)efa_av_entry->ep_addr)); conn = &efa_av_entry->conn; memset(conn, 0, sizeof(*conn)); conn->ep_addr = (struct efa_ep_addr *)efa_av_entry->ep_addr; - assert(av->type == FI_AV_MAP || av->type == FI_AV_TABLE); - conn->fi_addr = (av->type == FI_AV_MAP) ? (uintptr_t)(void *)conn : util_av_fi_addr; - conn->util_av_fi_addr = util_av_fi_addr; + assert(av->type == FI_AV_TABLE); + conn->fi_addr = fi_addr; conn->ah = efa_ah_alloc(av, raw_addr->raw); if (!conn->ah) @@ -506,7 +501,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, efa_ah_release(av, conn->ah); conn->ep_addr = NULL; - err = ofi_av_remove_addr(&av->util_av, util_av_fi_addr); + err = ofi_av_remove_addr(&av->util_av, fi_addr); if (err) EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n", err); @@ -556,11 +551,11 @@ void efa_conn_release(struct efa_av *av, struct efa_conn *conn) efa_ah_release(av, conn->ah); - util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, conn->util_av_fi_addr); + util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, conn->fi_addr); assert(util_av_entry); efa_av_entry = (struct efa_av_entry *)util_av_entry->data; - err = ofi_av_remove_addr(&av->util_av, conn->util_av_fi_addr); + err = ofi_av_remove_addr(&av->util_av, conn->fi_addr); if (err) { EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err); } @@ -691,7 +686,7 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid); struct efa_conn *conn = NULL; - if (av->type != FI_AV_MAP && av->type != FI_AV_TABLE) + if (av->type != FI_AV_TABLE) return -FI_EINVAL; if (fi_addr == FI_ADDR_NOTAVAIL) @@ -744,7 +739,7 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, return -FI_EINVAL; av = container_of(av_fid, struct efa_av, util_av.av_fid); - if (av->type != FI_AV_MAP && av->type != FI_AV_TABLE) + if (av->type != FI_AV_TABLE) return -FI_EINVAL; ofi_mutex_lock(&av->util_av.lock); @@ -897,6 +892,10 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, if (!av) return -FI_ENOMEM; + if (attr->type == FI_AV_MAP) { + EFA_WARN(FI_LOG_AV, "FI_AV_MAP is deprecated in Libfabric 2.x. Please use FI_AV_TABLE. 
" + "EFA provider will now switch to using FI_AV_TABLE.\n"); + } attr->type = FI_AV_TABLE; efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid); diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 75acd87fdd7..bd4d4a2d74e 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -22,10 +22,7 @@ struct efa_ah { struct efa_conn { struct efa_ah *ah; struct efa_ep_addr *ep_addr; - /* for FI_AV_TABLE, fi_addr is same as util_av_fi_addr, - * for FI_AV_MAP, fi_addr is pointer to efa_conn; */ fi_addr_t fi_addr; - fi_addr_t util_av_fi_addr; struct efa_rdm_peer rdm_peer; }; @@ -86,6 +83,6 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr); fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, uint16_t qpn, struct efa_rdm_pke *pkt_entry); -fi_addr_t efa_av_reverse_lookup_dgram(struct efa_av *av, uint16_t ahn, uint16_t qpn); +fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn); #endif \ No newline at end of file diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 8c55fee2387..52dae8a030d 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -4,6 +4,8 @@ #include #include "efa.h" #include "efa_av.h" +#include "efa_cq.h" +#include "efa_cntr.h" #include "rdm/efa_rdm_protocol.h" int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) @@ -86,6 +88,9 @@ int efa_base_ep_destruct(struct efa_base_ep *base_ep) if (base_ep->efa_recv_wr_vec) free(base_ep->efa_recv_wr_vec); + + if (base_ep->user_recv_wr_vec) + free(base_ep->user_recv_wr_vec); return err; } @@ -167,7 +172,7 @@ static int efa_base_ep_modify_qp_rst2rts(struct efa_base_ep *base_ep, * @param init_attr_ex ibv_qp_init_attr_ex * @return int 0 on success, negative integer on failure */ -int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex) +int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, uint32_t tclass) { struct efadv_qp_init_attr efa_attr = { 0 }; @@ -175,20 +180,44 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex) if (!*qp) return -FI_ENOMEM; + init_attr_ex->comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM; + if (init_attr_ex->qp_type == IBV_QPT_UD) { (*qp)->ibv_qp = ibv_create_qp_ex(init_attr_ex->pd->context, init_attr_ex); } else { assert(init_attr_ex->qp_type == IBV_QPT_DRIVER); + if (efa_device_support_rdma_read()) + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; + if (efa_device_support_rdma_write()) { + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE; + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; + } #if HAVE_CAPS_UNSOLICITED_WRITE_RECV - if (efa_rdm_use_unsolicited_write_recv()) + if (efa_use_unsolicited_write_recv()) efa_attr.flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; #endif efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD; +#if HAVE_EFADV_SL + efa_attr.sl = EFA_QP_DEFAULT_SERVICE_LEVEL; + if (tclass == FI_TC_LOW_LATENCY) + efa_attr.sl = EFA_QP_LOW_LATENCY_SERVICE_LEVEL; +#endif + (*qp)->ibv_qp = efadv_create_qp_ex( + init_attr_ex->pd->context, init_attr_ex, &efa_attr, + sizeof(struct efadv_qp_init_attr)); + } + +#if HAVE_EFADV_SL + if (!(*qp)->ibv_qp && tclass == FI_TC_LOW_LATENCY) { + EFA_INFO(FI_LOG_EP_CTRL, "ibv_create_qp failed with sl %u, errno: %d. 
Retrying with default sl.\n", efa_attr.sl, errno); + efa_attr.sl = EFA_QP_DEFAULT_SERVICE_LEVEL; (*qp)->ibv_qp = efadv_create_qp_ex( init_attr_ex->pd->context, init_attr_ex, &efa_attr, sizeof(struct efadv_qp_init_attr)); } +#endif if (!(*qp)->ibv_qp) { EFA_WARN(FI_LOG_EP_CTRL, "ibv_create_qp failed. errno: %d\n", errno); @@ -206,7 +235,7 @@ int efa_base_ep_create_qp(struct efa_base_ep *base_ep, { int ret; - ret = efa_qp_create(&base_ep->qp, init_attr_ex); + ret = efa_qp_create(&base_ep->qp, init_attr_ex, base_ep->info->tx_attr->tclass); if (ret) return ret; @@ -321,18 +350,31 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, return -FI_ENOMEM; } - base_ep->rnr_retry = efa_env.rnr_retry; + /* This is the SRD qp's default behavior */ + base_ep->rnr_retry = EFA_RNR_INFINITE_RETRY; - base_ep->xmit_more_wr_tail = &base_ep->xmit_more_wr_head; - base_ep->recv_more_wr_tail = &base_ep->recv_more_wr_head; base_ep->efa_recv_wr_vec = calloc(sizeof(struct efa_recv_wr), EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV); if (!base_ep->efa_recv_wr_vec) { EFA_WARN(FI_LOG_EP_CTRL, "cannot alloc memory for base_ep->efa_recv_wr_vec!\n"); return -FI_ENOMEM; } + base_ep->user_recv_wr_vec = calloc(sizeof(struct efa_recv_wr), EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV); + if (!base_ep->user_recv_wr_vec) { + EFA_WARN(FI_LOG_EP_CTRL, "cannot alloc memory for base_ep->user_recv_wr_vec!\n"); + return -FI_ENOMEM; + } + base_ep->recv_wr_index = 0; base_ep->efa_qp_enabled = false; base_ep->qp = NULL; base_ep->user_recv_qp = NULL; + + /* Use the device's native limits as the default values of the base ep */ + base_ep->max_msg_size = (size_t) base_ep->domain->device->ibv_port_attr.max_msg_sz; + base_ep->max_rma_size = (size_t) base_ep->domain->device->max_rdma_size; + base_ep->inject_msg_size = (size_t) base_ep->domain->device->efa_attr.inline_buf_size; + /* TODO: update inject_rma_size to the inline size after firmware + * supports inline rdma write */ + base_ep->inject_rma_size = 0; return 0; } @@ -444,3 +486,261 @@ void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, ssize_t pro prov_errno, efa_strerror(prov_errno)); abort(); } + +const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, size_t *buflen) +{ + return ofi_straddr(buf, buflen, FI_ADDR_EFA, &base_ep->src_addr); +} + +/** + * @brief return peer's raw address in #efa_ep_addr + * + * @param[in] base_ep endpoint + * @param[in] addr libfabric address + * @returns + * If the peer exists, return the peer's raw address as a pointer to #efa_ep_addr; + * otherwise, return NULL + */ +struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, fi_addr_t addr) +{ + struct efa_av *efa_av; + struct efa_conn *efa_conn; + + efa_av = base_ep->av; + efa_conn = efa_av_addr_to_conn(efa_av, addr); + return efa_conn ? efa_conn->ep_addr : NULL; +} + +/** + * @brief return peer's raw address in a readable string + * + * @param[in] base_ep endpoint + * @param[in] addr libfabric address + * @param[out] buf a buffer used to store the string + * @param[in,out] buflen length of `buf` as input; length of the string as output. + * @return a string with the peer's raw address + */ +const char *efa_base_ep_get_peer_raw_addr_str(struct efa_base_ep *base_ep, fi_addr_t addr, char *buf, size_t *buflen) +{ + return ofi_straddr(buf, buflen, FI_ADDR_EFA, efa_base_ep_get_peer_raw_addr(base_ep, addr)); +} + +struct efa_cq *efa_base_ep_get_tx_cq(struct efa_base_ep *ep) +{ + return ep->util_ep.tx_cq ?
container_of(ep->util_ep.tx_cq, struct efa_cq, util_cq) : NULL; +} + +struct efa_cq *efa_base_ep_get_rx_cq(struct efa_base_ep *ep) +{ + return ep->util_ep.rx_cq ? container_of(ep->util_ep.rx_cq, struct efa_cq, util_cq) : NULL; +} + +/** + * @brief Construct the ibv qp init attr for a given ep and cq + * + * @param ep a ptr to the efa_base_ep + * @param attr_ex the constructed qp attr + * @param tx_cq tx cq + * @param rx_cq rx cq + */ +static inline +void efa_base_ep_construct_ibv_qp_init_attr_ex(struct efa_base_ep *ep, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_cq_ex *tx_cq, + struct ibv_cq_ex *rx_cq) +{ + struct fi_info *info; + + if (ep->info->ep_attr->type == FI_EP_RDM) { + attr_ex->qp_type = IBV_QPT_DRIVER; + info = ep->domain->device->rdm_info; + } else { + assert(ep->info->ep_attr->type == FI_EP_DGRAM); + attr_ex->qp_type = IBV_QPT_UD; + info = ep->domain->device->dgram_info; + } + attr_ex->cap.max_send_wr = info->tx_attr->size; + attr_ex->cap.max_send_sge = info->tx_attr->iov_limit; + attr_ex->cap.max_recv_wr = info->rx_attr->size; + attr_ex->cap.max_recv_sge = info->rx_attr->iov_limit; + attr_ex->cap.max_inline_data = ep->domain->device->efa_attr.inline_buf_size; + attr_ex->pd = ep->domain->ibv_pd; + attr_ex->qp_context = ep; + attr_ex->sq_sig_all = 1; + + attr_ex->send_cq = ibv_cq_ex_to_cq(tx_cq); + attr_ex->recv_cq = ibv_cq_ex_to_cq(rx_cq); +} + +/** + * @brief check in-order aligned 128 bytes support for a given ibv wr op code + * + * @param ep efa_base_ep + * @param op_code ibv wr op code + * @return int 0 if in-order aligned 128 bytes is supported, -FI_EOPNOTSUPP if + * it is not supported. Other negative integers indicate other errors. + */ +int efa_base_ep_check_qp_in_order_aligned_128_bytes(struct efa_base_ep *ep, + enum ibv_wr_opcode op_code) +{ + struct efa_qp *qp = NULL; + struct ibv_qp_init_attr_ex attr_ex = {0}; + int ret, retv; + struct ibv_cq_ex *ibv_cq_ex = NULL; + enum ibv_cq_ex_type ibv_cq_ex_type; + struct fi_cq_attr cq_attr = {0}; + + ret = efa_cq_ibv_cq_ex_open(&cq_attr, ep->domain->device->ibv_ctx, &ibv_cq_ex, &ibv_cq_ex_type); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %d\n", ret); + ret = -FI_EINVAL; + goto out; + } + + /* Create a dummy qp for query only */ + efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq_ex, ibv_cq_ex); + + ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC); + if (ret) + goto out; + + if (!efa_qp_support_op_in_order_aligned_128_bytes(qp, op_code)) + ret = -FI_EOPNOTSUPP; + +out: + if (qp) + efa_qp_destruct(qp); + + if (ibv_cq_ex) { + retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(ibv_cq_ex)); + if (retv) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close ibv cq: %s\n", + fi_strerror(-retv)); + } + return ret; +} + +/** + * @brief Insert tx/rx cq into the cntrs the ep is bound to + * + * @param ep efa_base_ep + * @return int 0 on success, negative integer on failure + */ +int efa_base_ep_insert_cntr_ibv_cq_poll_list(struct efa_base_ep *ep) +{ + int i, ret; + struct efa_cntr *efa_cntr; + struct util_cntr *util_cntr; + struct efa_cq *tx_cq, *rx_cq; + + tx_cq = efa_base_ep_get_tx_cq(ep); + rx_cq = efa_base_ep_get_rx_cq(ep); + + for (i = 0; i < CNTR_CNT; i++) { + util_cntr = ep->util_ep.cntrs[i]; + if (util_cntr) { + efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); + if (tx_cq) { + ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); + if (ret) + return ret; + } + if (rx_cq) { + ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list,
&efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); + if (ret) + return ret; + } + ofi_genlock_lock(&efa_cntr->util_cntr.ep_list_lock); + efa_cntr->need_to_scan_ep_list = true; + ofi_genlock_unlock(&efa_cntr->util_cntr.ep_list_lock); + } + } + + return FI_SUCCESS; +} + +/** + * @brief Remove tx/rx cq from the cntr that the ep is bound to + * + * @param ep efa_base_ep + */ +void efa_base_ep_remove_cntr_ibv_cq_poll_list(struct efa_base_ep *ep) +{ + int i; + struct efa_cntr *efa_cntr; + struct util_cntr *util_cntr; + struct efa_cq *tx_cq, *rx_cq; + + tx_cq = efa_base_ep_get_tx_cq(ep); + rx_cq = efa_base_ep_get_rx_cq(ep); + + for (i = 0; i < CNTR_CNT; i++) { + util_cntr = ep->util_ep.cntrs[i]; + if (util_cntr) { + efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); + if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) + efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); + + if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) + efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); + } + } +} + +/** + * @brief Create and enable the IBV QP that backs the EP + * + * @param ep efa_base_ep + * @param create_user_recv_qp whether to create the user_recv_qp. This boolean + * is only true for the zero-copy recv mode in the efa-rdm endpoint + * + * @return int 0 on success, negative integer on failure + */ +int efa_base_ep_create_and_enable_qp(struct efa_base_ep *ep, bool create_user_recv_qp) +{ + struct ibv_qp_init_attr_ex attr_ex = { 0 }; + struct efa_cq *scq, *rcq; + struct ibv_cq_ex *tx_ibv_cq, *rx_ibv_cq; + int err; + + scq = efa_base_ep_get_tx_cq(ep); + rcq = efa_base_ep_get_rx_cq(ep); + + if (!scq && !rcq) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a send or receive completion queue\n"); + return -FI_ENOCQ; + } + + if (!scq && ofi_needs_tx(ep->info->caps)) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); + return -FI_ENOCQ; + } + + if (!rcq && ofi_needs_rx(ep->info->caps)) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled (FI_RECV).\n"); + return -FI_ENOCQ; + } + + tx_ibv_cq = scq ? scq->ibv_cq.ibv_cq_ex : rcq->ibv_cq.ibv_cq_ex; + rx_ibv_cq = rcq ? rcq->ibv_cq.ibv_cq_ex : scq->ibv_cq.ibv_cq_ex; + + efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, tx_ibv_cq, rx_ibv_cq); + + err = efa_base_ep_create_qp(ep, &attr_ex); + if (err) + return err; + + if (create_user_recv_qp) { + err = efa_qp_create(&ep->user_recv_qp, &attr_ex, ep->info->tx_attr->tclass); + if (err) { + efa_base_ep_destruct_qp(ep); + return err; + } + ep->user_recv_qp->base_ep = ep; + } + + return efa_base_ep_enable(ep); +} diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index bbcd0d26a2d..cb1edea598b 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -12,6 +12,33 @@ #include "ofi_util.h" #include "rdm/efa_rdm_protocol.h" +#define EFA_QP_DEFAULT_SERVICE_LEVEL 0 +#define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8 +#define EFA_ERROR_MSG_BUFFER_LENGTH 1024 + +/* Default rnr_retry for the efa-rdm ep. + * If the first attempt to send a packet fails, + * this value controls how many times the firmware + * retries the send before it reports an RNR error + * (via an rdma-core error cq entry).
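+ * For example, with EFA_RDM_DEFAULT_RNR_RETRY (3) below, the firmware retries a send three times before reporting an RNR error.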
+ * The valid number is from + * 0 (no retry) + * to + * EFA_RNR_INFINITE_RETRY (retry infinitely) + */ +#define EFA_RDM_DEFAULT_RNR_RETRY (3) +/** + * Infinite retry. + * NOTICE: this is the default rnr_retry + * mode for an SRD qp, so modifying qp_attr.rnr_retry + * to this value has the same behavior as + * not modifying the qp's rnr_retry attribute + */ +#define EFA_RNR_INFINITE_RETRY (7) + +#define efa_rx_flags(efa_base_ep) ((efa_base_ep)->util_ep.rx_op_flags) +#define efa_tx_flags(efa_base_ep) ((efa_base_ep)->util_ep.tx_op_flags) + struct efa_qp { struct ibv_qp *ibv_qp; struct ibv_qp_ex *ibv_qp_ex; @@ -20,6 +47,18 @@ struct efa_qp { uint32_t qkey; }; +#define EFA_GID_LEN 16 + +struct efa_ep_addr { + uint8_t raw[EFA_GID_LEN]; + uint16_t qpn; + uint16_t pad; + uint32_t qkey; + struct efa_ep_addr *next; +}; + +#define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr) + struct efa_av; struct efa_recv_wr { @@ -48,14 +87,17 @@ struct efa_base_ep { bool efa_qp_enabled; bool is_wr_started; - struct ibv_send_wr xmit_more_wr_head; - struct ibv_send_wr *xmit_more_wr_tail; - struct ibv_recv_wr recv_more_wr_head; - struct ibv_recv_wr *recv_more_wr_tail; struct efa_recv_wr *efa_recv_wr_vec; + size_t recv_wr_index; + + size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */ + size_t max_rma_size; /**< #FI_OPT_MAX_RMA_SIZE */ + size_t inject_msg_size; /**< #FI_OPT_INJECT_MSG_SIZE */ + size_t inject_rma_size; /**< #FI_OPT_INJECT_RMA_SIZE */ /* Only used by RDM ep type */ struct efa_qp *user_recv_qp; /* Separate qp to receive pkts posted by users */ + struct efa_recv_wr *user_recv_wr_vec; }; int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av); @@ -72,7 +114,10 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, int efa_base_ep_getname(fid_t fid, void *addr, size_t *addrlen); -int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex); +int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, + struct fid_ep **ep_fid, void *context); + +int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, uint32_t tclass); void efa_qp_destruct(struct efa_qp *qp); @@ -90,4 +135,27 @@ void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, ssize_t prov_errno); +const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, + size_t *buflen); + +struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, + fi_addr_t addr); + +const char *efa_base_ep_get_peer_raw_addr_str(struct efa_base_ep *base_ep, + fi_addr_t addr, char *buf, + size_t *buflen); + +struct efa_cq *efa_base_ep_get_tx_cq(struct efa_base_ep *ep); + +struct efa_cq *efa_base_ep_get_rx_cq(struct efa_base_ep *ep); + +int efa_base_ep_check_qp_in_order_aligned_128_bytes(struct efa_base_ep *base_ep, + enum ibv_wr_opcode op_code); + +int efa_base_ep_insert_cntr_ibv_cq_poll_list(struct efa_base_ep *ep); + +void efa_base_ep_remove_cntr_ibv_cq_poll_list(struct efa_base_ep *ep); + +int efa_base_ep_create_and_enable_qp(struct efa_base_ep *ep, bool create_user_recv_qp); + #endif diff --git a/prov/efa/src/efa_cntr.c b/prov/efa/src/efa_cntr.c index 3a014c1c614..c30a3d862d4 100644 --- a/prov/efa/src/efa_cntr.c +++ b/prov/efa/src/efa_cntr.c @@ -161,13 +161,13 @@ static void efa_rdm_cntr_progress(struct util_cntr *cntr) * some idle endpoints and never poll completions for them. Move these initial posts to * the first polling before having a long term fix.
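 * (need_to_scan_ep_list is raised by efa_base_ep_insert_cntr_ibv_cq_poll_list whenever a cq is added to the cntr poll list, so newly bound endpoints get these initial posts on the next progress call.)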
*/ - if (!efa_cntr->initial_rx_to_all_eps_posted) { + if (efa_cntr->need_to_scan_ep_list) { dlist_foreach(&cntr->ep_list, item) { fid_entry = container_of(item, struct fid_list_entry, entry); efa_rdm_ep = container_of(fid_entry->fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep); } - efa_cntr->initial_rx_to_all_eps_posted = true; + efa_cntr->need_to_scan_ep_list = false; } dlist_foreach(&efa_cntr->ibv_cq_poll_list, item) { @@ -178,6 +178,22 @@ static void efa_rdm_cntr_progress(struct util_cntr *cntr) ofi_genlock_unlock(&cntr->ep_list_lock); } +static void efa_cntr_progress(struct util_cntr *cntr) +{ + struct dlist_entry *item; + struct efa_ibv_cq_poll_list_entry *poll_list_entry; + struct efa_cntr *efa_cntr; + + efa_cntr = container_of(cntr, struct efa_cntr, util_cntr); + + ofi_genlock_lock(&cntr->ep_list_lock); + dlist_foreach(&efa_cntr->ibv_cq_poll_list, item) { + poll_list_entry = container_of(item, struct efa_ibv_cq_poll_list_entry, entry); + efa_cq_poll_ibv_cq(efa_env.efa_cq_read_size, poll_list_entry->cq); + } + ofi_genlock_unlock(&cntr->ep_list_lock); +} + int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr_fid, void *context) { @@ -193,13 +209,13 @@ int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, return -FI_ENOMEM; dlist_init(&cntr->ibv_cq_poll_list); - cntr->initial_rx_to_all_eps_posted = false; + cntr->need_to_scan_ep_list = false; efa_domain = container_of(domain, struct efa_domain, util_domain.domain_fid); cntr_progress_func = efa_domain->info->ep_attr->type == FI_EP_RDM ? efa_rdm_cntr_progress - : ofi_cntr_progress; + : efa_cntr_progress; ret = ofi_cntr_init(&efa_prov, domain, attr, &cntr->util_cntr, cntr_progress_func, context); diff --git a/prov/efa/src/efa_cntr.h b/prov/efa/src/efa_cntr.h index 05227159d49..bcfde8784a2 100644 --- a/prov/efa/src/efa_cntr.h +++ b/prov/efa/src/efa_cntr.h @@ -13,7 +13,7 @@ struct efa_cntr { struct fid_cntr *shm_cntr; struct dlist_entry ibv_cq_poll_list; /* Only used by RDM EP type */ - bool initial_rx_to_all_eps_posted; + bool need_to_scan_ep_list; }; int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, diff --git a/prov/efa/src/efa_cq.c b/prov/efa/src/efa_cq.c new file mode 100644 index 00000000000..d5bfdb2c949 --- /dev/null +++ b/prov/efa/src/efa_cq.c @@ -0,0 +1,473 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include +#include +#include "config.h" +#include +#include "efa.h" +#include "efa_av.h" +#include "efa_cntr.h" +#include "efa_cq.h" +#include + + +static inline uint64_t efa_cq_opcode_to_fi_flags(enum ibv_wc_opcode opcode) { + switch (opcode) { + case IBV_WC_SEND: + return FI_SEND | FI_MSG; + case IBV_WC_RECV: + return FI_RECV | FI_MSG; + case IBV_WC_RDMA_WRITE: + return FI_RMA | FI_WRITE; + case IBV_WC_RECV_RDMA_WITH_IMM: + return FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE; + case IBV_WC_RDMA_READ: + return FI_RMA | FI_READ; + default: + assert(0); + return 0; + } +} + +static void efa_cq_construct_cq_entry(struct ibv_cq_ex *ibv_cqx, + struct fi_cq_tagged_entry *entry) +{ + entry->op_context = (void *)ibv_cqx->wr_id; + if (ibv_cqx->wr_id) + entry->flags = ((struct efa_context *) ibv_cqx->wr_id)->completion_flags; + else + entry->flags = efa_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); + entry->len = ibv_wc_read_byte_len(ibv_cqx); + entry->buf = NULL; + entry->data = 0; + entry->tag = 0; + + if (ibv_wc_read_wc_flags(ibv_cqx) & IBV_WC_WITH_IMM) { + entry->flags |= FI_REMOTE_CQ_DATA; + entry->data = ibv_wc_read_imm_data(ibv_cqx); + } +} + +/** + * @brief handle the situation that a TX/RX operation encountered error + * + * This function does the following to handle error: + * + * 1. write an error cq entry for the operation, if writing + * CQ error entry failed, it will write eq entry. + * + * 2. increase error counter. + * + * 3. print warning message with self and peer's raw address + * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + * @param[in] err positive libfabric error code + * @param[in] prov_errno positive EFA provider specific error code + * @param[in] is_tx if the error is for TX or RX operation + */ +static void efa_cq_handle_error(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex, int err, + int prov_errno, bool is_tx) +{ + struct fi_cq_err_entry err_entry; + fi_addr_t addr; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; + int write_cq_err; + + memset(&err_entry, 0, sizeof(err_entry)); + efa_cq_construct_cq_entry(ibv_cq_ex, (struct fi_cq_tagged_entry *) &err_entry); + err_entry.err = err; + err_entry.prov_errno = prov_errno; + + if (is_tx) + addr = ibv_cq_ex->wr_id ? ((struct efa_context *)ibv_cq_ex->wr_id)->addr : FI_ADDR_NOTAVAIL; + else + addr = efa_av_reverse_lookup(base_ep->av, + ibv_wc_read_slid(ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq_ex)); + + if (OFI_UNLIKELY(efa_write_error_msg(base_ep, addr, prov_errno, + err_msg, + &err_entry.err_data_size))) { + err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; + } + + EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", + err_entry.err, + err_entry.err_data + ? (const char *) err_entry.err_data + : efa_strerror(err_entry.prov_errno), + err_entry.prov_errno); + + efa_show_help(err_entry.prov_errno); + + efa_cntr_report_error(&base_ep->util_ep, err_entry.flags); + write_cq_err = ofi_cq_write_error(is_tx ? base_ep->util_ep.tx_cq : + base_ep->util_ep.rx_cq, + &err_entry); + if (write_cq_err) { + EFA_WARN( + FI_LOG_CQ, + "Error writing error cq entry when handling %s error\n", + is_tx ? 
"TX" : "RX"); + efa_base_ep_write_eq_error(base_ep, err, prov_errno); + } +} + +/** + * @brief handle the event that a TX request has been completed + * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + * @param[in] cq_entry fi_cq_tagged_entry + */ +static void efa_cq_handle_tx_completion(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex, + struct fi_cq_tagged_entry *cq_entry) +{ + struct util_cq *tx_cq = base_ep->util_ep.tx_cq; + int ret = 0; + + /* NULL wr_id means no FI_COMPLETION flag */ + if (!ibv_cq_ex->wr_id) + return; + + /* TX completions should not send peer address to util_cq */ + if (base_ep->util_ep.caps & FI_SOURCE) + ret = ofi_cq_write_src(tx_cq, cq_entry->op_context, + cq_entry->flags, cq_entry->len, + cq_entry->buf, cq_entry->data, + cq_entry->tag, FI_ADDR_NOTAVAIL); + else + ret = ofi_cq_write(tx_cq, cq_entry->op_context, cq_entry->flags, + cq_entry->len, cq_entry->buf, cq_entry->data, + cq_entry->tag); + + if (OFI_UNLIKELY(ret)) { + EFA_WARN(FI_LOG_CQ, "Unable to write send completion: %s\n", + fi_strerror(-ret)); + efa_cq_handle_error(base_ep, ibv_cq_ex, -ret, + FI_EFA_ERR_WRITE_SEND_COMP, true); + } +} + +/** + * @brief handle the event that a RX request has been completed + * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + * @param[in] cq_entry fi_cq_tagged_entry + */ +static void efa_cq_handle_rx_completion(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex, + struct fi_cq_tagged_entry *cq_entry) +{ + struct util_cq *rx_cq = base_ep->util_ep.rx_cq; + fi_addr_t src_addr; + int ret = 0; + + /* NULL wr_id means no FI_COMPLETION flag */ + if (!ibv_cq_ex->wr_id) + return; + + if (base_ep->util_ep.caps & FI_SOURCE) { + src_addr = efa_av_reverse_lookup(base_ep->av, + ibv_wc_read_slid(ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq_ex)); + ret = ofi_cq_write_src(rx_cq, cq_entry->op_context, + cq_entry->flags, cq_entry->len, + cq_entry->buf, cq_entry->data, + cq_entry->tag, src_addr); + } else { + ret = ofi_cq_write(rx_cq, cq_entry->op_context, cq_entry->flags, + cq_entry->len, cq_entry->buf, cq_entry->data, + cq_entry->tag); + } + + if (OFI_UNLIKELY(ret)) { + EFA_WARN(FI_LOG_CQ, "Unable to write recv completion: %s\n", + fi_strerror(-ret)); + efa_cq_handle_error(base_ep, ibv_cq_ex, -ret, + FI_EFA_ERR_WRITE_RECV_COMP, false); + } +} + +/** + * @brief handle rdma-core CQ completion resulted from IBV_WRITE_WITH_IMM + * + * This function handles hardware-assisted RDMA writes with immediate data at + * remote endpoint. These do not have a packet context, nor do they have a + * connid available. 
+ * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + */ +static void +efa_cq_proc_ibv_recv_rdma_with_imm_completion(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex) +{ + struct util_cq *rx_cq = base_ep->util_ep.rx_cq; + int ret; + fi_addr_t src_addr; + uint32_t imm_data = ibv_wc_read_imm_data(ibv_cq_ex); + uint32_t len = ibv_wc_read_byte_len(ibv_cq_ex); + uint64_t flags = FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE; + + if (base_ep->util_ep.caps & FI_SOURCE) { + src_addr = efa_av_reverse_lookup(base_ep->av, + ibv_wc_read_slid(ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq_ex)); + ret = ofi_cq_write_src(rx_cq, NULL, flags, len, NULL, imm_data, + 0, src_addr); + } else { + ret = ofi_cq_write(rx_cq, NULL, flags, len, NULL, imm_data, 0); + } + + if (OFI_UNLIKELY(ret)) { + EFA_WARN(FI_LOG_CQ, + "Unable to write a cq entry for remote for RECV_RDMA " + "operation: %s\n", + fi_strerror(-ret)); + efa_base_ep_write_eq_error(base_ep, -ret, + FI_EFA_ERR_WRITE_RECV_COMP); + } +} + +/** + * @brief poll rdma-core cq and process the cq entry + * + * @param[in] cqe_to_process Max number of cq entry to poll and process. + * A negative number means to poll until cq empty. + * @param[in] util_cq util_cq + */ +void efa_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) +{ + bool should_end_poll = false; + struct efa_base_ep *base_ep; + struct efa_cq *cq; + struct efa_domain *efa_domain; + struct fi_cq_tagged_entry cq_entry = {0}; + struct fi_cq_err_entry err_entry; + ssize_t err = 0; + size_t num_cqe = 0; /* Count of read entries */ + int prov_errno, opcode; + + /* Initialize an empty ibv_poll_cq_attr struct for ibv_start_poll. + * EFA expects .comp_mask = 0, or otherwise returns EINVAL. + */ + struct ibv_poll_cq_attr poll_cq_attr = {.comp_mask = 0}; + + cq = container_of(ibv_cq, struct efa_cq, ibv_cq); + efa_domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); + + /* Call ibv_start_poll only once */ + err = ibv_start_poll(cq->ibv_cq.ibv_cq_ex, &poll_cq_attr); + should_end_poll = !err; + + while (!err) { + base_ep = efa_domain->qp_table[ibv_wc_read_qp_num(cq->ibv_cq.ibv_cq_ex) & efa_domain->qp_table_sz_m1]->base_ep; + opcode = ibv_wc_read_opcode(cq->ibv_cq.ibv_cq_ex); + if (cq->ibv_cq.ibv_cq_ex->status) { + prov_errno = ibv_wc_read_vendor_err(cq->ibv_cq.ibv_cq_ex); + switch (opcode) { + case IBV_WC_SEND: /* fall through */ + case IBV_WC_RDMA_WRITE: /* fall through */ + case IBV_WC_RDMA_READ: + efa_cq_handle_error(base_ep, cq->ibv_cq.ibv_cq_ex, + to_fi_errno(prov_errno), + prov_errno, true); + break; + case IBV_WC_RECV: /* fall through */ + case IBV_WC_RECV_RDMA_WITH_IMM: + if (efa_cq_wc_is_unsolicited(cq->ibv_cq.ibv_cq_ex)) { + EFA_WARN(FI_LOG_CQ, + "Receive error %s (%d) for " + "unsolicited write recv", + efa_strerror(prov_errno), + prov_errno); + efa_base_ep_write_eq_error( + base_ep, + to_fi_errno(prov_errno), + prov_errno); + break; + } + efa_cq_handle_error(base_ep, cq->ibv_cq.ibv_cq_ex, + to_fi_errno(prov_errno), + prov_errno, false); + break; + default: + EFA_WARN(FI_LOG_EP_CTRL, "Unhandled op code %d\n", opcode); + assert(0 && "Unhandled op code"); + } + break; + } + + efa_cq_construct_cq_entry(cq->ibv_cq.ibv_cq_ex, &cq_entry); + + switch (opcode) { + case IBV_WC_SEND: /* fall through */ + case IBV_WC_RDMA_WRITE: /* fall through */ + case IBV_WC_RDMA_READ: + efa_cq_handle_tx_completion(base_ep, cq->ibv_cq.ibv_cq_ex, &cq_entry); + efa_cntr_report_tx_completion(&base_ep->util_ep, cq_entry.flags); + break; + case IBV_WC_RECV: + 
efa_cq_handle_rx_completion(base_ep, cq->ibv_cq.ibv_cq_ex, &cq_entry); + efa_cntr_report_rx_completion(&base_ep->util_ep, cq_entry.flags); + break; + case IBV_WC_RECV_RDMA_WITH_IMM: + efa_cq_proc_ibv_recv_rdma_with_imm_completion( + base_ep, cq->ibv_cq.ibv_cq_ex); + efa_cntr_report_rx_completion(&base_ep->util_ep, cq_entry.flags); + break; + default: + EFA_WARN(FI_LOG_EP_CTRL, + "Unhandled cq type\n"); + assert(0 && "Unhandled cq type"); + } + + num_cqe++; + if (num_cqe == cqe_to_process) { + break; + } + + err = ibv_next_poll(cq->ibv_cq.ibv_cq_ex); + } + + if (err && err != ENOENT) { + err = err > 0 ? err : -err; + prov_errno = ibv_wc_read_vendor_err(cq->ibv_cq.ibv_cq_ex); + EFA_WARN(FI_LOG_CQ, + "Unexpected error when polling ibv cq, err: %s (%zd) " + "prov_errno: %s (%d)\n", + fi_strerror(err), err, efa_strerror(prov_errno), + prov_errno); + efa_show_help(prov_errno); + err_entry = (struct fi_cq_err_entry) { + .err = err, + .prov_errno = prov_errno, + .op_context = NULL, + }; + ofi_cq_write_error(&cq->util_cq, &err_entry); + } + + if (should_end_poll) + ibv_end_poll(cq->ibv_cq.ibv_cq_ex); +} + +static const char *efa_cq_strerror(struct fid_cq *cq_fid, + int prov_errno, + const void *err_data, + char *buf, size_t len) +{ + return err_data + ? (const char *) err_data + : efa_strerror(prov_errno); +} + +static struct fi_ops_cq efa_cq_ops = { + .size = sizeof(struct fi_ops_cq), + .read = ofi_cq_read, + .readfrom = ofi_cq_readfrom, + .readerr = ofi_cq_readerr, + .sread = fi_no_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = efa_cq_strerror +}; + +void efa_cq_progress(struct util_cq *cq) +{ + struct efa_cq *efa_cq = container_of(cq, struct efa_cq, util_cq); + + efa_cq_poll_ibv_cq(efa_env.efa_cq_read_size, &efa_cq->ibv_cq); +} + +static int efa_cq_close(fid_t fid) +{ + struct efa_cq *cq; + int ret; + + cq = container_of(fid, struct efa_cq, util_cq.cq_fid.fid); + + if (cq->ibv_cq.ibv_cq_ex) { + ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", + fi_strerror(-ret)); + return ret; + } + cq->ibv_cq.ibv_cq_ex = NULL; + } + + ret = ofi_cq_cleanup(&cq->util_cq); + if (ret) + return ret; + + free(cq); + + return 0; +} + +static struct fi_ops efa_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = efa_cq_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + + +int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context) +{ + struct efa_cq *cq; + struct efa_domain *efa_domain; + int err, retv; + + if (attr->wait_obj != FI_WAIT_NONE) + return -FI_ENOSYS; + + cq = calloc(1, sizeof(*cq)); + if (!cq) + return -FI_ENOMEM; + + err = ofi_cq_init(&efa_prov, domain_fid, attr, &cq->util_cq, + &efa_cq_progress, context); + if (err) { + EFA_WARN(FI_LOG_CQ, "Unable to create UTIL_CQ\n"); + goto err_free_cq; + } + + efa_domain = container_of(cq->util_cq.domain, struct efa_domain, + util_domain); + err = efa_cq_ibv_cq_ex_open(attr, efa_domain->device->ibv_ctx, + &cq->ibv_cq.ibv_cq_ex, + &cq->ibv_cq.ibv_cq_ex_type); + if (err) { + EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %s\n", fi_strerror(err)); + goto err_free_util_cq; + } + + *cq_fid = &cq->util_cq.cq_fid; + (*cq_fid)->fid.fclass = FI_CLASS_CQ; + (*cq_fid)->fid.context = context; + (*cq_fid)->fid.ops = &efa_cq_fi_ops; + (*cq_fid)->ops = &efa_cq_ops; + + return 0; + +err_free_util_cq: + retv = ofi_cq_cleanup(&cq->util_cq); + if (retv) + 
EFA_WARN(FI_LOG_CQ, "Unable to close util cq: %s\n", + fi_strerror(-retv)); +err_free_cq: + free(cq); + return err; +} diff --git a/prov/efa/src/efa_cq.h b/prov/efa/src/efa_cq.h index 18cf435023b..efdf2cb15db 100644 --- a/prov/efa/src/efa_cq.h +++ b/prov/efa/src/efa_cq.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +#ifndef _EFA_CQ_H +#define _EFA_CQ_H + #include "efa.h" enum ibv_cq_ex_type { @@ -18,6 +21,16 @@ struct efa_ibv_cq_poll_list_entry { struct efa_ibv_cq *cq; }; +struct efa_cq { + struct util_cq util_cq; + struct efa_ibv_cq ibv_cq; +}; + +/* + * Control header with completion data. CQ data length is static. + */ +#define EFA_CQ_DATA_SIZE (4) + static inline int efa_ibv_cq_poll_list_match(struct dlist_entry *entry, const void *cq) { @@ -131,7 +144,7 @@ static inline int efa_cq_ibv_cq_ex_open(struct fi_cq_attr *attr, }; #if HAVE_CAPS_UNSOLICITED_WRITE_RECV - if (efa_rdm_use_unsolicited_write_recv()) + if (efa_use_unsolicited_write_recv()) efadv_cq_init_attr.wc_flags |= EFADV_WC_EX_WITH_IS_UNSOLICITED; #endif @@ -171,3 +184,95 @@ static inline int efa_cq_ibv_cq_ex_open(struct fi_cq_attr *attr, &init_attr_ex, ibv_ctx, ibv_cq_ex, ibv_cq_ex_type); } #endif + +int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context); + +void efa_cq_progress(struct util_cq *cq); + +#if HAVE_CAPS_UNSOLICITED_WRITE_RECV +/** + * @brief Check whether a completion is unsolicited, i.e. does not consume a posted recv buffer + * + * @param ibv_cq_ex extended ibv cq + * @return true the wc is unsolicited and did not consume a recv buffer + * @return false the wc consumed a posted recv buffer + */ +static inline +bool efa_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) +{ + return efa_use_unsolicited_write_recv() && efadv_wc_is_unsolicited(efadv_cq_from_ibv_cq_ex(ibv_cq_ex)); +} + +#else + +static inline +bool efa_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) +{ + return false; +} + +#endif + +/** + * @brief Write the error message and return its byte length + * @param[in] ep EFA base endpoint + * @param[in] addr Remote peer fi_addr_t + * @param[in] prov_errno EFA provider error code (must be positive) + * @param[out] err_msg Buffer for the error message written by + * this function + * @param[out] buflen Pointer to the returned error data size + * @return A status code. 0 if the error data was written successfully, + * otherwise a negative FI error code.
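+ * Note: err_msg must point at a buffer of at least EFA_ERROR_MSG_BUFFER_LENGTH + * bytes; on success *buflen is set to the full EFA_ERROR_MSG_BUFFER_LENGTH + * rather than the string length.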
+ */ +static inline int efa_write_error_msg(struct efa_base_ep *ep, fi_addr_t addr, + int prov_errno, char *err_msg, + size_t *buflen) +{ + char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; + char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + const char *base_msg = efa_strerror(prov_errno); + size_t len = 0; + uint64_t local_host_id; + + *buflen = 0; + + len = sizeof(ep_addr_str); + efa_base_ep_raw_addr_str(ep, ep_addr_str, &len); + len = sizeof(peer_addr_str); + efa_base_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); + + local_host_id = efa_get_host_id(efa_env.host_id_file); + if (!local_host_id || + EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, + EFA_HOST_ID_STRING_LENGTH + 1, + "i-%017lx", local_host_id)) { + strcpy(local_host_id_str, "N/A"); + } + + /* efa-raw cannot get peer host id without a handshake */ + strcpy(peer_host_id_str, "N/A"); + + int ret = snprintf(err_msg, EFA_ERROR_MSG_BUFFER_LENGTH, + "%s My EFA addr: %s My host id: %s Peer EFA addr: " + "%s Peer host id: %s", + base_msg, ep_addr_str, local_host_id_str, + peer_addr_str, peer_host_id_str); + + if (ret < 0 || ret > EFA_ERROR_MSG_BUFFER_LENGTH - 1) { + return -FI_EINVAL; + } + + if (strlen(err_msg) >= EFA_ERROR_MSG_BUFFER_LENGTH) { + return -FI_ENOBUFS; + } + + *buflen = EFA_ERROR_MSG_BUFFER_LENGTH; + + return 0; +} + +void efa_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq); + +#endif /* end of _EFA_CQ_H*/ \ No newline at end of file diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 130cfc052a9..34de62cebac 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -11,8 +11,6 @@ #include "efa_cntr.h" #include "rdm/efa_rdm_cq.h" #include "rdm/efa_rdm_atomic.h" -#include "dgram/efa_dgram_ep.h" -#include "dgram/efa_dgram_cq.h" struct dlist_entry g_efa_domain_list; @@ -33,8 +31,8 @@ static struct fi_ops efa_ops_domain_fid = { static struct fi_ops_domain efa_ops_domain_dgram = { .size = sizeof(struct fi_ops_domain), .av_open = efa_av_open, - .cq_open = efa_dgram_cq_open, - .endpoint = efa_dgram_ep_open, + .cq_open = efa_cq_open, + .endpoint = efa_ep_open, .scalable_ep = fi_no_scalable_ep, .cntr_open = efa_cntr_open, .poll_open = fi_no_poll_open, @@ -290,19 +288,15 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, efa_domain->util_domain.domain_fid.ops = &efa_ops_domain_dgram; } - err = efa_fork_support_enable_if_requested(*domain_fid); +#ifndef _WIN32 + err = efa_fork_support_install_fork_handler(); if (err) { - ret = err; - EFA_WARN(FI_LOG_DOMAIN, "Failed to initialize fork support. err: %d\n", ret); - goto err_free; - } - - err = efa_domain_hmem_info_init_all(efa_domain); - if (err) { - ret = err; - EFA_WARN(FI_LOG_DOMAIN, "Failed to check hmem support status. 
err: %d\n", ret); - goto err_free; + EFA_WARN(FI_LOG_CORE, + "Unable to install fork handler: %s\n", + strerror(-err)); + return err; } +#endif dlist_insert_tail(&efa_domain->list_entry, &g_efa_domain_list); return 0; diff --git a/prov/efa/src/efa_domain.h b/prov/efa/src/efa_domain.h index 2eaf7fc06ed..6fa13e0bd8d 100644 --- a/prov/efa/src/efa_domain.h +++ b/prov/efa/src/efa_domain.h @@ -22,7 +22,6 @@ struct efa_domain { struct ofi_mr_cache *cache; struct efa_qp **qp_table; size_t qp_table_sz_m1; - struct efa_hmem_info hmem_info[OFI_HMEM_MAX]; size_t mtu_size; size_t addrlen; bool mr_local; diff --git a/prov/efa/src/efa_env.c b/prov/efa/src/efa_env.c index 484f544ddd6..d35c1cc9bde 100644 --- a/prov/efa/src/efa_env.c +++ b/prov/efa/src/efa_env.c @@ -34,11 +34,11 @@ struct efa_env efa_env = { .efa_max_gdrcopy_msg_size = 32768, .efa_read_segment_size = 1073741824, .efa_write_segment_size = 1073741824, /* need to confirm this constant. */ - .rnr_retry = 3, /* Setting this value to EFA_RNR_INFINITE_RETRY makes the firmware retry indefinitey */ .host_id_file = "/sys/devices/virtual/dmi/id/board_asset_tag", /* Available on EC2 instances and containers */ .use_sm2 = false, .huge_page_setting = EFA_ENV_HUGE_PAGE_UNSPEC, .use_unsolicited_write_recv = 1, + .internal_rx_refill_threshold = 8, }; /** @@ -132,6 +132,7 @@ void efa_env_param_get(void) &efa_mr_max_cached_size); fi_param_get_size_t(&efa_prov, "tx_size", &efa_env.tx_size); fi_param_get_size_t(&efa_prov, "rx_size", &efa_env.rx_size); + fi_param_get_size_t(&efa_prov, "internal_rx_refill_threshold", &efa_env.internal_rx_refill_threshold); fi_param_get_bool(&efa_prov, "rx_copy_unexp", &efa_env.rx_copy_unexp); fi_param_get_bool(&efa_prov, "rx_copy_ooo", @@ -216,7 +217,7 @@ void efa_env_define() fi_param_define(&efa_prov, "inter_max_gdrcopy_message_size", FI_PARAM_INT, "The maximum message size to use gdrcopy. If instance support gdrcopy, messages whose size is smaller than this value will be sent by eager/longcts protocol (Default 32768)."); fi_param_define(&efa_prov, "inter_min_read_write_size", FI_PARAM_INT, - "The mimimum message size for inter EFA write to use read write protocol. If firmware support RDMA read, and FI_EFA_USE_DEVICE_RDMA is 1, write requests whose size is larger than this value will use the read write protocol (Default 65536)."); + "The mimimum message size for inter EFA write to use read write protocol. If firmware support RDMA read, and FI_EFA_USE_DEVICE_RDMA is 1, write requests whose size is larger than this value will use the read write protocol (Default 65536). If the efa device supports RDMA write, device RDMA write will always be used."); fi_param_define(&efa_prov, "inter_read_segment_size", FI_PARAM_INT, "Calls to RDMA read is segmented using this value."); fi_param_define(&efa_prov, "fork_safe", FI_PARAM_BOOL, @@ -232,6 +233,8 @@ void efa_env_define() "will use huge page unless FI_EFA_FORK_SAFE is set to 1/on/true."); fi_param_define(&efa_prov, "use_unsolicited_write_recv", FI_PARAM_BOOL, "Use device's unsolicited write recv functionality when it's available. (Default: true)"); + fi_param_define(&efa_prov, "internal_rx_refill_threshold", FI_PARAM_SIZE_T, + "The threshold that EFA provider will refill the internal rx pkt pool. 
(Default: %zu)", efa_env.internal_rx_refill_threshold); } diff --git a/prov/efa/src/efa_env.h b/prov/efa/src/efa_env.h index 6fdd83a4a21..16286bbd4bc 100644 --- a/prov/efa/src/efa_env.h +++ b/prov/efa/src/efa_env.h @@ -6,12 +6,6 @@ #include "efa_prov.h" -/** - * Setting ibv_qp_attr.rnr_retry to this number when modifying qp - * to cause firmware to retry indefinitely. - */ -#define EFA_RNR_INFINITE_RETRY 7 - enum efa_env_huge_page_setting { EFA_ENV_HUGE_PAGE_UNSPEC, /**< user did not set FI_EFA_USE_HUGE_PAGE, provider will decide whether to use huge page*/ @@ -48,17 +42,6 @@ struct efa_env { size_t efa_max_gdrcopy_msg_size; size_t efa_read_segment_size; size_t efa_write_segment_size; - /* If first attempt to send a packet failed, - * this value controls how many times firmware - * retries the send before it report an RNR error - * (via rdma-core error cq entry). - * - * The valid number is from - * 0 (no retry) - * to - * EFA_RNR_INFINITY_RETRY (retry infinitely) - */ - int rnr_retry; /** * The absolute path to a file that contains an EC2 instance id-like string. * If host_id_file is provided, the program will attempt to read the @@ -79,6 +62,12 @@ struct efa_env { int use_sm2; enum efa_env_huge_page_setting huge_page_setting; int use_unsolicited_write_recv; + /** + * The threshold that EFA provider will refill the internal rx pkt pool. + * When the number of internal rx pkts to post is lower than this threshold, + * the refill will be skipped. + */ + size_t internal_rx_refill_threshold; }; extern struct efa_env efa_env; diff --git a/prov/efa/src/efa_ep.c b/prov/efa/src/efa_ep.c new file mode 100644 index 00000000000..8aa3268adf2 --- /dev/null +++ b/prov/efa/src/efa_ep.c @@ -0,0 +1,449 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include "config.h" +#include "efa.h" +#include "efa_av.h" +#include "efa_cq.h" + +#include + +extern struct fi_ops_msg efa_msg_ops; +extern struct fi_ops_rma efa_rma_ops; + +static int efa_ep_getopt(fid_t fid, int level, int optname, + void *optval, size_t *optlen) +{ + struct efa_base_ep *ep; + + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); + + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_EFA_RNR_RETRY: + if (*optlen < sizeof(size_t)) + return -FI_ETOOSMALL; + *(size_t *)optval = ep->rnr_retry; + *optlen = sizeof(size_t); + break; + /* p2p is required for efa direct ep */ + case FI_OPT_FI_HMEM_P2P: + if (*optlen < sizeof(int)) + return -FI_ETOOSMALL; + *(int *)optval = FI_HMEM_P2P_REQUIRED; + *optlen = sizeof(int); + break; + case FI_OPT_MAX_MSG_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->max_msg_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_MAX_RMA_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->max_rma_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_INJECT_MSG_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->inject_msg_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_INJECT_RMA_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->inject_rma_size; + *optlen = sizeof (size_t); + break; + /* Emulated read/write is NOT used for efa direct ep */ + case FI_OPT_EFA_EMULATED_READ: /* fall through */ + case FI_OPT_EFA_EMULATED_WRITE: + if (*optlen < sizeof(bool)) + return -FI_ETOOSMALL; + *(bool *)optval = false; + *optlen = sizeof(bool); + break; + default: + EFA_INFO(FI_LOG_EP_CTRL, "Unknown / unsupported endpoint option\n"); + return -FI_ENOPROTOOPT; + } + + return FI_SUCCESS; +} + +static int efa_ep_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) +{ + int ret, intval; + struct efa_base_ep *ep; + + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); + + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_EFA_RNR_RETRY: + if (optlen != sizeof(size_t)) + return -FI_EINVAL; + + /* + * Application is required to call to fi_setopt before EP + * enabled. If it's calling to fi_setopt after EP enabled, + * fail the call. 
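+ * + * For example, an application would typically do (illustrative sketch, + * not provider code): + * @code + * size_t rnr_retry = 0; // 0 means no RNR retry + * fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, + * &rnr_retry, sizeof rnr_retry); + * fi_enable(ep); + * @endcode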
+ * + * efa_ep->qp will be NULL before EP enabled, use it to check + * if the call to fi_setopt is before or after EP enabled for + * convenience, instead of calling ibv_query_qp + */ + if (ep->efa_qp_enabled) { + EFA_WARN(FI_LOG_EP_CTRL, + "The option FI_OPT_EFA_RNR_RETRY must be " + "set before the EP is enabled\n"); + return -FI_EINVAL; + } + + if (!efa_domain_support_rnr_retry_modify(ep->domain)) { + EFA_WARN(FI_LOG_EP_CTRL, + "RNR capability is not supported\n"); + return -FI_ENOSYS; + } + ep->rnr_retry = *(size_t *)optval; + break; + case FI_OPT_FI_HMEM_P2P: + if (optlen != sizeof(int)) + return -FI_EINVAL; + + intval = *(int *)optval; + + if (intval == FI_HMEM_P2P_DISABLED) { + EFA_WARN(FI_LOG_EP_CTRL, "p2p is required by implementation\n"); + return -FI_EOPNOTSUPP; + } + break; + case FI_OPT_MAX_MSG_SIZE: + EFA_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, ep->max_msg_size, (size_t) ep->domain->device->ibv_port_attr.max_msg_sz) + break; + case FI_OPT_MAX_RMA_SIZE: + EFA_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, ep->max_rma_size, (size_t) ep->domain->device->max_rdma_size) + break; + case FI_OPT_INJECT_MSG_SIZE: + EFA_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, ep->inject_msg_size, (size_t) ep->domain->device->efa_attr.inline_buf_size) + break; + case FI_OPT_INJECT_RMA_SIZE: + EFA_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, ep->inject_rma_size, (size_t) 0) + break; + /* no-op as the efa direct ep will not use the cuda api or shm in data transfer */ + case FI_OPT_CUDA_API_PERMITTED: /* fall through */ + case FI_OPT_SHARED_MEMORY_PERMITTED: + break; + /* no-op as the efa direct ep will always use rdma for rma operations in data transfer */ + case FI_OPT_EFA_USE_DEVICE_RDMA: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + if (!(*(bool *)optval) && (ep->info->caps & FI_RMA)) { + EFA_WARN(FI_LOG_EP_CTRL, "Device rdma is required for rma operations\n"); + return -FI_EOPNOTSUPP; + } + break; + case FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + if (*(bool *)optval) { + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(ep, IBV_WR_SEND); + if (ret) + return ret; + } + break; + case FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + if (*(bool *)optval) { + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(ep, IBV_WR_RDMA_WRITE); + if (ret) + return ret; + } + break; + default: + EFA_INFO(FI_LOG_EP_CTRL, "Unknown / unsupported endpoint option\n"); + return -FI_ENOPROTOOPT; + } + + return FI_SUCCESS; +} + +static struct fi_ops_ep efa_ep_base_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = fi_no_cancel, + .getopt = efa_ep_getopt, + .setopt = efa_ep_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +static int efa_ep_close(fid_t fid) +{ + struct efa_base_ep *ep; + int ret; + + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); + + /* We need to free the util_ep first to avoid race conditions + * with other threads progressing the cntr.
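+ * (the cntr progress path walks the ibv cq poll list, which still + * references this endpoint until it is removed below)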
*/ + efa_base_ep_close_util_ep(ep); + + efa_base_ep_remove_cntr_ibv_cq_poll_list(ep); + + ret = efa_base_ep_destruct(ep); + if (ret) { + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close base endpoint\n"); + } + + free(ep); + + return 0; +} + +static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct efa_base_ep *ep; + struct efa_cq *cq; + struct efa_av *av; + struct efa_domain *efa_domain; + struct util_eq *eq; + struct util_cntr *cntr; + int ret; + + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); + ret = ofi_ep_bind_valid(&efa_prov, bfid, flags); + if (ret) + return ret; + + switch (bfid->fclass) { + case FI_CLASS_CQ: + if (flags & FI_SELECTIVE_COMPLETION) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint cannot be bound with selective completion.\n"); + return -FI_EBADFLAGS; + } + + /* Must bind a CQ to either RECV or SEND completions */ + if (!(flags & (FI_RECV | FI_TRANSMIT))) + return -FI_EBADFLAGS; + + cq = container_of(bfid, struct efa_cq, util_cq.cq_fid); + efa_domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); + if (ep->domain != efa_domain) + return -FI_EINVAL; + + ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags); + if (ret) + return ret; + + break; + case FI_CLASS_AV: + av = container_of(bfid, struct efa_av, util_av.av_fid.fid); + /* Bind util provider endpoint and av */ + ret = ofi_ep_bind_av(&ep->util_ep, &av->util_av); + if (ret) + return ret; + + ret = efa_base_ep_bind_av(ep, av); + if (ret) + return ret; + break; + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct util_cntr, cntr_fid.fid); + + ret = ofi_ep_bind_cntr(&ep->util_ep, cntr, flags); + if (ret) + return ret; + break; + case FI_CLASS_EQ: + eq = container_of(bfid, struct util_eq, eq_fid.fid); + + ret = ofi_ep_bind_eq(&ep->util_ep, eq); + if (ret) + return ret; + break; + default: + EFA_WARN(FI_LOG_EP_CTRL, "invalid fid class\n"); + return -EINVAL; + } + + return 0; +} + +static int efa_ep_getflags(struct fid_ep *ep_fid, uint64_t *flags) +{ + struct efa_base_ep *ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_tx_attr *tx_attr = ep->info->tx_attr; + struct fi_rx_attr *rx_attr = ep->info->rx_attr; + + if ((*flags & FI_TRANSMIT) && (*flags & FI_RECV)) { + EFA_WARN(FI_LOG_EP_CTRL, "Both Tx/Rx flags cannot be specified\n"); + return -FI_EINVAL; + } else if (tx_attr && (*flags & FI_TRANSMIT)) { + *flags = tx_attr->op_flags; + } else if (rx_attr && (*flags & FI_RECV)) { + *flags = rx_attr->op_flags; + } else { + EFA_WARN(FI_LOG_EP_CTRL, "Tx/Rx flags not specified\n"); + return -FI_EINVAL; + } + return 0; +} + +static int efa_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) +{ + struct efa_base_ep *ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_tx_attr *tx_attr = ep->info->tx_attr; + struct fi_rx_attr *rx_attr = ep->info->rx_attr; + + if ((flags & FI_TRANSMIT) && (flags & FI_RECV)) { + EFA_WARN(FI_LOG_EP_CTRL, "Both Tx/Rx flags cannot be specified.\n"); + return -FI_EINVAL; + } else if (tx_attr && (flags & FI_TRANSMIT)) { + tx_attr->op_flags = flags; + tx_attr->op_flags &= ~FI_TRANSMIT; + } else if (rx_attr && (flags & FI_RECV)) { + rx_attr->op_flags = flags; + rx_attr->op_flags &= ~FI_RECV; + } else { + EFA_WARN(FI_LOG_EP_CTRL, "Tx/Rx flags not specified\n"); + return -FI_EINVAL; + } + + return 0; +} + +static int efa_ep_enable(struct fid_ep *ep_fid) +{ + struct efa_base_ep *ep; + int err; + + ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + + err = efa_base_ep_create_and_enable_qp(ep, 
false); + if (err) + return err; + + err = efa_base_ep_insert_cntr_ibv_cq_poll_list(ep); + if (err) + efa_base_ep_destruct_qp(ep); + + return err; +} + +static int efa_ep_control(struct fid *fid, int command, void *arg) +{ + struct fid_ep *ep_fid; + + switch (fid->fclass) { + case FI_CLASS_EP: + ep_fid = container_of(fid, struct fid_ep, fid); + switch (command) { + case FI_GETOPSFLAG: + return efa_ep_getflags(ep_fid, (uint64_t *)arg); + case FI_SETOPSFLAG: + return efa_ep_setflags(ep_fid, *(uint64_t *)arg); + case FI_ENABLE: + return efa_ep_enable(ep_fid); + default: + return -FI_ENOSYS; + } + break; + default: + return -FI_ENOSYS; + } +} + +static struct fi_ops efa_ep_ops = { + .size = sizeof(struct fi_ops), + .close = efa_ep_close, + .bind = efa_ep_bind, + .control = efa_ep_control, + .ops_open = fi_no_ops_open, +}; + +/** + * @brief progress engine for the EFA endpoint + * + * This function is now a no-op. + * + * @param[in] util_ep The util endpoint to progress + */ +static +void efa_ep_progress_no_op(struct util_ep *util_ep) +{ + return; +} + +static struct fi_ops_atomic efa_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = fi_no_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = fi_no_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = fi_no_atomic_readwritemsg, + .compwrite = fi_no_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; + +struct fi_ops_cm efa_ep_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = efa_base_ep_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, + .join = fi_no_join, +}; + +int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, + struct fid_ep **ep_fid, void *context) +{ + struct efa_base_ep *ep; + int ret; + + ep = calloc(1, sizeof(*ep)); + if (!ep) + return -FI_ENOMEM; + + ret = efa_base_ep_construct(ep, domain_fid, user_info, efa_ep_progress_no_op, context); + if (ret) + goto err_ep_destroy; + + *ep_fid = &ep->util_ep.ep_fid; + (*ep_fid)->fid.fclass = FI_CLASS_EP; + (*ep_fid)->fid.context = context; + (*ep_fid)->fid.ops = &efa_ep_ops; + (*ep_fid)->ops = &efa_ep_base_ops; + (*ep_fid)->msg = &efa_msg_ops; + (*ep_fid)->cm = &efa_ep_cm_ops; + (*ep_fid)->rma = &efa_rma_ops; + (*ep_fid)->atomic = &efa_atomic_ops; + + return 0; + +err_ep_destroy: + efa_base_ep_destruct(ep); + if (ep) + free(ep); + return ret; +} diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 2b61b2f0464..029c35d4a07 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -69,8 +69,9 @@ _(10, REMOTE_ERROR_RNR, Destination resource not ready (no work queue entries posted on receive queue)) \ _(11, REMOTE_ERROR_BAD_LENGTH, Remote scatter-gather list too short) \ _(12, REMOTE_ERROR_BAD_STATUS, Unexpected status returned by responder) \ - _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (detected locally)) \ - _(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) + _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (was previously responsive)) \ + _(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side 
(required for RDMA operations)) \ + _(15, LOCAL_ERROR_UNREACH_REMOTE, Unreachable remote (never received a response)) /** * @brief EFA provider proprietary error codes @@ -104,7 +105,9 @@ _(4121, DGRAM_CQ_READ, Error reading from DGRAM CQ) \ _(4122, SHM_INTERNAL_ERROR, SHM internal error) \ _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \ - _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) + _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \ + _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \ + _(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed)) /** @} */ @@ -148,20 +151,22 @@ static inline int to_fi_errno(enum efa_errno err) { case EFA_IO_COMP_STATUS_OK: return FI_SUCCESS; case EFA_IO_COMP_STATUS_FLUSHED: - return FI_EHOSTDOWN; + return FI_ECANCELED; case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR: case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH: case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY: case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP: case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS: return FI_EINVAL; - case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE: return FI_EHOSTUNREACH; case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH: case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH: return FI_EMSGSIZE; case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: + case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP: return FI_ECONNABORTED; case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN: case EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER: diff --git a/prov/efa/src/efa_fork_support.c b/prov/efa/src/efa_fork_support.c index 82db3505987..41ac57bd5f6 100644 --- a/prov/efa/src/efa_fork_support.c +++ b/prov/efa/src/efa_fork_support.c @@ -70,14 +70,13 @@ void efa_fork_support_request_initialize() * * This relies on internal behavior in rdma-core and is a temporary workaround. * - * @param domain_fid domain fid so we can register memory * @return 1 if fork support is enabled * 0 if not enabled * -FI_EINVAL/-FI_NOMEM on errors. */ -static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) +static int efa_fork_support_is_enabled() { - /* If ibv_is_fork_initialized is availble, check if the function + /* If ibv_is_fork_initialized is available, check if the function * can exit early. */ #if HAVE_IBV_IS_FORK_INITIALIZED == 1 @@ -86,17 +85,14 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) /* If fork support is ENABLED or UNNEEDED, return 1. 
*/ return fork_status != IBV_FORK_DISABLED; #else - struct efa_domain *efa_domain; struct ibv_mr *mr = NULL; char *buf = NULL; int ret=0, ret_init=0; long page_size; - efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid); - page_size = ofi_get_page_size(); if (page_size <= 0) { - EFA_WARN(FI_LOG_DOMAIN, "Unable to determine page size %ld\n", + EFA_WARN(FI_LOG_CORE, "Unable to determine page size %ld\n", page_size); return -FI_EINVAL; } @@ -105,8 +101,7 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) if (!buf) return -FI_ENOMEM; - - mr = ibv_reg_mr(efa_domain->ibv_pd, buf, page_size, 0); + mr = ibv_reg_mr(g_device_list[0].ibv_pd, buf, page_size, 0); if (mr == NULL) { ret = errno; goto out; @@ -125,14 +120,14 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) if(buf) free(buf); if(mr) ibv_dereg_mr(mr); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Unexpected error during ibv_reg_mr in " "efa_fork_support_is_enabled(): %s\n",strerror(ret)); return -FI_EINVAL; } - if (ret_init == 0) return 0; - if (ret_init == EINVAL) return 1; - EFA_WARN(FI_LOG_DOMAIN, + if (ret_init == 0) return 1; + if (ret_init == EINVAL) return 0; + EFA_WARN(FI_LOG_CORE, "Unexpected error during ibv_fork_init in " "efa_fork_support_is_enabled(): %s\n",strerror(ret_init)); return -FI_EINVAL; @@ -221,12 +216,10 @@ void efa_atfork_callback_flush_mr_cache() * library or process initiates a fork and we determined from previous logic * that we cannot support that. * - * @param domain_fid domain fid so we can check register memory during initialization. * @return error number if we failed to initialize, 0 otherwise */ -int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) +int efa_fork_support_enable_if_requested() { - static int fork_handler_installed = 0; int ret; int is_enabled; @@ -236,7 +229,7 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) if (g_efa_fork_status == EFA_FORK_SUPPORT_ON) { ret = -ibv_fork_init(); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Fork support requested but ibv_fork_init failed: %s\n", strerror(-ret)); return ret; @@ -249,7 +242,7 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) * this variable was set to ON during provider init. Huge pages for * bounce buffers will not be used if fork support is on. */ - ret = efa_fork_support_is_enabled(domain_fid); + ret = efa_fork_support_is_enabled(); if (ret < 0) return ret; is_enabled = ret; @@ -257,36 +250,12 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) g_efa_fork_status = EFA_FORK_SUPPORT_ON; if (g_efa_fork_status == EFA_FORK_SUPPORT_ON && getenv("RDMAV_HUGEPAGES_SAFE")) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Using libibverbs fork support and huge pages is not" " supported by the EFA provider.\n"); return -FI_EINVAL; } - /* - * It'd be better to install this during provider init (since that's - * only invoked once) but we need to do a memory registration for the - * fork check above. This can move to the provider init once that check - * is gone. 
- */ - if (!fork_handler_installed && g_efa_fork_status != EFA_FORK_SUPPORT_UNNEEDED) { - if (g_efa_fork_status == EFA_FORK_SUPPORT_OFF) { - ret = pthread_atfork(efa_atfork_callback_warn_and_abort, NULL, NULL); - } else { - assert(g_efa_fork_status == EFA_FORK_SUPPORT_ON); - ret = pthread_atfork(efa_atfork_callback_flush_mr_cache, NULL, NULL); - } - - if (ret) { - EFA_WARN(FI_LOG_DOMAIN, - "Unable to register atfork callback: %s\n", - strerror(-ret)); - return ret; - } - - fork_handler_installed = 1; - } - return 0; } @@ -296,13 +265,12 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) * * We check if fork is requested and return failure as fork is not supported on Windows * - * @param domain_fid domain unused * @return error number if fork is requested, 0 otherwise */ -int efa_fork_support_enable_if_requested(struct domain_fid* domain_fid) +int efa_fork_support_enable_if_requested() { if (g_efa_fork_status == EFA_FORK_SUPPORT_ON) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Using fork support is not supported by the EFA provider on Windows\n"); return -FI_EINVAL; } @@ -311,3 +279,36 @@ int efa_fork_support_enable_if_requested(struct domain_fid* domain_fid) #endif +/* @brief + * + * install a fork handler to ensure that we abort if another + * library or process initiates a fork and we determined from previous logic + * that we cannot support that. + * + * @return error number if we failed to install, 0 otherwise + */ +int efa_fork_support_install_fork_handler() +{ + static int fork_handler_installed = 0; + int ret; + + if (!fork_handler_installed && g_efa_fork_status != EFA_FORK_SUPPORT_UNNEEDED) { + if (g_efa_fork_status == EFA_FORK_SUPPORT_OFF) { + ret = pthread_atfork(efa_atfork_callback_warn_and_abort, NULL, NULL); + } else { + assert(g_efa_fork_status == EFA_FORK_SUPPORT_ON); + ret = pthread_atfork(efa_atfork_callback_flush_mr_cache, NULL, NULL); + } + + if (ret) { + EFA_WARN(FI_LOG_CORE, + "Unable to register atfork callback: %s\n", + strerror(-ret)); + return ret; + } + + fork_handler_installed = 1; + } + + return 0; +} diff --git a/prov/efa/src/efa_fork_support.h b/prov/efa/src/efa_fork_support.h index ef16c23d577..13e692c0fdf 100644 --- a/prov/efa/src/efa_fork_support.h +++ b/prov/efa/src/efa_fork_support.h @@ -17,8 +17,10 @@ enum efa_fork_support_status { }; extern enum efa_fork_support_status g_efa_fork_status; -int efa_fork_support_enable_if_requested(struct fid_domain *domain_fid); +int efa_fork_support_enable_if_requested(); void efa_fork_support_request_initialize(); +int efa_fork_support_install_fork_handler(); + #endif diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 15f2513bf79..18dba70ca2c 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -5,16 +5,18 @@ #include "efa_hmem.h" #include "rdm/efa_rdm_pkt_type.h" +struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX]; + #if HAVE_CUDA || HAVE_NEURON -static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_domain) { +static size_t efa_max_eager_msg_size_with_largest_header() { int mtu_size; - mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size; + mtu_size = g_device_list[0].rdm_info->ep_attr->max_msg_size; return mtu_size - efa_rdm_pkt_type_get_max_hdr_size(); } #else -static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_domain) { +static size_t efa_max_eager_msg_size_with_largest_header() { return 0; } #endif @@ -23,14 +25,13 @@ static size_t efa_max_eager_msg_size_with_largest_header(struct 
efa_domain *efa_ * @brief Initialize the various protocol thresholds tracked in efa_hmem_info * according to the given FI_HMEM interface. * - * @param[in,out] efa_domain Pointer to struct efa_domain * @param[in] iface The FI_HMEM interface to initialize * * @return 0 */ -static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_domain, enum fi_hmem_iface iface) +static int efa_domain_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface) { - struct efa_hmem_info *info = &efa_domain->hmem_info[iface]; + struct efa_hmem_info *info = &g_efa_hmem_info[iface]; size_t tmp_value; /* Fall back to FI_HMEM_SYSTEM initialization logic when p2p is @@ -53,13 +54,13 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ case FI_HMEM_CUDA: info->runt_size = EFA_DEFAULT_RUNT_SIZE; info->max_medium_msg_size = 0; - info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; - info->min_read_write_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; + info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header() + 1; + info->min_read_write_size = efa_max_eager_msg_size_with_largest_header() + 1; fi_param_get_size_t(&efa_prov, "runt_size", &info->runt_size); fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size); fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size); if (-FI_ENODATA != fi_param_get(&efa_prov, "inter_max_medium_message_size", &tmp_value)) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "The environment variable FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE was set, " "but EFA HMEM via Cuda API only supports eager and runting read protocols. " "The variable will not modify CUDA memory run config.\n"); @@ -68,13 +69,13 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ case FI_HMEM_NEURON: info->runt_size = EFA_NEURON_RUNT_SIZE; info->max_medium_msg_size = 0; - info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; - info->min_read_write_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; + info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header() + 1; + info->min_read_write_size = efa_max_eager_msg_size_with_largest_header() + 1; fi_param_get_size_t(&efa_prov, "runt_size", &info->runt_size); fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size); fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size); if (-FI_ENODATA != fi_param_get(&efa_prov, "inter_max_medium_message_size", &tmp_value)) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "The environment variable FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE was set, " "but EFA HMEM via Neuron API only supports eager and runting read protocols. 
" "The variable will not modify CUDA memory run config.\n"); @@ -89,7 +90,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ -FI_ENODATA != fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &tmp_value) || -FI_ENODATA != fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &tmp_value) || -FI_ENODATA != fi_param_get_size_t(&efa_prov, "runt_size", &tmp_value)) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "One or more of the following environment variable(s) were set: [" "FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE, " "FI_EFA_INTER_MIN_READ_MESSAGE_SIZE, " @@ -105,7 +106,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ return 0; } -static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info) { +static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info) { #if HAVE_CUDA cudaError_t cuda_ret; void *ptr = NULL; @@ -119,7 +120,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_i cuda_ret = ofi_cudaMalloc(&ptr, len); if (cuda_ret != cudaSuccess) { info->initialized = false; - EFA_WARN(FI_LOG_DOMAIN, "Failed to allocate CUDA buffer: %s\n", + EFA_WARN(FI_LOG_CORE, "Failed to allocate CUDA buffer: %s\n", ofi_cudaGetErrorString(cuda_ret)); return; } @@ -128,14 +129,15 @@ static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_i if (ret == FI_SUCCESS) { ibv_mr = ibv_reg_dmabuf_mr(g_device_list[0].ibv_pd, dmabuf_offset, len, (uint64_t)ptr, dmabuf_fd, ibv_access); + (void)cuda_put_dmabuf_fd(dmabuf_fd); if (!ibv_mr) { - EFA_INFO(FI_LOG_DOMAIN, + EFA_INFO(FI_LOG_CORE, "Unable to register CUDA device buffer via dmabuf: %s. " "Fall back to ibv_reg_mr\n", fi_strerror(-errno)); ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); } } else { - EFA_INFO(FI_LOG_DOMAIN, + EFA_INFO(FI_LOG_CORE, "Unable to retrieve dmabuf fd of CUDA device buffer: %d. 
" "Fall back to ibv_reg_mr\n", ret); ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); @@ -146,7 +148,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_i if (!ibv_mr) { info->p2p_supported_by_device = false; - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to register CUDA buffer with the EFA device, FI_HMEM transfers that require peer to peer support will fail.\n"); ofi_cudaFree(ptr); return; @@ -155,7 +157,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_i ret = ibv_dereg_mr(ibv_mr); ofi_cudaFree(ptr); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to deregister CUDA buffer: %s\n", fi_strerror(-ret)); return; @@ -168,7 +170,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_i return; } -static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) { +static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) { #if HAVE_NEURON struct ibv_mr *ibv_mr = NULL; int ibv_access = IBV_ACCESS_LOCAL_WRITE; @@ -191,7 +193,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem */ if (!ptr) { info->initialized = false; - EFA_INFO(FI_LOG_DOMAIN, "Cannot allocate Neuron buffer\n"); + EFA_INFO(FI_LOG_CORE, "Cannot allocate Neuron buffer\n"); return; } @@ -214,7 +216,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem if (!ibv_mr) { info->p2p_supported_by_device = false; /* We do not expect to support Neuron on non p2p systems */ - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to register Neuron buffer with the EFA device, " "FI_HMEM transfers that require peer to peer support will fail.\n"); neuron_free(&handle); @@ -224,7 +226,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem ret = ibv_dereg_mr(ibv_mr); neuron_free(&handle); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to deregister Neuron buffer: %s\n", fi_strerror(-ret)); return; @@ -239,64 +241,49 @@ static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem /** * @brief Initialize the efa_hmem_info state for iface * - * @param[in,out] efa_domain Pointer to struct efa_domain * @param[in] iface HMEM interface */ static void -efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_iface iface) +efa_hmem_info_init_iface(enum fi_hmem_iface iface) { - struct efa_hmem_info *info = &efa_domain->hmem_info[iface]; + struct efa_hmem_info *info = &g_efa_hmem_info[iface]; if (!ofi_hmem_is_initialized(iface)) { - EFA_INFO(FI_LOG_DOMAIN, "%s is not initialized\n", + EFA_INFO(FI_LOG_CORE, "%s is not initialized\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); return; } if ((iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_NEURON) && !efa_device_support_rdma_read()) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "No EFA RDMA read support, transfers using %s will fail.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); return; } info->initialized = true; - info->p2p_disabled_by_user = (iface == FI_HMEM_SYSTEM) ? 
false : ofi_hmem_p2p_disabled(); if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM) { info->p2p_supported_by_device = true; - } else if (info->p2p_disabled_by_user) { + } else if (ofi_hmem_p2p_disabled()) { info->p2p_supported_by_device = false; } else { if (iface == FI_HMEM_CUDA) - efa_domain_hmem_info_check_p2p_support_cuda(info); + efa_hmem_info_check_p2p_support_cuda(info); if (iface == FI_HMEM_NEURON) - efa_domain_hmem_info_check_p2p_support_neuron(info); + efa_hmem_info_check_p2p_support_neuron(info); if (!info->p2p_supported_by_device) - EFA_INFO(FI_LOG_DOMAIN, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + EFA_INFO(FI_LOG_CORE, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); } - info->p2p_required_by_impl = true; - /* If user is using libfabric API 1.18 or later, by default EFA - * provider is permitted to use CUDA library to support CUDA - * memory, therefore p2p is not required. - */ - if (iface == FI_HMEM_CUDA && - FI_VERSION_GE(efa_domain->util_domain.fabric->fabric_fid.api_version, FI_VERSION(1, 18))) - info->p2p_required_by_impl = !hmem_ops[iface].initialized; - if (iface == FI_HMEM_SYSTEM) - info->p2p_required_by_impl = false; - - efa_domain_hmem_info_init_protocol_thresholds(efa_domain, iface); + efa_domain_hmem_info_init_protocol_thresholds(iface); } /** * @brief Validate an FI_OPT_FI_HMEM_P2P (FI_OPT_ENDPOINT) option for a * specified HMEM interface. - * Also update hmem_info[iface]->p2p_disabled_by_user accordingly. * - * @param[in,out] domain The efa_domain struct which contains an efa_hmem_info array * @param[in] iface The fi_hmem_iface enum of the FI_HMEM interface to validate * @param[in] p2p_opt The P2P option to validate * @@ -305,9 +292,9 @@ efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_ifac * -FI_ENODATA if the given HMEM interface was not initialized * -FI_EINVAL if p2p_opt is not a valid FI_OPT_FI_HMEM_P2P option */ -int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem_iface iface, int p2p_opt) +int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version) { - struct efa_hmem_info *info = &efa_domain->hmem_info[iface]; + struct efa_hmem_info *info = &g_efa_hmem_info[iface]; if (OFI_UNLIKELY(!info->initialized)) return -FI_ENODATA; @@ -317,7 +304,6 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem if (OFI_UNLIKELY(ofi_hmem_p2p_disabled()) || !info->p2p_supported_by_device) return -FI_EOPNOTSUPP; - info->p2p_disabled_by_user = false; return 0; /* * According to fi_setopt() document: @@ -334,14 +320,13 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem if (OFI_UNLIKELY(ofi_hmem_p2p_disabled())) return -FI_EOPNOTSUPP; - info->p2p_disabled_by_user = false; return 0; case FI_HMEM_P2P_DISABLED: - if (info->p2p_required_by_impl) + /* return -FI_EOPNOTSUPP if p2p is required by implementation */ + if (iface != FI_HMEM_CUDA || FI_VERSION_LT(api_version, FI_VERSION(1, 18))) return -FI_EOPNOTSUPP; - info->p2p_disabled_by_user = true; return 0; } @@ -354,12 +339,10 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem * struct will be used to determine which efa transfer * protocol should be selected. 
* - * @param[in,out] efa_domain Pointer to struct efa_domain to be initialized - * * @return 0 on success * negative libfabric error code on an unexpected error */ -int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain) +int efa_hmem_info_initialize() { int ret = 0, i = 0; @@ -367,10 +350,10 @@ int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain) return -FI_ENODEV; } - memset(efa_domain->hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info)); + memset(g_efa_hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info)); EFA_HMEM_IFACE_FOREACH(i) { - efa_domain_hmem_info_init_iface(efa_domain, efa_hmem_ifaces[i]); + efa_hmem_info_init_iface(efa_hmem_ifaces[i]); } return ret; diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index e18c0e4c534..858b7035883 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -23,8 +23,6 @@ static const enum fi_hmem_iface efa_hmem_ifaces[] = { struct efa_hmem_info { bool initialized; /* do we support it at all */ - bool p2p_disabled_by_user; /* Did the user disable p2p via FI_OPT_FI_HMEM_P2P? */ - bool p2p_required_by_impl; /* Is p2p required for this interface? */ bool p2p_supported_by_device; /* do we support p2p with this device */ size_t max_medium_msg_size; @@ -33,10 +31,12 @@ struct efa_hmem_info { size_t min_read_write_size; }; +extern struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX]; + struct efa_domain; -int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem_iface iface, int p2p_opt); -int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain); +int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version); +int efa_hmem_info_initialize(); /** * @brief Copy data from a hmem device to a system buffer diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index a9a37a4cd0b..1e1f803b777 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -184,12 +184,6 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, { int err; struct iovec mr_iov = {0}; - - if (flags & FI_MR_DMABUF) - ofi_mr_get_iov_from_dmabuf(&mr_iov, attr->dmabuf, 1); - else - mr_iov = *attr->mr_iov; - efa_mr->peer.flags = flags; if (attr->iface == FI_HMEM_SYSTEM) { @@ -198,7 +192,7 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, } if (efa_mr->domain->util_domain.info_domain_caps & FI_HMEM) { - if (efa_mr->domain->hmem_info[attr->iface].initialized) { + if (g_efa_hmem_info[attr->iface].initialized) { efa_mr->peer.iface = attr->iface; } else { EFA_WARN(FI_LOG_MR, @@ -227,7 +221,8 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, efa_mr->needs_sync = true; efa_mr->peer.device.cuda = attr->device.cuda; - if (cuda_is_gdrcopy_enabled()) { + if (!(flags & FI_MR_DMABUF) && cuda_is_gdrcopy_enabled()) { + mr_iov = *attr->mr_iov; err = ofi_hmem_dev_register(FI_HMEM_CUDA, mr_iov.iov_base, mr_iov.iov_len, (uint64_t *)&efa_mr->peer.hmem_data); efa_mr->peer.flags |= OFI_HMEM_DATA_DEV_REG_HANDLE; @@ -529,61 +524,32 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr ); /* - * TODO: remove the synapseai and neuron blocks by onboarding the - * ofi_hmem_get_dmabuf_fd API. + * When FI_MR_DMABUF flag is not set, + * only do ibv_reg_mr. + * The only exception is synapseai, + * because dmabuf is the only way + * to register Gaudi device buffer and + * it was implemented before the FI_MR_DMABUF API. 
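+ * For every other iface, dmabuf registration is attempted only when + * the application explicitly passes the FI_MR_DMABUF flag.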
*/ -#if HAVE_SYNAPSEAI if (efa_mr_is_synapseai(efa_mr)) { int dmabuf_fd; uint64_t offset; int ret; - ret = synapseai_get_dmabuf_fd(mr_attr->mr_iov->iov_base, - (uint64_t) mr_attr->mr_iov->iov_len, - &dmabuf_fd, &offset); + ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_SYNAPSEAI, + mr_attr->mr_iov->iov_base, + (uint64_t) mr_attr->mr_iov->iov_len, + &dmabuf_fd, &offset); if (ret != FI_SUCCESS) { EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n"); return NULL; } - return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset, - mr_attr->mr_iov->iov_len, - (uint64_t)mr_attr->mr_iov->iov_base, - dmabuf_fd, access); - } -#endif -#if HAVE_NEURON - if (efa_mr_is_neuron(efa_mr)) { - int dmabuf_fd; - uint64_t offset; - int ret; - - ret = neuron_get_dmabuf_fd( - mr_attr->mr_iov->iov_base, - mr_attr->mr_iov->iov_len, - &dmabuf_fd, - &offset); - - if (ret == FI_SUCCESS) { - /* Success => invoke ibv_reg_dmabuf_mr */ - return efa_mr_reg_ibv_dmabuf_mr( - efa_mr->domain->ibv_pd, 0, + return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset, mr_attr->mr_iov->iov_len, (uint64_t)mr_attr->mr_iov->iov_base, dmabuf_fd, access); - } else if (ret == -FI_EOPNOTSUPP) { - /* Protocol not availabe => fallback */ - EFA_INFO(FI_LOG_MR, - "Unable to get dmabuf fd for Neuron device buffer, " - "Fall back to ibv_reg_mr\n"); - return ibv_reg_mr( - efa_mr->domain->ibv_pd, - (void *)mr_attr->mr_iov->iov_base, - mr_attr->mr_iov->iov_len, access); - } - return NULL; } -#endif return ibv_reg_mr(efa_mr->domain->ibv_pd, (void *)mr_attr->mr_iov->iov_base, @@ -847,7 +813,7 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, const void *at * For FI_HMEM_CUDA iface when p2p is unavailable, skip ibv_reg_mr() and * generate proprietary mr_fid key. */ - if (mr_attr.iface == FI_HMEM_CUDA && !efa_mr->domain->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) { + if (mr_attr.iface == FI_HMEM_CUDA && !g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) { efa_mr->mr_fid.key = efa_mr_cuda_non_p2p_keygen(); } else { efa_mr->ibv_mr = efa_mr_reg_ibv_mr(efa_mr, &mr_attr, fi_ibv_access, flags); diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c new file mode 100644 index 00000000000..5d5768c8ff1 --- /dev/null +++ b/prov/efa/src/efa_msg.c @@ -0,0 +1,380 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + +#include "config.h" + + +#include "ofi.h" +#include "ofi_enosys.h" +#include "ofi_iov.h" + +#include "efa.h" +#include "efa_av.h" + +#include "efa_tp.h" + +#define EFA_SETUP_IOV(iov, buf, len) \ + do { \ + iov.iov_base = (void *)buf; \ + iov.iov_len = (size_t)len; \ + } while (0) + +#define EFA_SETUP_MSG(msg, iov, _desc, count, _addr, _context, _data) \ + do { \ + msg.msg_iov = (const struct iovec *)iov; \ + msg.desc = (void **)_desc; \ + msg.iov_count = (size_t)count; \ + msg.addr = (fi_addr_t)_addr; \ + msg.context = (void *)_context; \ + msg.data = (uint32_t)_data; \ + } while (0) + +#ifndef EFA_MSG_DUMP +static inline void dump_msg(const struct fi_msg *msg, const char *context) {} +#else +#define DUMP_IOV(i, iov, desc) \ + EFA_DBG(FI_LOG_EP_DATA, \ + "\t{ iov[%d] = { base = %p, buff = \"%s\", len = %zu }, desc = %p },\n", \ + i, iov.iov_base, (char *)iov.iov_base, iov.iov_len, (desc ? 
desc[i] : NULL)) + +static inline void dump_msg(const struct fi_msg *msg, const char *context) +{ + int i; + + EFA_DBG(FI_LOG_EP_DATA, "%s: { data = %u, addr = %" PRIu64 ", iov_count = %zu, [\n", + context, (unsigned)msg->data, msg->addr, msg->iov_count); + for (i = 0; i < msg->iov_count; ++i) + DUMP_IOV(i, msg->msg_iov[i], msg->desc); + EFA_DBG(FI_LOG_EP_DATA, " ] }\n"); +} +#endif /* EFA_MSG_DUMP */ + +/** + * @brief post receive buffer to EFA device via ibv_post_recv + * + * @param[in] base_ep endpoint + * @param[in] msg libfabric message + * @param[in] flags libfabric flags, currently only FI_MORE is supported. + * @return On success, return 0 + * On failure, return negative libfabric error code + */ +static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_mr *efa_mr; + struct efa_qp *qp = base_ep->qp; + struct ibv_recv_wr *bad_wr; + struct ibv_recv_wr *wr; + uintptr_t addr; + ssize_t err, post_recv_err; + size_t i, wr_index; + + efa_tracepoint(recv_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + + ofi_genlock_lock(&base_ep->util_ep.lock); + wr_index = base_ep->recv_wr_index; + if (wr_index >= base_ep->info->rx_attr->size) { + EFA_INFO(FI_LOG_EP_DATA, + "recv_wr_index exceeds the rx limit, " + "recv_wr_index = %zu, rx size = %zu\n", + wr_index, base_ep->info->rx_attr->size); + err = -FI_EAGAIN; + goto out_err; + } + + memset(&base_ep->efa_recv_wr_vec[wr_index], 0, sizeof(base_ep->efa_recv_wr_vec[wr_index])); + dump_msg(msg, "recv"); + + assert(msg->iov_count <= base_ep->info->rx_attr->iov_limit); + + if (qp->ibv_qp->qp_type == IBV_QPT_UD && + OFI_UNLIKELY(msg->msg_iov[0].iov_len < + base_ep->info->ep_attr->msg_prefix_size)) { + EFA_WARN(FI_LOG_EP_DATA, + "prefix not present on first iov, " + "iov_len[%zu]\n", + msg->msg_iov[0].iov_len); + err = -EINVAL; + goto out_err; + } + + wr = &base_ep->efa_recv_wr_vec[wr_index].wr; + wr->num_sge = msg->iov_count; + wr->sg_list = base_ep->efa_recv_wr_vec[wr_index].sge; + wr->wr_id = (uintptr_t) efa_fill_context(msg->context, msg->addr, flags, + FI_RECV | FI_MSG); + + for (i = 0; i < msg->iov_count; i++) { + addr = (uintptr_t)msg->msg_iov[i].iov_base; + + /* Set RX buffer desc from SGE */ + wr->sg_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc && msg->desc[i]); + efa_mr = (struct efa_mr *)msg->desc[i]; + wr->sg_list[i].lkey = efa_mr->ibv_mr->lkey; + wr->sg_list[i].addr = addr; + } + + base_ep->efa_recv_wr_vec[wr_index].wr.next = NULL; + if (wr_index > 0) + base_ep->efa_recv_wr_vec[wr_index - 1].wr.next = &base_ep->efa_recv_wr_vec[wr_index].wr; + + base_ep->recv_wr_index++; + + if (flags & FI_MORE) { + err = 0; + goto out; + } + + efa_tracepoint(post_recv, wr->wr_id, (uintptr_t)msg->context); + + err = ibv_post_recv(qp->ibv_qp, &base_ep->efa_recv_wr_vec[0].wr, &bad_wr); + if (OFI_UNLIKELY(err)) { + /* On failure, ibv_post_recv() returns a positive errno. + * Meanwhile, this function returns a negative errno. + * So, we do the conversion here. + */ + err = (err == ENOMEM) ?
-FI_EAGAIN : -err; + } + + base_ep->recv_wr_index = 0; + +out: + ofi_genlock_unlock(&base_ep->util_ep.lock); + + return err; + +out_err: + if (base_ep->recv_wr_index > 0) { + post_recv_err = ibv_post_recv(qp->ibv_qp, &base_ep->efa_recv_wr_vec[0].wr, &bad_wr); + if (OFI_UNLIKELY(post_recv_err)) { + EFA_WARN(FI_LOG_EP_DATA, + "Encountered error %ld from ibv_post_recv on the error handling path\n", + post_recv_err); + } + } + + base_ep->recv_wr_index = 0; + + ofi_genlock_unlock(&base_ep->util_ep.lock); + + return err; +} + +static ssize_t efa_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + + return efa_post_recv(base_ep, msg, flags | base_ep->util_ep.rx_msg_flags); +} + +static ssize_t efa_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct iovec iov; + struct fi_msg msg; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, &desc, 1, src_addr, context, 0); + + return efa_post_recv(base_ep, &msg, efa_rx_flags(base_ep)); +} + +static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, + size_t count, fi_addr_t src_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + + EFA_SETUP_MSG(msg, iov, desc, count, src_addr, context, 0); + + return efa_post_recv(base_ep, &msg, efa_rx_flags(base_ep)); +} + +static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_qp *qp = base_ep->qp; + struct efa_conn *conn; + struct ibv_sge sg_list[2]; /* efa device supports up to 2 iovs */ + struct ibv_data_buf inline_data_list[2]; + size_t len, i; + int ret = 0; + + efa_tracepoint(send_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + + dump_msg(msg, "send"); + + conn = efa_av_addr_to_conn(base_ep->av, msg->addr); + assert(conn && conn->ep_addr); + + assert(msg->iov_count <= base_ep->info->tx_attr->iov_limit); + + len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); + + if (qp->ibv_qp->qp_type == IBV_QPT_UD) { + assert(msg->msg_iov[0].iov_len >= base_ep->info->ep_attr->msg_prefix_size); + len -= base_ep->info->ep_attr->msg_prefix_size; + } + + assert(len <= base_ep->info->ep_attr->max_msg_size); + + ofi_genlock_lock(&base_ep->util_ep.lock); + if (!base_ep->is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + base_ep->is_wr_started = true; + } + + qp->ibv_qp_ex->wr_id = (uintptr_t) efa_fill_context( + msg->context, msg->addr, flags, FI_SEND | FI_MSG); + + if (flags & FI_REMOTE_CQ_DATA) { + ibv_wr_send_imm(qp->ibv_qp_ex, msg->data); + } else { + ibv_wr_send(qp->ibv_qp_ex); + } + + if (len <= base_ep->domain->device->efa_attr.inline_buf_size && + (!msg->desc || !efa_mr_is_hmem(msg->desc[0]))) { + for (i = 0; i < msg->iov_count; i++) { + inline_data_list[i].addr = msg->msg_iov[i].iov_base; + inline_data_list[i].length = msg->msg_iov[i].iov_len; + + /* Whole prefix must be on the first sgl for dgram */ + if (!i && qp->ibv_qp->qp_type == IBV_QPT_UD) { + inline_data_list[i].addr = (char*)inline_data_list[i].addr + base_ep->info->ep_attr->msg_prefix_size; + inline_data_list[i].length -= base_ep->info->ep_attr->msg_prefix_size; + } + } + ibv_wr_set_inline_data_list(qp->ibv_qp_ex, msg->iov_count, inline_data_list); + } else { + for (i = 0; i < msg->iov_count; i++) {
+ /* Set TX buffer desc from SGE */ + assert (msg->desc && msg->desc[i]); + sg_list[i].lkey = ((struct efa_mr *)msg->desc[i])->ibv_mr->lkey; + sg_list[i].addr = (uintptr_t)msg->msg_iov[i].iov_base; + sg_list[i].length = msg->msg_iov[i].iov_len; + + /* Whole prefix must be on the first sgl for dgram */ + if (!i && qp->ibv_qp->qp_type == IBV_QPT_UD) { + sg_list[i].addr += base_ep->info->ep_attr->msg_prefix_size; + sg_list[i].length -= base_ep->info->ep_attr->msg_prefix_size; + } + } + ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sg_list); + } + + ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, + conn->ep_addr->qkey); + + efa_tracepoint(post_send, qp->ibv_qp_ex->wr_id, (uintptr_t)msg->context); + + if (!(flags & FI_MORE)) { + ret = ibv_wr_complete(qp->ibv_qp_ex); + base_ep->is_wr_started = false; + } + + ofi_genlock_unlock(&base_ep->util_ep.lock); + return ret; +} + +static ssize_t efa_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + + return efa_post_send(base_ep, msg, flags | base_ep->util_ep.tx_msg_flags); +} + +static ssize_t efa_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, 0); + + return efa_post_send(base_ep, &msg, efa_tx_flags(base_ep)); +} + +static ssize_t efa_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, data); + + return efa_post_send(base_ep, &msg, efa_tx_flags(base_ep) | FI_REMOTE_CQ_DATA); +} + +static ssize_t efa_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + + EFA_SETUP_MSG(msg, iov, desc, count, dest_addr, context, 0); + + return efa_post_send(base_ep, &msg, efa_tx_flags(base_ep)); +} + +static ssize_t efa_ep_msg_inject(struct fid_ep *ep_fid, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + + assert(len <= base_ep->domain->device->efa_attr.inline_buf_size); + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, NULL, 1, dest_addr, NULL, 0); + + return efa_post_send(base_ep, &msg, FI_INJECT); +} + +static ssize_t efa_ep_msg_injectdata(struct fid_ep *ep_fid, const void *buf, + size_t len, uint64_t data, + fi_addr_t dest_addr) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + + assert(len <= base_ep->domain->device->efa_attr.inline_buf_size); + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, NULL, 1, dest_addr, NULL, data); + + return efa_post_send(base_ep, &msg, FI_REMOTE_CQ_DATA | FI_INJECT); +} + +struct fi_ops_msg efa_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = efa_ep_recv, + .recvv = efa_ep_recvv, + 
.recvmsg = efa_ep_recvmsg, + .send = efa_ep_send, + .sendv = efa_ep_sendv, + .sendmsg = efa_ep_sendmsg, + .senddata = efa_ep_senddata, + .inject = efa_ep_msg_inject, + .injectdata = efa_ep_msg_injectdata, +}; + +struct fi_ops_msg efa_dgram_ep_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = efa_ep_recv, + .recvv = efa_ep_recvv, + .recvmsg = efa_ep_recvmsg, + .send = efa_ep_send, + .sendv = efa_ep_sendv, + .sendmsg = efa_ep_sendmsg, + .senddata = efa_ep_senddata, + .inject = fi_no_msg_inject, + .injectdata = fi_no_msg_injectdata, +}; diff --git a/prov/efa/src/efa_prov.c b/prov/efa/src/efa_prov.c index 85a71aa2c41..1f805c6742b 100644 --- a/prov/efa/src/efa_prov.c +++ b/prov/efa/src/efa_prov.c @@ -164,6 +164,15 @@ EFA_INI if (err) goto err_free; + err = efa_fork_support_enable_if_requested(); + if (err) { + goto err_free; + } + + err = efa_hmem_info_initialize(); + if (err) + goto err_free; + dlist_init(&g_efa_domain_list); return &efa_prov; diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index bddc965d53a..2f16f23816f 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -75,7 +75,7 @@ const struct fi_domain_attr efa_domain_attr = { .resource_mgmt = FI_RM_DISABLED, .mr_mode = OFI_MR_BASIC_MAP | FI_MR_LOCAL | OFI_MR_BASIC, .mr_key_size = sizeof_field(struct ibv_sge, lkey), - .cq_data_size = 0, + .cq_data_size = EFA_CQ_DATA_SIZE, .tx_ctx_cnt = 1024, .rx_ctx_cnt = 1024, .max_ep_tx_ctx = 1, @@ -145,7 +145,9 @@ const struct fi_ep_attr efa_ep_attr = { .protocol = FI_PROTO_EFA, .protocol_version = 1, .msg_prefix_size = 0, + .max_order_raw_size = 0, .max_order_war_size = 0, + .max_order_waw_size = 0, .mem_tag_format = 0, .tx_ctx_cnt = 1, .rx_ctx_cnt = 1, @@ -182,13 +184,14 @@ void efa_prov_info_set_ep_attr(struct fi_info *prov_info, * a completion, therefore there is no way for dgram endpoint * to implement FI_INJECT. Because FI_INJECT is not an optional * feature, we had to set inject_size to 0. + * + * TODO: + * Remove this after implementing cq read for efa-raw */ prov_info->tx_attr->inject_size = 0; } prov_info->ep_attr->max_msg_size = device->ibv_port_attr.max_msg_sz; - prov_info->ep_attr->max_order_raw_size = device->ibv_port_attr.max_msg_sz; - prov_info->ep_attr->max_order_waw_size = device->ibv_port_attr.max_msg_sz; } /** @@ -553,10 +556,6 @@ int efa_prov_info_alloc_for_rdm(struct fi_info **prov_info_rdm_ptr, * buffer. EFA RDM endpoint does not have this requirement, hence unset the flag */ prov_info_rdm->domain_attr->mr_mode &= ~FI_MR_LOCAL; - - /* EFA RDM endpoint support writing CQ data by put it in packet header - */ - prov_info_rdm->domain_attr->cq_data_size = EFA_RDM_CQ_DATA_SIZE; } /* update ep_attr */ @@ -579,6 +578,8 @@ int efa_prov_info_alloc_for_rdm(struct fi_info **prov_info_rdm_ptr, - device->rdm_info->src_addrlen - EFA_RDM_IOV_LIMIT * sizeof(struct fi_rma_iov); prov_info_rdm->ep_attr->max_order_raw_size = max_atomic_size; + prov_info_rdm->ep_attr->max_order_war_size = max_atomic_size; + prov_info_rdm->ep_attr->max_order_waw_size = max_atomic_size; } /* update tx_attr */ diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c new file mode 100644 index 00000000000..da33b44350f --- /dev/null +++ b/prov/efa/src/efa_rma.c @@ -0,0 +1,372 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include +#include +#include +#include +#include "efa.h" +#include "efa_av.h" + +#define EFA_SETUP_IOV(iov, buf, len) \ + do { \ + iov.iov_base = (void *) buf; \ + iov.iov_len = (size_t) len; \ + } while (0) + +#define EFA_SETUP_RMA_IOV(rma_iov, _addr, _len, _key) \ + do { \ + rma_iov.addr = (uint64_t) _addr; \ + rma_iov.len = (size_t) _len; \ + rma_iov.key = (uint64_t) _key; \ + } while (0) + +#define EFA_SETUP_MSG_RMA(msg, iov, _desc, count, _addr, _rma_iov, \ + _rma_iov_count, _context, _data) \ + do { \ + msg.msg_iov = (const struct iovec *) iov; \ + msg.desc = (void **) _desc; \ + msg.iov_count = (size_t) count; \ + msg.addr = (fi_addr_t) _addr; \ + msg.rma_iov = (const struct fi_rma_iov *) _rma_iov; \ + msg.rma_iov_count = (size_t) _rma_iov_count; \ + msg.context = (void *) _context; \ + msg.data = (uint32_t) _data; \ + } while (0) + +/** + * @brief check whether the endpoint was configured with FI_RMA capability + * @return -FI_EOPNOTSUPP if FI_RMA wasn't requested, 0 if it was. + */ +static inline int efa_rma_check_cap(struct efa_base_ep *base_ep) { + if ((base_ep->info->caps & FI_RMA) == FI_RMA) + return 0; + EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_RMA capability, which was not requested.\n"); + return -FI_EOPNOTSUPP; +} + +/* + * efa_rma_post_read() will post a read request. + * + * Input: + * base_ep: endpoint + * msg: read operation information + * flags: currently no flags are supported + * + * On success, return 0. + * If the read failed, return the error code of the read operation. + */ +static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + struct efa_qp *qp; + struct efa_mr *efa_mr; + struct efa_conn *conn; +#ifndef _WIN32 + struct ibv_sge sge_list[msg->iov_count]; +#else + /* MSVC compiler does not support array declarations with runtime size, so hardcode + * the expected iov_limit/max_sq_sge from the lower-level efa provider.
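+ * The asserts just below keep iov_count within tx_attr->iov_limit, so this + * fixed-size array is assumed to be large enough, i.e. EFA_DEV_ATTR_MAX_WR_SGE + * is taken to be at least that limit.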
+ */ + struct ibv_sge sge_list[EFA_DEV_ATTR_MAX_WR_SGE]; +#endif + int i, err = 0; + + efa_tracepoint(read_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + + assert(msg->iov_count > 0 && + msg->iov_count <= base_ep->domain->info->tx_attr->iov_limit); + assert(msg->rma_iov_count > 0 && + msg->rma_iov_count <= base_ep->domain->info->tx_attr->rma_iov_limit); + assert(ofi_total_iov_len(msg->msg_iov, msg->iov_count) <= + base_ep->domain->device->max_rdma_size); + + qp = base_ep->qp; + + ofi_genlock_lock(&base_ep->util_ep.lock); + + if (!base_ep->is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + base_ep->is_wr_started = true; + } + + qp->ibv_qp_ex->wr_id = (uintptr_t) efa_fill_context( + msg->context, msg->addr, flags, FI_RMA | FI_READ); + + /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ + ibv_wr_rdma_read(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); + + for (i = 0; i < msg->iov_count; ++i) { + sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; + sge_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc && msg->desc[i]); + efa_mr = (struct efa_mr *)msg->desc[i]; + sge_list[i].lkey = efa_mr->ibv_mr->lkey; + } + + ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); + + conn = efa_av_addr_to_conn(base_ep->av, msg->addr); + assert(conn && conn->ep_addr); + ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, + conn->ep_addr->qkey); + + efa_tracepoint(post_read, qp->ibv_qp_ex->wr_id, (uintptr_t)msg->context); + + if (!(flags & FI_MORE)) { + err = ibv_wr_complete(qp->ibv_qp_ex); + base_ep->is_wr_started = false; + } + + ofi_genlock_unlock(&base_ep->util_ep.lock); + return err; +} + +static +ssize_t efa_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) +{ + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + return efa_rma_post_read(base_ep, msg, flags | base_ep->util_ep.tx_msg_flags); +} + +static +ssize_t efa_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, + size_t iov_count, fi_addr_t src_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + size_t len; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + len = ofi_total_iov_len(iov, iov_count); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, iov, desc, iov_count, src_addr, &rma_iov, 1, + context, 0); + + return efa_rma_post_read(base_ep, &msg, efa_tx_flags(base_ep)); +} + +static +ssize_t efa_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t addr, uint64_t key, + void *context) +{ + struct iovec iov; + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->max_rma_size); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, &desc, 1, src_addr, &rma_iov, 1, context, 0); + + return efa_rma_post_read(base_ep, &msg, efa_tx_flags(base_ep)); +} + +/** + * @brief Post a WRITE request + * + * Input: + * base_ep: endpoint + * msg: write operation information + * flags: flags passed + * @return On success
return 0, otherwise return a negative libfabric error code. + */ +static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + struct efa_qp *qp; + struct efa_conn *conn; +#ifndef _WIN32 + struct ibv_sge sge_list[msg->iov_count]; +#else + /* MSVC compiler does not support array declarations with runtime size, so hardcode + * the expected iov_limit/max_sq_sge from the lower-level efa provider. + */ + struct ibv_sge sge_list[EFA_DEV_ATTR_MAX_WR_SGE]; + struct ibv_data_buf inline_data_list[EFA_DEV_ATTR_MAX_WR_SGE]; +#endif + int i, err = 0; + + if (flags & FI_INJECT) { + EFA_WARN(FI_LOG_EP_DATA, + "FI_INJECT is not supported by efa rma yet.\n"); + return -FI_ENOSYS; + } + + efa_tracepoint(write_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + + qp = base_ep->qp; + + ofi_genlock_lock(&base_ep->util_ep.lock); + + if (!base_ep->is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + base_ep->is_wr_started = true; + } + + qp->ibv_qp_ex->wr_id = (uintptr_t) efa_fill_context( + msg->context, msg->addr, flags, FI_RMA | FI_WRITE); + + if (flags & FI_REMOTE_CQ_DATA) { + ibv_wr_rdma_write_imm(qp->ibv_qp_ex, msg->rma_iov[0].key, + msg->rma_iov[0].addr, msg->data); + } else { + ibv_wr_rdma_write(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); + } + + for (i = 0; i < msg->iov_count; ++i) { + sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; + sge_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc && msg->desc[i]); + sge_list[i].lkey = ((struct efa_mr *)msg->desc[i])->ibv_mr->lkey; + } + ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); + + conn = efa_av_addr_to_conn(base_ep->av, msg->addr); + assert(conn && conn->ep_addr); + ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, + conn->ep_addr->qkey); + + efa_tracepoint(post_write, qp->ibv_qp_ex->wr_id, (uintptr_t)msg->context); + + if (!(flags & FI_MORE)) { + err = ibv_wr_complete(qp->ibv_qp_ex); + base_ep->is_wr_started = false; + } + + ofi_genlock_unlock(&base_ep->util_ep.lock); + return err; +} + +ssize_t efa_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, + uint64_t flags) +{ + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + return efa_rma_post_write(base_ep, msg, flags | base_ep->util_ep.tx_msg_flags); +} + +ssize_t efa_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, + void **desc, size_t iov_count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + size_t len; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + len = ofi_total_iov_len(iov, iov_count); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, iov, desc, iov_count, dest_addr, &rma_iov, 1, + context, 0); + + return efa_rma_post_write(base_ep, &msg, efa_tx_flags(base_ep)); +} + +ssize_t efa_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct iovec iov; + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->max_rma_size); + err = efa_rma_check_cap(base_ep); 
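+ /* efa_rma_check_cap() warns once and returns -FI_EOPNOTSUPP when the endpoint was opened without FI_RMA */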
+ if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, &desc, 1, dest_addr, &rma_iov, 1, context, 0); + + return efa_rma_post_write(base_ep, &msg, efa_tx_flags(base_ep)); +} + +ssize_t efa_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct iovec iov; + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->max_rma_size); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, &desc, 1, dest_addr, &rma_iov, 1, context, data); + + return efa_rma_post_write(base_ep, &msg, FI_REMOTE_CQ_DATA | efa_tx_flags(base_ep)); +} + +struct fi_ops_rma efa_dgram_ep_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = fi_no_rma_read, + .readv = fi_no_rma_readv, + .readmsg = fi_no_rma_readmsg, + .write = fi_no_rma_write, + .writev = fi_no_rma_writev, + .writemsg = fi_no_rma_writemsg, + .inject = fi_no_rma_inject, + .writedata = fi_no_rma_writedata, + .injectdata = fi_no_rma_injectdata, +}; + +struct fi_ops_rma efa_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = efa_rma_read, + .readv = efa_rma_readv, + .readmsg = efa_rma_readmsg, + .write = efa_rma_write, + .writev = efa_rma_writev, + .writemsg = efa_rma_writemsg, + .inject = fi_no_rma_inject, + .writedata = efa_rma_writedata, + .injectdata = fi_no_rma_injectdata, +}; diff --git a/prov/efa/src/efa_strerror.c b/prov/efa/src/efa_strerror.c index 11197816efd..895ebfd83e7 100644 --- a/prov/efa/src/efa_strerror.c +++ b/prov/efa/src/efa_strerror.c @@ -67,10 +67,10 @@ void efa_show_help(enum efa_errno err) { help = "This error is detected remotely; " "typically encountered when the peer process is no longer present"; break; - case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE: help = "This error is detected locally. " - "The connection status is unknown or was never established via " - "handshake. This typically indicates one or more misconfigured " + "The peer is not reachable by the EFA device. " + "This typically indicates one or more misconfigured " "EC2 instances; most often due to incorrect inbound/outbound " "security group rules and/or instances placed in different " "subnets. Refer to the public AWS documentation for EFA for " @@ -80,8 +80,19 @@ void efa_show_help(enum efa_errno err) { case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: help = "This error is detected locally. " "The connection was previously established via handshake, " - "which indicates the error is likely due to the peer process no " - "longer being present."; + "which indicates the error is likely due to a hardware failure " + "on the remote peer, or the peer process no longer being present."; + break; + case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP: + help = "This error is detected locally. " + "The peer is reachable by the EFA device but libfabric failed " + "to complete a handshake, which indicates the error is likely " + "due to the peer process no longer being present."; + break; + case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX: + help = "This error is detected locally. 
" + "Please consider matching the local and remote libfabric versions, or turning off " + "the zero-copy recv feature by setting FI_EFA_USE_ZCPY_RX=0 in the environment"; break; default: return; diff --git a/prov/efa/src/efa_tp.h b/prov/efa/src/efa_tp.h index ec3ce8ebc47..ce9151a8619 100644 --- a/prov/efa/src/efa_tp.h +++ b/prov/efa/src/efa_tp.h @@ -25,22 +25,40 @@ /* tracelog() is similar to tracef(), but with a log level param */ #define efa_tracelog lttng_ust_tracelog -static inline void efa_tracepoint_wr_id_post_send(const void *wr_id) +static inline void efa_rdm_tracepoint_wr_id_post_send(const void *wr_id) { struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; struct efa_rdm_ope *ope = pkt_entry->ope; if (!ope) return; - efa_tracepoint(post_send, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); + efa_tracepoint(post_send, (size_t) wr_id, (size_t) ope->cq_entry.op_context); } -static inline void efa_tracepoint_wr_id_post_recv(const void *wr_id) +static inline void efa_rdm_tracepoint_wr_id_post_recv(const void *wr_id) { struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; struct efa_rdm_ope *ope = pkt_entry->ope; if (!ope) return; - efa_tracepoint(post_recv, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); + efa_tracepoint(post_recv, (size_t) wr_id, (size_t) ope->cq_entry.op_context); +} + +static inline void efa_rdm_tracepoint_wr_id_post_read(const void *wr_id) +{ + struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; + struct efa_rdm_ope *ope = pkt_entry->ope; + if (!ope) + return; + efa_tracepoint(post_read, (size_t) wr_id, (size_t) ope->cq_entry.op_context); +} + +static inline void efa_rdm_tracepoint_wr_id_post_write(const void *wr_id) +{ + struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; + struct efa_rdm_ope *ope = pkt_entry->ope; + if (!ope) + return; + efa_tracepoint(post_write, (size_t) wr_id, (size_t) ope->cq_entry.op_context); } #else diff --git a/prov/efa/src/efa_tp_def.h b/prov/efa/src/efa_tp_def.h index 72e03988a56..d05dec67f27 100644 --- a/prov/efa/src/efa_tp_def.h +++ b/prov/efa/src/efa_tp_def.h @@ -18,14 +18,45 @@ #define X_PKT_ARGS \ size_t, wr_id, \ - size_t, efa_rdm_ope, \ size_t, context #define X_PKT_FIELDS \ lttng_ust_field_integer_hex(size_t, wr_id, wr_id) \ - lttng_ust_field_integer_hex(size_t, efa_rdm_ope, efa_rdm_ope) \ lttng_ust_field_integer_hex(size_t, context, context) +#define MSG_ARGS \ + size_t, msg_ctx, \ + size_t, addr + +#define MSG_FIELDS \ + lttng_ust_field_integer_hex(size_t, msg_ctx, msg_ctx) \ + lttng_ust_field_integer_hex(size_t, addr, addr) + +LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_TP_PROV, msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS), + LTTNG_UST_TP_FIELDS(MSG_FIELDS)) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + send_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, send_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + recv_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, recv_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + read_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, read_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + 
write_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, write_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + + LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_TP_PROV, post_wr_id, LTTNG_UST_TP_ARGS(X_PKT_ARGS), LTTNG_UST_TP_FIELDS(X_PKT_FIELDS)) @@ -40,6 +71,16 @@ LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, post_wr_id, EFA_TP_PROV, LTTNG_UST_TP_ARGS(X_PKT_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, post_recv, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, post_wr_id, EFA_TP_PROV, + post_read, + LTTNG_UST_TP_ARGS(X_PKT_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, post_read, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, post_wr_id, EFA_TP_PROV, + post_write, + LTTNG_UST_TP_ARGS(X_PKT_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, post_write, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + #endif /* _EFA_TP_DEF_H */ #include diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index 919a1cacb97..e152f2adc23 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -361,8 +361,6 @@ bool efa_user_info_should_support_hmem(int version) static int efa_user_info_alter_rdm(int version, struct fi_info *info, const struct fi_info *hints) { - uint64_t atomic_ordering; - if (hints && (hints->caps & FI_HMEM)) { /* * FI_HMEM is a primary capability, therefore only check @@ -418,11 +416,14 @@ int efa_user_info_alter_rdm(int version, struct fi_info *info, const struct fi_i * the default message order supported by the provider is returned. */ info->tx_attr->msg_order &= hints->tx_attr->msg_order; - atomic_ordering = FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | - FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW; - if (!(hints->tx_attr->msg_order & atomic_ordering)) { + + /* If no atomic ordering is requested, set the max_order_*_size as 0 */ + if (!(hints->tx_attr->msg_order & FI_ORDER_ATOMIC_RAW)) info->ep_attr->max_order_raw_size = 0; - } + if (!(hints->tx_attr->msg_order & FI_ORDER_ATOMIC_WAR)) + info->ep_attr->max_order_war_size = 0; + if (!(hints->tx_attr->msg_order & FI_ORDER_ATOMIC_WAW)) + info->ep_attr->max_order_waw_size = 0; } if (hints->rx_attr) { diff --git a/prov/efa/src/fi_ext_efa.h b/prov/efa/src/fi_ext_efa.h index 9d3c41575a4..a4d3465e455 100644 --- a/prov/efa/src/fi_ext_efa.h +++ b/prov/efa/src/fi_ext_efa.h @@ -4,6 +4,8 @@ #ifndef _FI_EXT_EFA_H_ #define _FI_EXT_EFA_H_ +#include + #define FI_EFA_DOMAIN_OPS "efa domain ops" struct fi_efa_mr_attr { diff --git a/prov/efa/src/rdm/efa_rdm_atomic.c b/prov/efa/src/rdm/efa_rdm_atomic.c index eb997a77906..961e9f0695e 100644 --- a/prov/efa/src/rdm/efa_rdm_atomic.c +++ b/prov/efa/src/rdm/efa_rdm_atomic.c @@ -148,7 +148,7 @@ ssize_t efa_rdm_atomic_generic_efa(struct efa_rdm_ep *efa_rdm_ep, ssize_t err; struct util_srx_ctx *srx_ctx; - assert(msg->iov_count <= efa_rdm_ep->tx_iov_limit); + assert(msg->iov_count <= efa_rdm_ep->base_ep.info->tx_attr->iov_limit); efa_perfset_start(efa_rdm_ep, perf_efa_tx); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 3d34293e7e7..24051cc2e8a 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -36,16 +36,16 @@ int efa_rdm_cq_close(struct fid *fid) retv = 0; - cq = container_of(fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(fid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); - if (cq->ibv_cq.ibv_cq_ex) { - ret = 
-ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + if (cq->efa_cq.ibv_cq.ibv_cq_ex) { + ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->efa_cq.ibv_cq.ibv_cq_ex)); if (ret) { EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", fi_strerror(-ret)); return ret; } - cq->ibv_cq.ibv_cq_ex = NULL; + cq->efa_cq.ibv_cq.ibv_cq_ex = NULL; } if (cq->shm_cq) { @@ -56,7 +56,7 @@ int efa_rdm_cq_close(struct fid *fid) } } - ret = ofi_cq_cleanup(&cq->util_cq); + ret = ofi_cq_cleanup(&cq->efa_cq.util_cq); if (ret) return ret; free(cq); @@ -72,29 +72,6 @@ static struct fi_ops efa_rdm_cq_fi_ops = { }; -#if HAVE_CAPS_UNSOLICITED_WRITE_RECV -/** - * @brief Check whether a completion consumes recv buffer - * - * @param ibv_cq_ex extended ibv cq - * @return true the wc consumes a recv buffer - * @return false the wc doesn't consume a recv buffer - */ -static inline -bool efa_rdm_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) -{ - return efa_rdm_use_unsolicited_write_recv() && efadv_wc_is_unsolicited(efadv_cq_from_ibv_cq_ex(ibv_cq_ex)); -} - -#else - -static inline -bool efa_rdm_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) -{ - return false; -} - -#endif /** * @brief handle rdma-core CQ completion resulted from IBV_WRITE_WITH_IMM * @@ -139,7 +116,7 @@ void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( EFA_WARN(FI_LOG_CQ, "Unable to write a cq entry for remote for RECV_RDMA operation: %s\n", fi_strerror(-ret)); - efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_WRITE_SHM_CQ_ENTRY); + efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_WRITE_RECV_COMP); } efa_cntr_report_rx_completion(&ep->base_ep.util_ep, flags); @@ -148,7 +125,7 @@ void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( * For unsolicited wc, pkt_entry can be NULL, so we can only * access it for solicited wc. */ - if (!efa_rdm_cq_wc_is_unsolicited(ibv_cq_ex)) { + if (!efa_cq_wc_is_unsolicited(ibv_cq_ex)) { /** * Recv with immediate will consume a pkt_entry, but the pkt is not * filled, so free the pkt_entry and record we have one less posted @@ -371,12 +348,13 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct * QP and we cannot cancel that. */ if (OFI_UNLIKELY(ep->use_zcpy_rx && efa_rdm_pkt_type_is_rtm(pkt_type))) { - EFA_WARN(FI_LOG_CQ, - "Invalid pkt type %d! Peer %d doesn't respect the request from this EP that" - " RTM packets must be sent to the user recv QP.\n", - base_hdr->type, (int)pkt_entry->addr); + char errbuf[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; + size_t errbuf_len; - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE); + /* local & peer host-id & ep address will be logged by efa_rdm_write_error_msg */ + if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, errbuf, &errbuf_len)) + EFA_WARN(FI_LOG_CQ, "Error: %s\n", (const char *) errbuf); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -398,7 +376,9 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct * * @todo Currently, this only checks for unresponsive receiver * (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to - * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other + * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP if a handshake was made, or + * #FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP if the handshake failed. 
+ * This should be expanded to handle other * RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate * error reporting */ @@ -417,8 +397,9 @@ static int efa_rdm_cq_get_prov_errno(struct ibv_cq_ex *ibv_cq_ex) { switch (vendor_err) { case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: { - if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) - vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP; + vendor_err = (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) ? + FI_EFA_ERR_ESTABLISHED_RECV_UNRESP : + FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP; break; } default: @@ -454,13 +435,13 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) int prov_errno; struct efa_rdm_ep *ep = NULL; struct fi_cq_err_entry err_entry; - struct efa_rdm_cq *efa_rdm_cq; + struct efa_cq *efa_cq; struct efa_domain *efa_domain; struct efa_qp *qp; struct dlist_entry rx_progressed_ep_list, *tmp; - efa_rdm_cq = container_of(ibv_cq, struct efa_rdm_cq, ibv_cq); - efa_domain = container_of(efa_rdm_cq->util_cq.domain, struct efa_domain, util_domain); + efa_cq = container_of(ibv_cq, struct efa_cq, ibv_cq); + efa_domain = container_of(efa_cq->util_cq.domain, struct efa_domain, util_domain); dlist_init(&rx_progressed_ep_list); /* Call ibv_start_poll only once */ @@ -471,7 +452,14 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) pkt_entry = (void *)(uintptr_t)ibv_cq->ibv_cq_ex->wr_id; qp = efa_domain->qp_table[ibv_wc_read_qp_num(ibv_cq->ibv_cq_ex) & efa_domain->qp_table_sz_m1]; ep = container_of(qp->base_ep, struct efa_rdm_ep, base_ep); +#if HAVE_LTTNG efa_rdm_tracepoint(poll_cq, (size_t) ibv_cq->ibv_cq_ex->wr_id); + if (pkt_entry && pkt_entry->ope) + efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id, + (size_t) pkt_entry->ope->cq_entry.op_context, + pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag, + pkt_entry->ope->addr); +#endif opcode = ibv_wc_read_opcode(ibv_cq->ibv_cq_ex); if (ibv_cq->ibv_cq_ex->status) { prov_errno = efa_rdm_cq_get_prov_errno(ibv_cq->ibv_cq_ex); @@ -483,6 +471,12 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) break; case IBV_WC_RECV: /* fall through */ case IBV_WC_RECV_RDMA_WITH_IMM: + if (efa_cq_wc_is_unsolicited(ibv_cq->ibv_cq_ex)) { + EFA_WARN(FI_LOG_CQ, "Receive error %s (%d) for unsolicited write recv", + efa_strerror(prov_errno), prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, to_fi_errno(prov_errno), prov_errno); + break; + } efa_rdm_pke_handle_rx_error(pkt_entry, prov_errno); break; default: @@ -544,7 +538,7 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) .prov_errno = prov_errno, .op_context = NULL }; - ofi_cq_write_error(&efa_rdm_cq->util_cq, &err_entry); + ofi_cq_write_error(&efa_cq->util_cq, &err_entry); } if (should_end_poll) @@ -565,9 +559,9 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun ssize_t ret; struct efa_domain *domain; - cq = container_of(cq_fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(cq_fid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); - domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); + domain = container_of(cq->efa_cq.util_cq.domain, struct efa_domain, util_domain); ofi_genlock_lock(&domain->srx_lock); @@ -579,13 +573,13 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun * completion to efa. Use ofi_cq_read_entries to get the number of * shm completions without progressing efa ep again. 
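* (shm deposits its completions directly into this util_cq through the peer CQ wired up in efa_rdm_cq_open, so no additional efa progress is needed.)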
*/ - ret = ofi_cq_read_entries(&cq->util_cq, buf, count, src_addr); + ret = ofi_cq_read_entries(&cq->efa_cq.util_cq, buf, count, src_addr); if (ret > 0) goto out; } - ret = ofi_cq_readfrom(&cq->util_cq.cq_fid, buf, count, src_addr); + ret = ofi_cq_readfrom(&cq->efa_cq.util_cq.cq_fid, buf, count, src_addr); out: ofi_genlock_unlock(&domain->srx_lock); @@ -614,8 +608,8 @@ static void efa_rdm_cq_progress(struct util_cq *cq) struct fid_list_entry *fid_entry; ofi_genlock_lock(&cq->ep_list_lock); - efa_rdm_cq = container_of(cq, struct efa_rdm_cq, util_cq); - efa_domain = container_of(efa_rdm_cq->util_cq.domain, struct efa_domain, util_domain); + efa_rdm_cq = container_of(cq, struct efa_rdm_cq, efa_cq.util_cq); + efa_domain = container_of(efa_rdm_cq->efa_cq.util_cq.domain, struct efa_domain, util_domain); /** * TODO: It's better to just post the initial batch of internal rx pkts during ep enable @@ -624,13 +618,13 @@ static void efa_rdm_cq_progress(struct util_cq *cq) * some idle endpoints and never poll completions for them. Move these initial posts to * the first cq read call before having a long term fix. */ - if (!efa_rdm_cq->initial_rx_to_all_eps_posted) { + if (efa_rdm_cq->need_to_scan_ep_list) { dlist_foreach(&cq->ep_list, item) { fid_entry = container_of(item, struct fid_list_entry, entry); efa_rdm_ep = container_of(fid_entry->fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep); } - efa_rdm_cq->initial_rx_to_all_eps_posted = true; + efa_rdm_cq->need_to_scan_ep_list = false; } dlist_foreach(&efa_rdm_cq->ibv_cq_poll_list, item) { @@ -676,20 +670,20 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, attr->size = MAX(efa_domain->rdm_cq_size, attr->size); dlist_init(&cq->ibv_cq_poll_list); - cq->initial_rx_to_all_eps_posted = false; - ret = ofi_cq_init(&efa_prov, domain, attr, &cq->util_cq, + cq->need_to_scan_ep_list = false; + ret = ofi_cq_init(&efa_prov, domain, attr, &cq->efa_cq.util_cq, &efa_rdm_cq_progress, context); if (ret) goto free; - ret = efa_cq_ibv_cq_ex_open(attr, efa_domain->device->ibv_ctx, &cq->ibv_cq.ibv_cq_ex, &cq->ibv_cq.ibv_cq_ex_type); + ret = efa_cq_ibv_cq_ex_open(attr, efa_domain->device->ibv_ctx, &cq->efa_cq.ibv_cq.ibv_cq_ex, &cq->efa_cq.ibv_cq.ibv_cq_ex_type); if (ret) { EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %s\n", fi_strerror(ret)); goto close_util_cq; } - *cq_fid = &cq->util_cq.cq_fid; + *cq_fid = &cq->efa_cq.util_cq.cq_fid; (*cq_fid)->fid.ops = &efa_rdm_cq_fi_ops; (*cq_fid)->ops = &efa_rdm_cq_ops; @@ -699,7 +693,7 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, /* Bind ep with shm provider's cq */ shm_cq_attr.flags |= FI_PEER; peer_cq_context.size = sizeof(peer_cq_context); - peer_cq_context.cq = cq->util_cq.peer_cq; + peer_cq_context.cq = cq->efa_cq.util_cq.peer_cq; ret = fi_cq_open(efa_domain->shm_domain, &shm_cq_attr, &cq->shm_cq, &peer_cq_context); if (ret) { @@ -710,12 +704,12 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, return 0; destroy_ibv_cq: - retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->efa_cq.ibv_cq.ibv_cq_ex)); if (retv) EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", fi_strerror(-retv)); close_util_cq: - retv = ofi_cq_cleanup(&cq->util_cq); + retv = ofi_cq_cleanup(&cq->efa_cq.util_cq); if (retv) EFA_WARN(FI_LOG_CQ, "Unable to close util cq: %s\n", fi_strerror(-retv)); diff --git a/prov/efa/src/rdm/efa_rdm_cq.h b/prov/efa/src/rdm/efa_rdm_cq.h index 
5bb7b2b80c0..e1a865ee127 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.h +++ b/prov/efa/src/rdm/efa_rdm_cq.h @@ -8,18 +8,12 @@ #include struct efa_rdm_cq { - struct util_cq util_cq; + struct efa_cq efa_cq; struct fid_cq *shm_cq; - struct efa_ibv_cq ibv_cq; struct dlist_entry ibv_cq_poll_list; - bool initial_rx_to_all_eps_posted; + bool need_to_scan_ep_list; }; -/* - * Control header with completion data. CQ data length is static. - */ -#define EFA_RDM_CQ_DATA_SIZE (4) - int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 21b8f271647..d5f2e76d8ce 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -10,7 +10,6 @@ #include "efa_base_ep.h" #include "efa_rdm_rxe_map.h" -#define EFA_RDM_ERROR_MSG_BUFFER_LENGTH 1024 /** @brief Information of a queued copy. * @@ -54,18 +53,16 @@ struct efa_rdm_ep { /* shm provider fid */ struct fid_ep *shm_ep; + /* shm srx fid (shm-owned) */ + struct fid_ep *shm_srx; + /* shm peer_srx (efa-owned) */ + struct fid_peer_srx *shm_peer_srx; - /* - * EFA RDM endpoint rx/tx queue sizes. These may be different from the core - * provider's rx/tx size and will either limit the number of possible - * receives/sends or allow queueing. - */ - size_t rx_size; - size_t tx_size; size_t mtu_size; - size_t rx_iov_limit; - size_t tx_iov_limit; - size_t inject_size; + size_t max_tagged_size; /**< #FI_OPT_MAX_TAGGED_SIZE */ + size_t max_atomic_size; /**< #FI_OPT_MAX_ATOMIC_SIZE */ + size_t inject_tagged_size; /**< #FI_OPT_INJECT_TAGGED_SIZE */ + size_t inject_atomic_size; /**< #FI_OPT_INJECT_ATOMIC_SIZE */ /* Endpoint's capability to support zero-copy rx */ bool use_zcpy_rx; @@ -82,21 +79,10 @@ struct efa_rdm_ep { /* Resource management flag */ uint64_t rm_full; - /* Application's maximum msg size hint */ - size_t max_msg_size; - - /** Application's maximum RMA size */ - size_t max_rma_size; /* Applicaiton's message prefix size. */ size_t msg_prefix_size; - /* EFA RDM protocol's max header size */ - size_t max_proto_hdr_size; - - /* tx iov limit of EFA device */ - size_t efa_device_iov_limit; - /* threshold to release multi_recv buffer */ size_t min_multi_recv_size; @@ -193,10 +179,8 @@ struct efa_rdm_ep { */ bool use_device_rdma; - struct fi_info *user_info; /**< fi_info passed by user when calling fi_endpoint */ bool sendrecv_in_order_aligned_128_bytes; /**< whether to support in order send/recv of each aligned 128 bytes memory region */ bool write_in_order_aligned_128_bytes; /**< whether to support in order write of each aligned 128 bytes memory region */ - char err_msg[EFA_RDM_ERROR_MSG_BUFFER_LENGTH]; /* A large enough buffer to store CQ/EQ error data used by e.g. 
fi_cq_readerr */ struct efa_rdm_pke **pke_vec; struct dlist_entry entry; /* the count of opes queued before handshake is made with their peers */ @@ -210,12 +194,6 @@ int efa_rdm_ep_flush_queued_blocking_copy_to_hmem(struct efa_rdm_ep *ep); struct efa_ep_addr *efa_rdm_ep_raw_addr(struct efa_rdm_ep *ep); -const char *efa_rdm_ep_raw_addr_str(struct efa_rdm_ep *ep, char *buf, size_t *buflen); - -struct efa_ep_addr *efa_rdm_ep_get_peer_raw_addr(struct efa_rdm_ep *ep, fi_addr_t addr); - -const char *efa_rdm_ep_get_peer_raw_addr_str(struct efa_rdm_ep *ep, fi_addr_t addr, char *buf, size_t *buflen); - struct efa_rdm_peer *efa_rdm_ep_get_peer(struct efa_rdm_ep *ep, fi_addr_t addr); int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr); @@ -236,17 +214,17 @@ void efa_rdm_ep_record_tx_op_completed(struct efa_rdm_ep *ep, struct efa_rdm_pke static inline size_t efa_rdm_ep_get_rx_pool_size(struct efa_rdm_ep *ep) { - return MIN(ep->efa_max_outstanding_rx_ops, ep->rx_size); + return MIN(ep->efa_max_outstanding_rx_ops, ep->base_ep.info->rx_attr->size); } static inline size_t efa_rdm_ep_get_tx_pool_size(struct efa_rdm_ep *ep) { - return MIN(ep->efa_max_outstanding_tx_ops, ep->tx_size); + return MIN(ep->efa_max_outstanding_tx_ops, ep->base_ep.info->tx_attr->size); } static inline int efa_rdm_ep_need_sas(struct efa_rdm_ep *ep) { - return ((ep->user_info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->user_info->rx_attr->msg_order & FI_ORDER_SAS)); + return ((ep->base_ep.info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->base_ep.info->rx_attr->msg_order & FI_ORDER_SAS)); } @@ -277,6 +255,8 @@ struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep) void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep); +int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep); + /** * @brief return whether this endpoint should write error cq entry for RNR. * @@ -294,7 +274,7 @@ void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep); static inline bool efa_rdm_ep_should_write_rnr_completion(struct efa_rdm_ep *ep) { - return (efa_env.rnr_retry < EFA_RNR_INFINITE_RETRY) && + return (ep->base_ep.rnr_retry < EFA_RNR_INFINITE_RETRY) && (ep->handle_resource_management == FI_RM_DISABLED); } @@ -310,16 +290,14 @@ bool efa_rdm_ep_should_write_rnr_completion(struct efa_rdm_ep *ep) static inline int efa_rdm_ep_use_p2p(struct efa_rdm_ep *efa_rdm_ep, struct efa_mr *efa_mr) { - if (!efa_mr) - return 0; - /* - * always send from host buffers if we have a descriptor + * P2P is always available for host memory (Unregistered buffer will be + * regarded as host memory as EFA provider requires FI_MR_HMEM) */ - if (efa_mr->peer.iface == FI_HMEM_SYSTEM) + if (!efa_mr || efa_mr->peer.iface == FI_HMEM_SYSTEM) return 1; - if (efa_rdm_ep_domain(efa_rdm_ep)->hmem_info[efa_mr->peer.iface].p2p_supported_by_device) + if (g_efa_hmem_info[efa_mr->peer.iface].p2p_supported_by_device) return (efa_rdm_ep->hmem_p2p_opt != FI_HMEM_P2P_DISABLED); if (efa_rdm_ep->hmem_p2p_opt == FI_HMEM_P2P_REQUIRED) { @@ -371,7 +349,7 @@ bool efa_rdm_ep_support_rdma_write(struct efa_rdm_ep *ep) * @return -FI_EOPNOTSUPP if FI_RMA wasn't requested, 0 if it was. 
*/ static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) { - if ((ep->user_info->caps & FI_RMA) == FI_RMA) + if ((ep->base_ep.info->caps & FI_RMA) == FI_RMA) return 0; EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_RMA capability, which was not requested.\n"); return -FI_EOPNOTSUPP; @@ -382,7 +360,7 @@ static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) { * @return -FI_EOPNOTSUPP if FI_ATOMIC wasn't requested, 0 if it was. */ static inline int efa_rdm_ep_cap_check_atomic(struct efa_rdm_ep *ep) { - if ((ep->user_info->caps & FI_ATOMIC) == FI_ATOMIC) + if ((ep->base_ep.info->caps & FI_ATOMIC) == FI_ATOMIC) return 0; EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_ATOMIC capability, which was not requested.\n"); return -FI_EOPNOTSUPP; @@ -462,4 +440,10 @@ static inline int efa_rdm_attempt_to_sync_memops_ioc(struct efa_rdm_ep *ep, stru return err; } +static inline +bool efa_rdm_ep_support_unsolicited_write_recv(struct efa_rdm_ep *ep) +{ + return ep->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; +} + #endif diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 43241795ad2..ba0c6940f3d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -14,106 +14,17 @@ #include "efa_rdm_pke_req.h" #include "efa_cntr.h" -static -void efa_rdm_ep_construct_ibv_qp_init_attr_ex(struct efa_rdm_ep *ep, - struct ibv_qp_init_attr_ex *attr_ex, - struct ibv_cq_ex *tx_cq, - struct ibv_cq_ex *rx_cq) -{ - attr_ex->cap.max_send_wr = ep->base_ep.domain->device->rdm_info->tx_attr->size; - attr_ex->cap.max_send_sge = ep->base_ep.domain->device->rdm_info->tx_attr->iov_limit; - attr_ex->cap.max_recv_wr = ep->base_ep.domain->device->rdm_info->rx_attr->size; - attr_ex->cap.max_recv_sge = ep->base_ep.domain->device->rdm_info->rx_attr->iov_limit; - attr_ex->cap.max_inline_data = ep->base_ep.domain->device->efa_attr.inline_buf_size; - attr_ex->qp_type = IBV_QPT_DRIVER; - attr_ex->comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; - attr_ex->send_ops_flags = IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM; - if (efa_device_support_rdma_read()) - attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; - if (efa_device_support_rdma_write()) { - attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE; - attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; - } - attr_ex->pd = efa_rdm_ep_domain(ep)->ibv_pd; - attr_ex->qp_context = ep; - attr_ex->sq_sig_all = 1; - - attr_ex->send_cq = ibv_cq_ex_to_cq(tx_cq); - attr_ex->recv_cq = ibv_cq_ex_to_cq(rx_cq); -} static inline struct efa_rdm_cq *efa_rdm_ep_get_tx_rdm_cq(struct efa_rdm_ep *ep) { - return ep->base_ep.util_ep.tx_cq ? container_of(ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, util_cq) : NULL; + return ep->base_ep.util_ep.tx_cq ? container_of(ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, efa_cq.util_cq) : NULL; } static inline struct efa_rdm_cq *efa_rdm_ep_get_rx_rdm_cq(struct efa_rdm_ep *ep) { - return ep->base_ep.util_ep.rx_cq ? 
container_of(ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq) : NULL; -} - -/** - * @brief set the "efa_qp" field in the efa_rdm_ep->efa_base_ep - * called by efa_rdm_ep_open() - * - * @param[in,out] ep The EFA RDM endpoint to set the qp in - * @return int 0 on success, negative libfabric error code otherwise - * @todo merge this function with #efa_base_ep_construct - */ -static -int efa_rdm_ep_create_base_ep_ibv_qp(struct efa_rdm_ep *ep) -{ - struct ibv_qp_init_attr_ex attr_ex = { 0 }; - struct efa_rdm_cq *tx_rdm_cq, *rx_rdm_cq; - struct ibv_cq_ex *tx_ibv_cq, *rx_ibv_cq; - int ret; - - tx_rdm_cq = efa_rdm_ep_get_tx_rdm_cq(ep); - rx_rdm_cq = efa_rdm_ep_get_rx_rdm_cq(ep); - - if (!tx_rdm_cq && !rx_rdm_cq) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send or receive completion queue\n"); - return -FI_ENOCQ; - } - - if (!tx_rdm_cq && ofi_needs_tx(ep->base_ep.info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); - return -FI_ENOCQ; - } - - if (!rx_rdm_cq && ofi_needs_rx(ep->base_ep.info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled (FI_RECV).\n"); - return -FI_ENOCQ; - } - - tx_ibv_cq = tx_rdm_cq ? tx_rdm_cq->ibv_cq.ibv_cq_ex : rx_rdm_cq->ibv_cq.ibv_cq_ex; - rx_ibv_cq = rx_rdm_cq ? rx_rdm_cq->ibv_cq.ibv_cq_ex : tx_rdm_cq->ibv_cq.ibv_cq_ex; - - efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, tx_ibv_cq, rx_ibv_cq); - - ret = efa_base_ep_create_qp(&ep->base_ep, &attr_ex); - if (ret) - return ret; - - /** - * Create separate user_recv_qp to receive pkts that carries user data - * without any headers. - */ - if (ep->use_zcpy_rx) { - ret = efa_qp_create(&ep->base_ep.user_recv_qp, &attr_ex); - if (ret) { - efa_base_ep_destruct_qp(&ep->base_ep); - return ret; - } - ep->base_ep.user_recv_qp->base_ep = &ep->base_ep; - } - - return FI_SUCCESS; + return ep->base_ep.util_ep.rx_cq ? 
container_of(ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, efa_cq.util_cq) : NULL; } static @@ -236,7 +147,9 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) ret = ofi_bufpool_create(&ep->user_rx_pkt_pool, sizeof(struct efa_rdm_pke), EFA_RDM_BUFPOOL_ALIGNMENT, - 0,ep->rx_size,0); + ep->base_ep.info->rx_attr->size, + ep->base_ep.info->rx_attr->size, /* max count==chunk_cnt means pool is not allowed to grow */ + 0); if (ret) goto err_free; @@ -285,7 +198,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) sizeof(struct efa_rdm_rxe_map_entry), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit for max_cnt */ - ep->rx_size, 0); + ep->base_ep.info->rx_attr->size, 0); if (ret) goto err_free; @@ -301,7 +214,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) sizeof(struct efa_rdm_ope), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit for max_cnt */ - ep->tx_size + ep->rx_size, 0); + ep->base_ep.info->tx_attr->size + ep->base_ep.info->rx_attr->size, 0); if (ret) goto err_free; @@ -309,7 +222,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) sizeof(struct efa_rdm_peer_overflow_pke_list_entry), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit for max_cnt */ - ep->rx_size, 0); + ep->base_ep.info->rx_attr->size, 0); if (ret) goto err_free; @@ -434,7 +347,6 @@ static inline void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) { enum fi_hmem_iface iface; - struct efa_hmem_info *hmem_info; uint64_t unsupported_caps = FI_DIRECTED_RECV | FI_TAGGED | FI_ATOMIC; ep->use_zcpy_rx = true; @@ -454,9 +366,12 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) } /* Max msg size is too large, turn off zcpy recv */ - if (ep->max_msg_size > ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size) { - EFA_INFO(FI_LOG_EP_CTRL, "max_msg_size (%zu) is greater than the mtu size limit: %zu. Zero-copy receive protocol will be disabled.\n", - ep->max_msg_size, ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size); + if (ep->base_ep.max_msg_size > ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size) { + EFA_INFO(FI_LOG_EP_CTRL, + "max_msg_size (%zu) is greater than the mtu size limit: %zu. " + "Zero-copy receive protocol will be disabled.\n", + ep->base_ep.max_msg_size, + ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size); ep->use_zcpy_rx = false; goto out; } @@ -482,11 +397,11 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) } /* Zero-copy receive requires P2P support. Disable it if any initialized HMEM iface does not support P2P. 
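(P2P here means the EFA device can access the HMEM buffer directly, as recorded in g_efa_hmem_info[iface].p2p_supported_by_device below.)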
*/ - for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; ++iface) { - hmem_info = &ep->base_ep.domain->hmem_info[iface]; - if (hmem_info->initialized && - !hmem_info->p2p_disabled_by_user && - !hmem_info->p2p_supported_by_device) { + EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { + if (g_efa_hmem_info[iface].initialized && + (ofi_hmem_p2p_disabled() || + ep->hmem_p2p_opt == FI_HMEM_P2P_DISABLED || + !g_efa_hmem_info[iface].p2p_supported_by_device)) { EFA_INFO(FI_LOG_EP_CTRL, "%s does not support P2P, zero-copy receive " "protocol will be disabled\n", @@ -530,6 +445,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, struct efa_domain *efa_domain = NULL; struct efa_rdm_ep *efa_rdm_ep = NULL; int ret, retv, i; + enum fi_hmem_iface iface; efa_rdm_ep = calloc(1, sizeof(*efa_rdm_ep)); if (!efa_rdm_ep) @@ -552,35 +468,45 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->shm_ep = NULL; } - efa_rdm_ep->user_info = fi_dupinfo(info); - if (!efa_rdm_ep->user_info) { - ret = -FI_ENOMEM; - goto err_free_ep; - } - efa_rdm_ep->host_id = efa_get_host_id(efa_env.host_id_file); if (efa_rdm_ep->host_id) { EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id); } - efa_rdm_ep->rx_size = info->rx_attr->size; - efa_rdm_ep->tx_size = info->tx_attr->size; - efa_rdm_ep->rx_iov_limit = info->rx_attr->iov_limit; - efa_rdm_ep->tx_iov_limit = info->tx_attr->iov_limit; - efa_rdm_ep->inject_size = info->tx_attr->inject_size; + /** + * These fields are set to the efa device's default limits in base_ep. + * Override them to the values supported by efa-rdm. + * The info->ep_attr->max_msg_size is UINT64_MAX for efa-rdm because + * it supports segmentation of a large message into small pieces that + * fit into the device limit. The info->tx_attr->inject_size is currently + * the MIN(efa_mtu_size - max_hdr_size, shm_inject_size) + * as it supports emulated injection by copying the user tx buffer into + * an internal bounce buffer. + */ + efa_rdm_ep->base_ep.max_msg_size = info->ep_attr->max_msg_size; + efa_rdm_ep->base_ep.max_rma_size = info->ep_attr->max_msg_size; + efa_rdm_ep->base_ep.inject_msg_size = info->tx_attr->inject_size; + efa_rdm_ep->base_ep.inject_rma_size = info->tx_attr->inject_size; + /* + * The base ep is configured with infinite retry; use a different default + * for efa_rdm_ep to allow libfabric-level retry.
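[Editor's note: illustrative sketch, not part of the diff] EFA_RDM_DEFAULT_RNR_RETRY gives the device a finite RNR retry budget so receiver-not-ready conditions eventually surface to the provider, which can then retry at the libfabric level as the comment above describes. An application can still override it through the FI_OPT_EFA_RNR_RETRY case handled later in this file; a hypothetical call against a struct fid_ep *ep, with 3 as an example value only:

    size_t rnr_retry = 3; /* device-level retries before RNR surfaces */
    int ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY,
                        &rnr_retry, sizeof(rnr_retry)); /* -FI_EINVAL on bad optlen */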
+ */ + efa_rdm_ep->base_ep.rnr_retry = EFA_RDM_DEFAULT_RNR_RETRY; + + /* efa_rdm_ep's own fields */ + efa_rdm_ep->max_tagged_size = info->ep_attr->max_msg_size; + efa_rdm_ep->max_atomic_size = info->ep_attr->max_msg_size; + efa_rdm_ep->inject_tagged_size = info->tx_attr->inject_size; + efa_rdm_ep->inject_atomic_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; - efa_rdm_ep->efa_device_iov_limit = efa_domain->device->rdm_info->tx_attr->iov_limit; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); efa_rdm_ep->shm_permitted = true; - efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; - efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size; efa_rdm_ep->msg_prefix_size = info->ep_attr->msg_prefix_size; - efa_rdm_ep->max_proto_hdr_size = efa_rdm_pkt_type_get_max_hdr_size(); efa_rdm_ep->mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size; efa_rdm_ep->max_data_payload_size = efa_rdm_ep->mtu_size - sizeof(struct efa_rdm_ctsdata_hdr) - sizeof(struct efa_rdm_ctsdata_opt_connid_hdr); - efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_ep->max_proto_hdr_size; + efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_pkt_type_get_max_hdr_size(); if (efa_env.tx_queue_size > 0 && efa_env.tx_queue_size < efa_rdm_ep->efa_max_outstanding_tx_ops) @@ -613,6 +539,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep_init_linked_lists(efa_rdm_ep); + efa_rdm_ep->cuda_api_permitted = (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18))); /* Set hmem_p2p_opt */ efa_rdm_ep->hmem_p2p_opt = FI_HMEM_P2P_DISABLED; @@ -622,16 +549,21 @@ * tighter requirements for the default p2p opt */ EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { - if (efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].initialized && - efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].p2p_supported_by_device) { - efa_rdm_ep->hmem_p2p_opt = efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].p2p_required_by_impl - ? FI_HMEM_P2P_REQUIRED - : FI_HMEM_P2P_PREFERRED; + iface = efa_hmem_ifaces[i]; + if (g_efa_hmem_info[iface].initialized && + g_efa_hmem_info[iface].p2p_supported_by_device) { + /* If the user is using libfabric API 1.18 or later, by default the EFA + * provider is permitted to use the CUDA library to support CUDA + * memory, so p2p is not required. + */ + efa_rdm_ep->hmem_p2p_opt = + (iface == FI_HMEM_CUDA && efa_rdm_ep->cuda_api_permitted) ?
+ FI_HMEM_P2P_PREFERRED : + FI_HMEM_P2P_REQUIRED; break; } } - efa_rdm_ep->cuda_api_permitted = (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18))); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = false; efa_rdm_ep->write_in_order_aligned_128_bytes = false; @@ -706,9 +638,9 @@ static int efa_rdm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) } break; case FI_CLASS_CQ: - cq = container_of(bfid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(bfid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); - ret = ofi_ep_bind_cq(&efa_rdm_ep->base_ep.util_ep, &cq->util_cq, flags); + ret = ofi_ep_bind_cq(&efa_rdm_ep->base_ep.util_ep, &cq->efa_cq.util_cq, flags); if (ret) return ret; @@ -786,7 +718,7 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) dlist_foreach_safe(&efa_rdm_ep->rxe_list, entry, tmp) { rxe = container_of(entry, struct efa_rdm_ope, ep_entry); - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Closing ep with unreleased rxe\n"); efa_rdm_rxe_release(rxe); } @@ -880,12 +812,12 @@ bool efa_rdm_ep_has_unfinished_send(struct efa_rdm_ep *efa_rdm_ep) static inline void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep) { - struct efa_rdm_cq *tx_cq, *rx_cq; + struct efa_cq *tx_cq, *rx_cq; ofi_genlock_lock(&efa_rdm_ep_domain(efa_rdm_ep)->srx_lock); - tx_cq = efa_rdm_ep_get_tx_rdm_cq(efa_rdm_ep); - rx_cq = efa_rdm_ep_get_rx_rdm_cq(efa_rdm_ep); + tx_cq = efa_base_ep_get_tx_cq(&efa_rdm_ep->base_ep); + rx_cq = efa_base_ep_get_rx_cq(&efa_rdm_ep->base_ep); while (efa_rdm_ep_has_unfinished_send(efa_rdm_ep)) { /* poll cq until empty */ @@ -899,30 +831,6 @@ void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep) ofi_genlock_unlock(&efa_rdm_ep_domain(efa_rdm_ep)->srx_lock); } -static inline -void efa_rdm_ep_remove_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) -{ - int i; - struct efa_cntr *efa_cntr; - struct util_cntr *util_cntr; - struct efa_rdm_cq *tx_cq, *rx_cq; - - tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); - rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); - - for (i = 0; i< CNTR_CNT; i++) { - util_cntr = ep->base_ep.util_ep.cntrs[i]; - if (util_cntr) { - efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); - if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) - efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); - - if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) - efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); - } - } -} - static inline void efa_rdm_ep_remove_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) { @@ -935,17 +843,53 @@ void efa_rdm_ep_remove_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) * It must happen after ofi_endpoint_close * so we have cq's reference counters updated. 
*/ - if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) { - efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + if (tx_cq && !ofi_atomic_get32(&tx_cq->efa_cq.util_cq.ref)) { + efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); if (rx_cq) - efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); } - if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) { - efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + if (rx_cq && !ofi_atomic_get32(&rx_cq->efa_cq.util_cq.ref)) { + efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); if (tx_cq) - efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); + } +} + +/** + * @brief Clean up efa_rdm_ep's shm ep-level resources on a best-effort basis + * + * @param efa_rdm_ep pointer to efa rdm ep + * @return int FI_SUCCESS on success, negative integer on failure + */ +static int efa_rdm_ep_close_shm_ep_resources(struct efa_rdm_ep *efa_rdm_ep) +{ + int ret, retv = 0; + + if (efa_rdm_ep->shm_srx) { + ret = fi_close(&efa_rdm_ep->shm_srx->fid); + if (ret) { + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm srx\n"); + retv = ret; + } + efa_rdm_ep->shm_srx = NULL; + } + + if (efa_rdm_ep->shm_peer_srx) { + free(efa_rdm_ep->shm_peer_srx); + efa_rdm_ep->shm_peer_srx = NULL; } + + if (efa_rdm_ep->shm_ep) { + ret = fi_close(&efa_rdm_ep->shm_ep->fid); + if (ret) { + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm ep\n"); + retv = ret; + } + efa_rdm_ep->shm_ep = NULL; + } + + return retv; } /** @@ -978,7 +922,7 @@ static int efa_rdm_ep_close(struct fid *fid) * with other threads progressing the cq.
*/ efa_base_ep_close_util_ep(&efa_rdm_ep->base_ep); - efa_rdm_ep_remove_cntr_ibv_cq_poll_list(efa_rdm_ep); + efa_base_ep_remove_cntr_ibv_cq_poll_list(&efa_rdm_ep->base_ep); efa_rdm_ep_remove_cq_ibv_cq_poll_list(efa_rdm_ep); @@ -988,22 +932,15 @@ static int efa_rdm_ep_close(struct fid *fid) retv = ret; } - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - if (ret) { - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm EP\n"); - retv = ret; - } - } + ret = efa_rdm_ep_close_shm_ep_resources(efa_rdm_ep); + if (ret) + retv = ret; efa_rdm_ep_destroy_buffer_pools(efa_rdm_ep); if (efa_rdm_ep->pke_vec) free(efa_rdm_ep->pke_vec); - if (efa_rdm_ep->user_info) - fi_freeinfo(efa_rdm_ep->user_info); - free(efa_rdm_ep); return retv; } @@ -1032,6 +969,9 @@ void efa_rdm_ep_set_extra_info(struct efa_rdm_ep *ep) ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE; + if (efa_use_unsolicited_write_recv()) + ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + if (ep->use_zcpy_rx) { /* * When zcpy rx is enabled, an extra QP is created to @@ -1063,12 +1003,8 @@ static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) struct efa_av *efa_av; struct efa_rdm_cq *efa_rdm_cq; - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - if (ret) - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm ep\n"); - efa_rdm_ep->shm_ep = NULL; - } + + (void) efa_rdm_ep_close_shm_ep_resources(efa_rdm_ep); efa_av = efa_rdm_ep->base_ep.av; if (efa_av->shm_rdm_av) { @@ -1078,7 +1014,7 @@ static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) efa_av->shm_rdm_av = NULL; } - efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, util_cq); + efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, efa_cq.util_cq); if (efa_rdm_cq->shm_cq) { ret = fi_close(&efa_rdm_cq->shm_cq->fid); if (ret) @@ -1086,7 +1022,7 @@ static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) efa_rdm_cq->shm_cq = NULL; } - efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq); + efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, efa_cq.util_cq); if (efa_rdm_cq->shm_cq) { ret = fi_close(&efa_rdm_cq->shm_cq->fid); if (ret) @@ -1139,7 +1075,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) use_shm = true; - assert(ep->user_info); + assert(ep->base_ep.info); /* * shm provider must make cuda calls to transfer cuda memory. @@ -1149,7 +1085,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) * AWS Neuron and Habana Synapse, have no SHM provider * support anyways, so disabling SHM will not impact them. 
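[Editor's note: illustrative sketch, not part of the diff] The same teardown also runs when the application disables shm explicitly. Assuming a struct fid_ep *ep already opened on this provider, the opt-out path handled by efa_rdm_ep_set_shared_memory_permitted() later in this diff looks like:

    bool shm_permitted = false;
    int ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT,
                        FI_OPT_SHARED_MEMORY_PERMITTED,
                        &shm_permitted, sizeof(shm_permitted));
    /* efa_rdm_ep_update_shm() then observes !ep->shm_permitted and calls
     * efa_rdm_ep_close_shm_resources() to release the shm ep, av, and cqs */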
*/ - if (((ep->user_info->caps & FI_HMEM) + if (((ep->base_ep.info->caps & FI_HMEM) && hmem_ops[FI_HMEM_CUDA].initialized && !ep->cuda_api_permitted) || !ep->shm_permitted) { @@ -1160,36 +1096,6 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) efa_rdm_ep_close_shm_resources(ep); } -static inline -int efa_rdm_ep_insert_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) -{ - int i, ret; - struct efa_cntr *efa_cntr; - struct util_cntr *util_cntr; - struct efa_rdm_cq *tx_cq, *rx_cq; - tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); - rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); - - for (i = 0; i < CNTR_CNT; i++) { - util_cntr = ep->base_ep.util_ep.cntrs[i]; - if (util_cntr) { - efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); - if (tx_cq) { - ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); - if (ret) - return ret; - } - if (rx_cq) { - ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); - if (ret) - return ret; - } - } - } - - return FI_SUCCESS; -} - static inline int efa_rdm_ep_insert_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) { @@ -1200,27 +1106,33 @@ int efa_rdm_ep_insert_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); if (tx_cq) { - ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); if (ret) return ret; if (rx_cq) { - ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); if (ret) return ret; } + ofi_genlock_lock(&tx_cq->efa_cq.util_cq.ep_list_lock); + tx_cq->need_to_scan_ep_list = true; + ofi_genlock_unlock(&tx_cq->efa_cq.util_cq.ep_list_lock); } if (rx_cq) { - ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); if (ret) return ret; if (tx_cq) { - ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); if (ret) return ret; } + ofi_genlock_lock(&rx_cq->efa_cq.util_cq.ep_list_lock); + rx_cq->need_to_scan_ep_list = true; + ofi_genlock_unlock(&rx_cq->efa_cq.util_cq.ep_list_lock); } return FI_SUCCESS; @@ -1240,8 +1152,8 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) int ret = 0; struct fi_peer_srx_context peer_srx_context = {0}; struct fi_rx_attr peer_srx_attr = {0}; - struct fid_ep *peer_srx_ep = NULL; struct util_srx_ctx *srx_ctx; + bool create_user_recv_qp = false; switch (command) { case FI_ENABLE: @@ -1265,15 +1177,17 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) * TODO: Distinguish between inline data sizes for RDMA {send,write} * when supported */ - if (ep->use_zcpy_rx) - ep->inject_size = MIN(ep->inject_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); - - ret = efa_rdm_ep_create_base_ep_ibv_qp(ep); - if (ret) - return ret; + if (ep->use_zcpy_rx) { + ep->base_ep.inject_msg_size = + MIN(ep->base_ep.inject_msg_size, + efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + 
ep->base_ep.inject_rma_size = + MIN(ep->base_ep.inject_rma_size, + efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + create_user_recv_qp = true; + } - /* efa_base_ep_enable destroys qp in the error path */ - ret = efa_base_ep_enable(&ep->base_ep); + ret = efa_base_ep_create_and_enable_qp(&ep->base_ep, create_user_recv_qp); if (ret) return ret; @@ -1281,7 +1195,7 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) if (ret) goto err_destroy_qp; - ret = efa_rdm_ep_insert_cntr_ibv_cq_poll_list(ep); + ret = efa_base_ep_insert_cntr_ibv_cq_poll_list(&ep->base_ep); if (ret) goto err_destroy_qp; @@ -1292,7 +1206,7 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) efa_rdm_ep_set_extra_info(ep); ep_addr_strlen = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &ep_addr_strlen); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &ep_addr_strlen); EFA_INFO(FI_LOG_EP_CTRL, "libfabric %s efa endpoint created! address: %s\n", fi_tostr("1", FI_TYPE_VERSION), ep_addr_str); @@ -1304,26 +1218,36 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) * shared memory region. */ if (ep->shm_ep) { - peer_srx_context.srx = util_get_peer_srx(ep->peer_srx_ep); + ep->shm_peer_srx = calloc(1, sizeof(*ep->shm_peer_srx)); + if (!ep->shm_peer_srx) { + ret = -FI_ENOMEM; + goto err_close_shm; + } + memcpy(ep->shm_peer_srx, util_get_peer_srx(ep->peer_srx_ep), + sizeof(*ep->shm_peer_srx)); + + peer_srx_context.size = sizeof(peer_srx_context); + peer_srx_context.srx = ep->shm_peer_srx; + peer_srx_attr.op_flags |= FI_PEER; ret = fi_srx_context(efa_rdm_ep_domain(ep)->shm_domain, - &peer_srx_attr, &peer_srx_ep, &peer_srx_context); + &peer_srx_attr, &ep->shm_srx, &peer_srx_context); if (ret) - goto err_unlock; + goto err_close_shm; shm_ep_name_len = EFA_SHM_NAME_MAX; ret = efa_shm_ep_name_construct(shm_ep_name, &shm_ep_name_len, &ep->base_ep.src_addr); if (ret < 0) - goto err_unlock; + goto err_close_shm; fi_setname(&ep->shm_ep->fid, shm_ep_name, shm_ep_name_len); /* Bind srx to shm ep */ - ret = fi_ep_bind(ep->shm_ep, &ep->peer_srx_ep->fid, 0); + ret = fi_ep_bind(ep->shm_ep, &ep->shm_srx->fid, 0); if (ret) - goto err_unlock; + goto err_close_shm; ret = fi_enable(ep->shm_ep); if (ret) - goto err_unlock; + goto err_close_shm; } ofi_genlock_unlock(srx_ctx->lock); break; @@ -1334,7 +1258,8 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) return ret; -err_unlock: +err_close_shm: + efa_rdm_ep_close_shm_ep_resources(ep); ofi_genlock_unlock(srx_ctx->lock); err_destroy_qp: efa_base_ep_destruct_qp(&ep->base_ep); @@ -1383,7 +1308,9 @@ static int efa_rdm_ep_set_fi_hmem_p2p_opt(struct efa_rdm_ep *efa_rdm_ep, int opt * tighter restrictions on valid p2p options. */ EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { - err = efa_domain_hmem_validate_p2p_opt(efa_rdm_ep_domain(efa_rdm_ep), efa_hmem_ifaces[i], opt); + err = efa_hmem_validate_p2p_opt( + efa_hmem_ifaces[i], opt, + efa_rdm_ep->base_ep.info->fabric_attr->api_version); if (err == -FI_ENODATA) continue; @@ -1419,7 +1346,7 @@ static int efa_rdm_ep_set_cuda_api_permitted(struct efa_rdm_ep *ep, bool cuda_ap /* CUDA memory can be supported by using either peer to peer or CUDA API. 
If neither is * available, we cannot support CUDA memory */ - if (!efa_rdm_ep_domain(ep)->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) + if (!g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) return -FI_EOPNOTSUPP; ep->cuda_api_permitted = false; @@ -1437,7 +1364,7 @@ static int efa_rdm_ep_set_cuda_api_permitted(struct efa_rdm_ep *ep, bool cuda_ap static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool shm_permitted) { if (!shm_permitted) { - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "FI_OPT_SHARED_MEMORY_PERMITTED set to false\n"); ep->shm_permitted = false; return FI_SUCCESS; @@ -1455,110 +1382,6 @@ static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool sh return 0; } -/** - * @brief Conditionally set efa_rdm_ep#max_msg_size per user's request - * - * If the requested msg size exceeds the EFA provider's default value, the - * request is rejected. - * - * @param[in,out] ep EFA RDM endpoint - * @param[in] max_msg_size Requested maximum msg size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_MAX_MSG_SIZE - */ -static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_size) -{ - if (max_msg_size > ep->user_info->ep_attr->max_msg_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_MAX_MSG_SIZE " - "exceeds the maximum (%zu)\n", - max_msg_size, ep->user_info->ep_attr->max_msg_size); - return -FI_EINVAL; - } - ep->max_msg_size = max_msg_size; - return 0; -} - -/** - * @brief Conditionally set efa_rdm_ep#max_rma_size per user's request - * - * If the requested inject size exceeds the EFA provider's default value, the - * request is rejected. - * - * @param[in,out] ep EFA RDM endpoint - * @param[in] max_rma_size Requested max RMA size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_MAX_RMA_SIZE - */ -static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_size) -{ - if (max_rma_size > ep->user_info->ep_attr->max_msg_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_MAX_RMA_SIZE " - "exceeds the maximum (%zu)\n", - max_rma_size, ep->user_info->ep_attr->max_msg_size); - return -FI_EINVAL; - } - ep->max_rma_size = max_rma_size; - return 0; -} - -/** - * @brief Conditionally set efa_rdm_ep#inject_size per user's request - * - * If the requested inject size exceeds the EFA provider's default value, the - * request is rejected. - * - * @param[in,out] ep EFA RDM endpoint - * @param[in] inject_size Requested inject size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_INJECT_MSG_SIZE - */ -static int efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_msg_size) -{ - if (inject_msg_size > ep->user_info->tx_attr->inject_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_INJECT_MSG_SIZE " - "exceeds the maximum (%zu)\n", - inject_msg_size, ep->user_info->tx_attr->inject_size); - return -FI_EINVAL; - } - ep->inject_size = inject_msg_size; - return 0; -} - -/** - * @brief Conditionally set efa_rdm_ep#inject_size per user's request - * - * If the requested inject size exceeds the EFA provider's default value, the - * request is rejected. 
- * - * @param[in,out] ep EFA RDM endpoint - * @param[in] inject_size Requested inject size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_INJECT_RMA_SIZE - */ -static int efa_rdm_ep_set_inject_rma_size(struct efa_rdm_ep *ep, size_t inject_rma_size) -{ - if (inject_rma_size > ep->user_info->tx_attr->inject_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_INJECT_RMA_SIZE " - "exceeds the maximum (%zu)\n", - inject_rma_size, ep->user_info->tx_attr->inject_size); - return -FI_EINVAL; - } - ep->inject_size = inject_rma_size; - return 0; -} - /** * @brief set use_device_rdma flag in efa_rdm_ep. * @@ -1628,55 +1451,6 @@ static int efa_rdm_ep_set_use_device_rdma(struct efa_rdm_ep *ep, bool use_device return 0; } -/** - * @brief check the in order aligned 128 bytes support for a given ibv_wr_op code - * - * @param ep efa_rdm_ep - * @param op_code ibv wr op code - * @return int 0 if in order aligned 128 bytes is supported, -FI_EOPNOTSUPP if - * it is not supported. Other negative integer for other errors. - */ -static -int efa_rdm_ep_check_qp_in_order_aligned_128_bytes(struct efa_rdm_ep *ep, - enum ibv_wr_opcode op_code) -{ - struct efa_qp *qp = NULL; - struct ibv_qp_init_attr_ex attr_ex = {0}; - int ret, retv; - struct ibv_cq_ex *ibv_cq_ex = NULL; - enum ibv_cq_ex_type ibv_cq_ex_type; - struct fi_cq_attr cq_attr = {0}; - - ret = efa_cq_ibv_cq_ex_open(&cq_attr, efa_rdm_ep_domain(ep)->device->ibv_ctx, &ibv_cq_ex, &ibv_cq_ex_type); - if (ret) { - EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %d\n", ret); - ret = -FI_EINVAL; - goto out; - } - - /* Create a dummy qp for query only */ - efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq_ex, ibv_cq_ex); - - ret = efa_qp_create(&qp, &attr_ex); - if (ret) - goto out; - - if (!efa_qp_support_op_in_order_aligned_128_bytes(qp, op_code)) - ret = -FI_EOPNOTSUPP; - -out: - if (qp) - efa_qp_destruct(qp); - - if (ibv_cq_ex) { - retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(ibv_cq_ex)); - if (retv) - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close ibv cq: %s\n", - fi_strerror(-retv)); - } - return ret; -} - /** * @brief implement the fi_setopt() API for EFA RDM endpoint * @param[in] fid fid to endpoint @@ -1692,7 +1466,6 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, { struct efa_rdm_ep *efa_rdm_ep; int intval, ret; - struct util_srx_ctx *srx; efa_rdm_ep = container_of(fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); @@ -1705,8 +1478,6 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, return -FI_EINVAL; efa_rdm_ep->min_multi_recv_size = *(size_t *)optval; - srx = util_get_peer_srx(efa_rdm_ep->peer_srx_ep)->ep_fid.fid.context; - srx->min_multi_recv_size = *(size_t *)optval; break; case FI_OPT_EFA_RNR_RETRY: if (optlen != sizeof(size_t)) @@ -1760,32 +1531,28 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, return ret; break; case FI_OPT_MAX_MSG_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_max_msg_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + EFA_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, efa_rdm_ep->base_ep.max_msg_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + break; + case FI_OPT_MAX_TAGGED_SIZE: + EFA_EP_SETOPT_THRESHOLD(MAX_TAGGED_SIZE, efa_rdm_ep->max_tagged_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_RMA_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_max_rma_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + 
EFA_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, efa_rdm_ep->base_ep.max_rma_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + break; + case FI_OPT_MAX_ATOMIC_SIZE: + EFA_EP_SETOPT_THRESHOLD(MAX_ATOMIC_SIZE, efa_rdm_ep->max_atomic_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_INJECT_MSG_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_inject_msg_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + EFA_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, efa_rdm_ep->base_ep.inject_msg_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + break; + case FI_OPT_INJECT_TAGGED_SIZE: + EFA_EP_SETOPT_THRESHOLD(INJECT_TAGGED_SIZE, efa_rdm_ep->inject_tagged_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_RMA_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_inject_rma_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + EFA_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, efa_rdm_ep->base_ep.inject_rma_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + break; + case FI_OPT_INJECT_ATOMIC_SIZE: + EFA_EP_SETOPT_THRESHOLD(INJECT_ATOMIC_SIZE, efa_rdm_ep->inject_atomic_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_EFA_USE_DEVICE_RDMA: if (optlen != sizeof(bool)) @@ -1802,7 +1569,7 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, * application buffer on device */ if (*(bool *)optval) { - ret = efa_rdm_ep_check_qp_in_order_aligned_128_bytes(efa_rdm_ep, IBV_WR_RDMA_READ); + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(&efa_rdm_ep->base_ep, IBV_WR_RDMA_READ); if (ret) return ret; } @@ -1812,14 +1579,14 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, if (optlen != sizeof(bool)) return -FI_EINVAL; if (*(bool *)optval) { - ret = efa_rdm_ep_check_qp_in_order_aligned_128_bytes(efa_rdm_ep, IBV_WR_RDMA_WRITE); + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(&efa_rdm_ep->base_ep, IBV_WR_RDMA_WRITE); if (ret) return ret; } efa_rdm_ep->write_in_order_aligned_128_bytes = *(bool *)optval; break; default: - EFA_WARN(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); + EFA_INFO(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); return -FI_ENOPROTOOPT; } @@ -1866,25 +1633,49 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, case FI_OPT_MAX_MSG_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->max_msg_size; + *(size_t *) optval = efa_rdm_ep->base_ep.max_msg_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_MAX_TAGGED_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = efa_rdm_ep->max_tagged_size; *optlen = sizeof (size_t); break; case FI_OPT_MAX_RMA_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->max_rma_size; + *(size_t *) optval = efa_rdm_ep->base_ep.max_rma_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_MAX_ATOMIC_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = efa_rdm_ep->max_atomic_size; *optlen = sizeof (size_t); break; case FI_OPT_INJECT_MSG_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->inject_size; + *(size_t *) optval = efa_rdm_ep->base_ep.inject_msg_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_INJECT_TAGGED_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = efa_rdm_ep->inject_tagged_size; *optlen = sizeof (size_t); 
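[Editor's note: illustrative sketch, not part of the diff] EFA_EP_SETOPT_THRESHOLD, used by the setopt cases above, is not defined in this diff. From its call sites it must validate optlen, reject requests above the provider's advertised maximum, and store the value; the call sites carry no trailing semicolon, so the expansion is presumably a bare block rather than a do/while. A plausible expansion, mirroring the removed efa_rdm_ep_set_max_msg_size()-style helpers:

    #define EFA_EP_SETOPT_THRESHOLD(opt, field, threshold) {            \
            if (optlen != sizeof(size_t))                               \
                    return -FI_EINVAL;                                  \
            if (*(size_t *) optval > (threshold)) {                     \
                    EFA_WARN(FI_LOG_EP_CTRL,                            \
                             "Requested size of %zu for FI_OPT_" #opt   \
                             " exceeds the maximum (%zu)\n",            \
                             *(size_t *) optval, (size_t) (threshold)); \
                    return -FI_EINVAL;                                  \
            }                                                           \
            (field) = *(size_t *) optval;                               \
    }

This collapses the five near-identical removed helpers into one definition per option while keeping the same -FI_EINVAL behavior.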
break; case FI_OPT_INJECT_RMA_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->inject_size; + *(size_t *) optval = efa_rdm_ep->base_ep.inject_rma_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_INJECT_ATOMIC_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = efa_rdm_ep->inject_atomic_size; *optlen = sizeof (size_t); break; case FI_OPT_EFA_EMULATED_READ: @@ -1905,26 +1696,8 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, *(bool *)optval = true; *optlen = sizeof(bool); break; - case FI_OPT_EFA_USE_DEVICE_RDMA: - if (*optlen < sizeof(bool)) - return -FI_ETOOSMALL; - *(bool *)optval = efa_rdm_ep->use_device_rdma; - *optlen = sizeof(bool); - break; - case FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES: - if (*optlen < sizeof(bool)) - return -FI_ETOOSMALL; - *(bool *)optval = efa_rdm_ep->sendrecv_in_order_aligned_128_bytes; - *optlen = sizeof(bool); - break; - case FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES: - if (*optlen < sizeof(bool)) - return -FI_ETOOSMALL; - *(bool *)optval = efa_rdm_ep->write_in_order_aligned_128_bytes; - *optlen = sizeof(bool); - break; default: - EFA_WARN(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); + EFA_INFO(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); return -FI_ENOPROTOOPT; } diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index bf98cb08ef3..9cf297acfc2 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -25,31 +25,6 @@ struct efa_ep_addr *efa_rdm_ep_raw_addr(struct efa_rdm_ep *ep) return &ep->base_ep.src_addr; } -const char *efa_rdm_ep_raw_addr_str(struct efa_rdm_ep *ep, char *buf, size_t *buflen) -{ - return ofi_straddr(buf, buflen, FI_ADDR_EFA, efa_rdm_ep_raw_addr(ep)); -} - -/** - * @brief return peer's raw address in #efa_ep_addr - * - * @param[in] ep end point - * @param[in] addr libfabric address - * @returns - * If peer exists, return peer's raw addrress as pointer to #efa_ep_addr; - * Otherwise, return NULL - * @relates efa_rdm_peer - */ -struct efa_ep_addr *efa_rdm_ep_get_peer_raw_addr(struct efa_rdm_ep *ep, fi_addr_t addr) -{ - struct efa_av *efa_av; - struct efa_conn *efa_conn; - - efa_av = ep->base_ep.av; - efa_conn = efa_av_addr_to_conn(efa_av, addr); - return efa_conn ? efa_conn->ep_addr : NULL; -} - /** * @brief return peer's ahn * @@ -69,21 +44,6 @@ int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr) return efa_conn ? efa_conn->ah->ahn : -1; } -/** - * @brief return peer's raw address in a reable string - * - * @param[in] ep end point - * @param[in] addr libfabric address - * @param[out] buf a buffer tat to be used to store string - * @param[in,out] buflen length of `buf` as input. length of the string as output. 
- * @relates efa_rdm_peer - * @return a string with peer's raw address - */ -const char *efa_rdm_ep_get_peer_raw_addr_str(struct efa_rdm_ep *ep, fi_addr_t addr, char *buf, size_t *buflen) -{ - return ofi_straddr(buf, buflen, FI_ADDR_EFA, efa_rdm_ep_get_peer_raw_addr(ep, addr)); -} - /** * @brief get pointer to efa_rdm_peer structure for a given libfabric address * @@ -205,19 +165,17 @@ struct efa_rdm_ope *efa_rdm_ep_alloc_rxe(struct efa_rdm_ep *ep, fi_addr_t addr, * @param[in] rxe rxe that contain user buffer information * @param[in] flags user supplied flags passed to fi_recv */ -int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe, size_t flags) +int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe, uint64_t flags) { struct efa_rdm_pke *pkt_entry = NULL; size_t rx_iov_offset = 0; int err, rx_iov_index = 0; - assert(rxe->iov_count > 0 && rxe->iov_count <= ep->rx_iov_limit); + assert(rxe->iov_count > 0 && rxe->iov_count <= ep->base_ep.info->rx_attr->iov_limit); assert(rxe->iov[0].iov_len >= ep->msg_prefix_size); pkt_entry = efa_rdm_pke_alloc(ep, ep->user_rx_pkt_pool, EFA_RDM_PKE_FROM_USER_RX_POOL); - if (OFI_UNLIKELY(!pkt_entry)) { - EFA_WARN(FI_LOG_EP_DATA, "Failed to allocate pkt_entry for user rx\n"); - return -FI_ENOMEM; - } + if (OFI_UNLIKELY(!pkt_entry)) + return -FI_EAGAIN; pkt_entry->ope = rxe; rxe->state = EFA_RDM_RXE_MATCHED; @@ -244,7 +202,7 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe pkt_entry->payload_mr = rxe->desc[rx_iov_index]; pkt_entry->payload_size = ofi_total_iov_len(&rxe->iov[rx_iov_index], rxe->iov_count - rx_iov_index) - rx_iov_offset; - err = efa_rdm_pke_recvv(&pkt_entry, 1); + err = efa_rdm_pke_user_recvv(&pkt_entry, 1, flags); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_EP_CTRL, "failed to post user supplied buffer %d (%s)\n", -err, @@ -743,7 +701,11 @@ int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep) { int i, err; - if (ep->efa_rx_pkts_to_post == 0) + /** + * When efa_env.internal_rx_refill_threshold > efa_rdm_ep_get_rx_pool_size(ep), + * we should always refill when the pool is empty. + */ + if (ep->efa_rx_pkts_to_post < MIN(efa_env.internal_rx_refill_threshold, efa_rdm_ep_get_rx_pool_size(ep))) return 0; assert(ep->efa_rx_pkts_to_post + ep->efa_rx_pkts_posted <= ep->efa_max_outstanding_rx_ops); @@ -843,16 +805,6 @@ int efa_rdm_ep_grow_rx_pools(struct efa_rdm_ep *ep) } } - if (ep->use_zcpy_rx) { - err = ofi_bufpool_grow(ep->user_rx_pkt_pool); - if (OFI_UNLIKELY(err)) { - EFA_WARN(FI_LOG_CQ, - "cannot allocate memory for user recv pkt pool. error: %s\n", - strerror(-err)); - return err; - } - } - return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index a19040c9b93..ad65781142e 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -60,7 +60,6 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx int tagged; int eager_rtm, medium_rtm, longcts_rtm, readbase_rtm, iface; size_t eager_rtm_max_data_size; - struct efa_hmem_info *hmem_info; bool delivery_complete_requested; assert(txe->op == ofi_op_msg || txe->op == ofi_op_tagged); @@ -68,7 +67,6 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx assert(tagged == 0 || tagged == 1); iface = txe->desc[0] ? 
((struct efa_mr*) txe->desc[0])->peer.iface : FI_HMEM_SYSTEM; - hmem_info = efa_rdm_ep_domain(efa_rdm_ep)->hmem_info; if (txe->fi_flags & FI_INJECT || efa_both_support_zero_hdr_data_transfer(efa_rdm_ep, txe->peer)) delivery_complete_requested = false; @@ -88,15 +86,16 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx readbase_rtm = efa_rdm_peer_select_readbase_rtm(txe->peer, efa_rdm_ep, txe); - if (txe->total_len >= hmem_info[iface].min_read_msg_size && - efa_rdm_interop_rdma_read(efa_rdm_ep, txe->peer) && - (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(efa_rdm_ep)))) + if (use_p2p && + txe->total_len >= g_efa_hmem_info[iface].min_read_msg_size && + efa_rdm_interop_rdma_read(efa_rdm_ep, txe->peer) && + (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(efa_rdm_ep)))) return readbase_rtm; if (txe->total_len <= eager_rtm_max_data_size) return eager_rtm; - if (txe->total_len <= hmem_info[iface].max_medium_msg_size) + if (txe->total_len <= g_efa_hmem_info[iface].max_medium_msg_size) return medium_rtm; return longcts_rtm; @@ -164,9 +163,12 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, struct efa_rdm_peer *pee struct efa_rdm_ope *txe; struct util_srx_ctx *srx_ctx; + efa_rdm_tracepoint(send_begin_msg_context, + (size_t) msg->context, (size_t) msg->addr); + srx_ctx = efa_rdm_ep_get_peer_srx_ctx(ep); - assert(msg->iov_count <= ep->tx_iov_limit); + assert(msg->iov_count <= ep->base_ep.info->tx_attr->iov_limit); efa_perfset_start(ep, perf_efa_tx); ofi_genlock_lock(srx_ctx->lock); @@ -193,8 +195,6 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, struct efa_rdm_peer *pee efa_rdm_tracepoint(send_begin, txe->msg_id, (size_t) txe->cq_entry.op_context, txe->total_len); - efa_rdm_tracepoint(send_begin_msg_context, - (size_t) msg->context, (size_t) msg->addr); err = efa_rdm_msg_post_rtm(ep, txe); if (OFI_UNLIKELY(err)) { @@ -291,7 +291,7 @@ ssize_t efa_rdm_msg_send(struct fid_ep *ep, const void *buf, size_t len, int ret; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) @@ -323,7 +323,7 @@ ssize_t efa_rdm_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, int ret; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) @@ -355,7 +355,7 @@ ssize_t efa_rdm_msg_inject(struct fid_ep *ep, const void *buf, size_t len, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->base_ep.inject_msg_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -383,7 +383,7 @@ ssize_t efa_rdm_msg_injectdata(struct fid_ep *ep, const void *buf, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->base_ep.inject_msg_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -493,7 +493,7 @@ ssize_t efa_rdm_msg_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, int ret; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= 
efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) @@ -526,7 +526,7 @@ ssize_t efa_rdm_msg_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len int ret; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) @@ -558,7 +558,7 @@ ssize_t efa_rdm_msg_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->inject_tagged_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -585,7 +585,7 @@ ssize_t efa_rdm_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t l struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->inject_tagged_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -892,7 +892,7 @@ ssize_t efa_rdm_msg_generic_recv(struct efa_rdm_ep *ep, const struct fi_msg *msg struct efa_rdm_ope *rxe; struct util_srx_ctx *srx_ctx; - assert(msg->iov_count <= ep->rx_iov_limit); + assert(msg->iov_count <= ep->base_ep.info->rx_attr->iov_limit); efa_perfset_start(ep, perf_efa_recv); @@ -918,6 +918,9 @@ ssize_t efa_rdm_msg_generic_recv(struct efa_rdm_ep *ep, const struct fi_msg *msg } ret = efa_rdm_ep_post_user_recv_buf(ep, rxe, flags); + if (OFI_UNLIKELY(ret)) + efa_rdm_rxe_release(rxe); + ofi_genlock_unlock(srx_ctx->lock); } else if (op == ofi_op_tagged) { ret = util_srx_generic_trecv(ep->peer_srx_ep, msg->msg_iov, msg->desc, diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 5002c938f45..58a0f51ecaa 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -58,7 +58,7 @@ void efa_rdm_txe_construct(struct efa_rdm_ope *txe, txe->cq_entry.len = ofi_total_iov_len(txe->iov, txe->iov_count); txe->cq_entry.buf = OFI_LIKELY(txe->cq_entry.len > 0) ? 
txe->iov[0].iov_base : NULL; - if (ep->user_info->mode & FI_MSG_PREFIX) { + if (ep->base_ep.info->mode & FI_MSG_PREFIX) { ofi_consume_iov_desc(txe->iov, txe->desc, &txe->iov_count, ep->msg_prefix_size); } txe->total_len = ofi_total_iov_len(txe->iov, txe->iov_count); @@ -556,6 +556,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) struct dlist_entry *tmp; struct efa_rdm_pke *pkt_entry; int write_cq_err; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; assert(rxe->type == EFA_RDM_RXE); @@ -602,9 +603,11 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) err_entry.buf = rxe->cq_entry.buf; err_entry.data = rxe->cq_entry.data; err_entry.tag = rxe->cq_entry.tag; - if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, rxe->addr, err, prov_errno, - &err_entry.err_data, &err_entry.err_data_size))) { + if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, rxe->addr, prov_errno, + err_msg, &err_entry.err_data_size))) { err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; } EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", @@ -660,6 +663,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) struct dlist_entry *tmp; struct efa_rdm_pke *pkt_entry; int write_cq_err; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; ep = txe->ep; memset(&err_entry, 0, sizeof(err_entry)); @@ -694,9 +698,11 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) err_entry.buf = txe->cq_entry.buf; err_entry.data = txe->cq_entry.data; err_entry.tag = txe->cq_entry.tag; - if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, txe->addr, err, prov_errno, - &err_entry.err_data, &err_entry.err_data_size))) { + if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, txe->addr, prov_errno, + err_msg, &err_entry.err_data_size))) { err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; } EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", @@ -1066,6 +1072,12 @@ void efa_rdm_ope_handle_recv_completed(struct efa_rdm_ope *ope) efa_rdm_rxe_report_completion(rxe); } + if (ope->internal_flags & EFA_RDM_OPE_READ_NACK) { + assert(ope->type == EFA_RDM_RXE); + /* Apply to both DC and non-DC */ + efa_rdm_rxe_map_remove(&ope->ep->rxe_map, ope->msg_id, ope->peer->efa_fiaddr, ope); + } + /* As can be seen, this function does not release rxe when * efa_rdm_ope_post_send_or_queue() was successful. * @@ -1106,9 +1118,6 @@ void efa_rdm_ope_handle_recv_completed(struct efa_rdm_ope *ope) return; } - if (ope->internal_flags & EFA_RDM_OPE_READ_NACK) - efa_rdm_rxe_map_remove(&ope->ep->rxe_map, ope->msg_id, ope->peer->efa_fiaddr, ope); - if (ope->type == EFA_RDM_TXE) { efa_rdm_txe_release(ope); } else { @@ -1506,7 +1515,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) if (ope->fi_flags & FI_INJECT) { assert(ope->iov_count == 1); - assert(ope->total_len <= ep->inject_size); + assert(ope->total_len <= ep->base_ep.inject_rma_size); copied = efa_rdm_pke_copy_from_hmem_iov( ope->desc[iov_idx], pkt_entry, ope, sizeof(struct efa_rdm_rma_context_pkt), 0, @@ -1773,6 +1782,8 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, int pkt_type, ssize_t err) { + bool delivery_complete_requested = ope->fi_flags & FI_DELIVERY_COMPLETE; + if (err == -FI_ENOMR) { /* Long read and runting read protocols could fail because of a * lack of memory registrations. 
In that case, we retry with @@ -1781,20 +1792,20 @@ ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, switch (pkt_type) { case EFA_RDM_LONGREAD_MSGRTM_PKT: case EFA_RDM_RUNTREAD_MSGRTM_PKT: - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS untagged " "protocol because memory registration limit " "was reached on the sender\n"); return efa_rdm_ope_post_send_or_queue( - ope, EFA_RDM_LONGCTS_MSGRTM_PKT); + ope, delivery_complete_requested ? EFA_RDM_DC_LONGCTS_MSGRTM_PKT : EFA_RDM_LONGCTS_MSGRTM_PKT); case EFA_RDM_LONGREAD_TAGRTM_PKT: case EFA_RDM_RUNTREAD_TAGRTM_PKT: - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS tagged protocol " "because memory registration limit was " "reached on the sender\n"); return efa_rdm_ope_post_send_or_queue( - ope, EFA_RDM_LONGCTS_TAGRTM_PKT); + ope, delivery_complete_requested ? EFA_RDM_DC_LONGCTS_TAGRTM_PKT : EFA_RDM_LONGCTS_TAGRTM_PKT); default: return err; } diff --git a/prov/efa/src/rdm/efa_rdm_ope.h b/prov/efa/src/rdm/efa_rdm_ope.h index d8483d96223..626eb6dc8d4 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.h +++ b/prov/efa/src/rdm/efa_rdm_ope.h @@ -110,7 +110,6 @@ struct efa_rdm_ope { size_t iov_count; struct iovec iov[EFA_RDM_IOV_LIMIT]; void *desc[EFA_RDM_IOV_LIMIT]; - void *shm_desc[EFA_RDM_IOV_LIMIT]; struct fid_mr *mr[EFA_RDM_IOV_LIMIT]; size_t rma_iov_count; @@ -144,8 +143,6 @@ struct efa_rdm_ope { uint64_t bytes_copied; uint64_t bytes_queued_blocking_copy; - /* linked to peer->rx_unexp_list or peer->rx_unexp_tagged_list */ - struct dlist_entry peer_unexp_entry; #if ENABLE_DEBUG /* linked with ope_recv_list in efa_rdm_ep */ struct dlist_entry pending_recv_entry; diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 9674a642be6..3e8e3dff774 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -330,18 +330,16 @@ void efa_rdm_peer_proc_pending_items_in_robuf(struct efa_rdm_peer *peer, struct size_t efa_rdm_peer_get_runt_size(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_rdm_ope *ope) { - struct efa_hmem_info *hmem_info; size_t runt_size; size_t memory_alignment; int iface; - hmem_info = efa_rdm_ep_domain(ep)->hmem_info; iface = ope->desc[0] ? 
((struct efa_mr*) ope->desc[0])->peer.iface : FI_HMEM_SYSTEM; - if (hmem_info[iface].runt_size < peer->num_runt_bytes_in_flight) + if (g_efa_hmem_info[iface].runt_size < peer->num_runt_bytes_in_flight) return 0; - runt_size = MIN(hmem_info[iface].runt_size - peer->num_runt_bytes_in_flight, ope->total_len); + runt_size = MIN(g_efa_hmem_info[iface].runt_size - peer->num_runt_bytes_in_flight, ope->total_len); memory_alignment = efa_rdm_ep_get_memory_alignment(ep, iface); /* * runt size must be aligned because: diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index f6058883aed..fe2f79ead61 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -60,8 +60,6 @@ struct efa_rdm_peer { int rnr_queued_pkt_cnt; /**< queued RNR packet count */ struct dlist_entry rnr_backoff_entry; /**< linked to efa_domain->peer_backoff_list */ struct dlist_entry handshake_queued_entry; /**< linked with efa_domain->handshake_queued_peer_list */ - struct dlist_entry rx_unexp_list; /**< a list of unexpected untagged rxe for this peer */ - struct dlist_entry rx_unexp_tagged_list; /**< a list of unexpected tagged rxe for this peer */ struct dlist_entry txe_list; /**< a list of txe related to this peer */ struct dlist_entry rxe_list; /**< a list of rxe relased to this peer */ struct dlist_entry overflow_pke_list; /**< a list of out-of-order pke that overflow the current recvwin */ @@ -111,6 +109,23 @@ bool efa_rdm_peer_support_rdma_write(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_RDMA_WRITE); } +/** + * @brief check for peer's unsolicited write support, assuming HANDSHAKE has already occurred + * + * @param[in] peer A peer from which we have already received a HANDSHAKE + * @return bool The peer's unsolicited write recv support + */ +static inline +bool efa_rdm_peer_support_unsolicited_write_recv(struct efa_rdm_peer *peer) +{ + /* Unsolicited write recv is an extra feature defined in version 4 (the base version). + * Because it is an extra feature, an EP will assume the peer does not support + * it until a handshake packet is received.
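[Editor's note: illustrative sketch, not part of the diff] A hypothetical sender-side gate combining both directions of this negotiation; efa_use_unsolicited_write_recv() is the local-capability check used to set EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV earlier in this diff:

    static inline bool use_unsolicited_write_recv(struct efa_rdm_peer *peer)
    {
        /* both ends must advertise the bit: the local ep sets it in its
         * HANDSHAKE via efa_rdm_ep_set_extra_info(), and the peer's copy
         * is only trustworthy once its HANDSHAKE has been received */
        return efa_use_unsolicited_write_recv() &&
               efa_rdm_peer_support_unsolicited_write_recv(peer);
    }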
+ */ + return (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) && + (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV); +} + static inline bool efa_rdm_peer_support_delivery_complete(struct efa_rdm_peer *peer) { diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 8255931b8d9..06e7e2abd7a 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -439,7 +439,7 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, #endif #if HAVE_LTTNG - efa_tracepoint_wr_id_post_send((void *)pkt_entry); + efa_rdm_tracepoint_wr_id_post_send((void *)pkt_entry); #endif } @@ -509,6 +509,10 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, conn->ep_addr->qpn, conn->ep_addr->qkey); } +#if HAVE_LTTNG + efa_rdm_tracepoint_wr_id_post_read((void *)pkt_entry); +#endif + err = ibv_wr_complete(qp->ibv_qp_ex); if (OFI_UNLIKELY(err)) @@ -597,6 +601,10 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) conn->ep_addr->qpn, conn->ep_addr->qkey); } +#if HAVE_LTTNG + efa_rdm_tracepoint_wr_id_post_write((void *)pkt_entry); +#endif + if (!(txe->fi_flags & FI_MORE)) { err = ibv_wr_complete(qp->ibv_qp_ex); ep->base_ep.is_wr_started = false; @@ -622,7 +630,7 @@ ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, { struct efa_rdm_ep *ep; struct ibv_recv_wr *bad_wr; - struct ibv_qp *qp; + struct efa_recv_wr *recv_wr; int i, err; assert(pke_cnt); @@ -631,37 +639,83 @@ ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, assert(ep); for (i = 0; i < pke_cnt; ++i) { - ep->base_ep.efa_recv_wr_vec[i].wr.wr_id = (uintptr_t)pke_vec[i]; - ep->base_ep.efa_recv_wr_vec[i].wr.num_sge = 1; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list = ep->base_ep.efa_recv_wr_vec[i].sge; - if (pke_vec[i]->alloc_type == EFA_RDM_PKE_FROM_USER_RX_POOL) { - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].addr = (uintptr_t) pke_vec[i]->payload; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].length = pke_vec[i]->payload_size; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->payload_mr)->ibv_mr->lkey; - } else { - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].length = pke_vec[i]->pkt_size; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->mr)->ibv_mr->lkey; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].addr = (uintptr_t)pke_vec[i]->wiredata; - } - ep->base_ep.efa_recv_wr_vec[i].wr.next = NULL; + recv_wr = &ep->base_ep.efa_recv_wr_vec[i]; + recv_wr->wr.wr_id = (uintptr_t)pke_vec[i]; + recv_wr->wr.num_sge = 1; + recv_wr->wr.sg_list = recv_wr->sge; + recv_wr->wr.sg_list[0].length = pke_vec[i]->pkt_size; + recv_wr->wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->mr)->ibv_mr->lkey; + recv_wr->wr.sg_list[0].addr = (uintptr_t)pke_vec[i]->wiredata; + recv_wr->wr.next = NULL; if (i > 0) - ep->base_ep.efa_recv_wr_vec[i-1].wr.next = &ep->base_ep.efa_recv_wr_vec[i].wr; + ep->base_ep.efa_recv_wr_vec[i-1].wr.next = &recv_wr->wr; #if HAVE_LTTNG - efa_tracepoint_wr_id_post_recv(pke_vec[i]); + efa_rdm_tracepoint_wr_id_post_recv(pke_vec[i]); #endif } - if (pke_vec[0]->alloc_type == EFA_RDM_PKE_FROM_USER_RX_POOL) { - assert(ep->base_ep.user_recv_qp); - qp = ep->base_ep.user_recv_qp->ibv_qp; - } else { - qp = ep->base_ep.qp->ibv_qp; + err = ibv_post_recv(ep->base_ep.qp->ibv_qp, &ep->base_ep.efa_recv_wr_vec[0].wr, &bad_wr); + if (OFI_UNLIKELY(err)) + err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; + + return err; +} + +/** + * @brief Post user receive requests to the EFA device through the user_recv_qp + * + * @param[in] pke_vec packet entries that contain the receive buffer information + * @param[in] pke_cnt Number of packet entries to post receive requests for + * @param[in] flags user-supplied flags passed to fi_recv; FI_MORE is supported + * @return 0 on success + * On error, a negative value corresponding to fabric errno + */ +ssize_t efa_rdm_pke_user_recvv(struct efa_rdm_pke **pke_vec, + int pke_cnt, uint64_t flags) +{ + struct efa_rdm_ep *ep; + struct ibv_recv_wr *bad_wr; + struct efa_recv_wr *recv_wr; + int i, err; + size_t wr_index; + + assert(pke_cnt); + + ep = pke_vec[0]->ep; + assert(ep); + + wr_index = ep->base_ep.recv_wr_index; + assert(wr_index < ep->base_ep.info->rx_attr->size); + + for (i = 0; i < pke_cnt; ++i) { + recv_wr = &ep->base_ep.user_recv_wr_vec[wr_index]; + recv_wr->wr.wr_id = (uintptr_t) pke_vec[i]; + recv_wr->wr.num_sge = 1; + recv_wr->wr.sg_list = recv_wr->sge; + recv_wr->wr.sg_list[0].addr = (uintptr_t) pke_vec[i]->payload; + recv_wr->wr.sg_list[0].length = pke_vec[i]->payload_size; + recv_wr->wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->payload_mr)->ibv_mr->lkey; + recv_wr->wr.next = NULL; + if (wr_index > 0) + ep->base_ep.user_recv_wr_vec[wr_index - 1].wr.next = &recv_wr->wr; +#if HAVE_LTTNG + efa_rdm_tracepoint_wr_id_post_recv(pke_vec[i]); +#endif + wr_index++; } - if (pke_vec[0]->alloc_type == EFA_RDM_PKE_FROM_USER_RX_POOL) { - assert(ep->base_ep.user_recv_qp); - qp = ep->base_ep.user_recv_qp->ibv_qp; - } else { - qp = ep->base_ep.qp->ibv_qp; + ep->base_ep.recv_wr_index = wr_index; + + if (flags & FI_MORE) + return 0; + + assert(ep->base_ep.user_recv_qp); + err = ibv_post_recv(ep->base_ep.user_recv_qp->ibv_qp, &ep->base_ep.user_recv_wr_vec[0].wr, &bad_wr); + + if (OFI_UNLIKELY(err)) + err = (err == ENOMEM) ?
-FI_EAGAIN : -err; - } + + ep->base_ep.recv_wr_index = 0; return err; } diff --git a/prov/efa/src/rdm/efa_rdm_pke.h b/prov/efa/src/rdm/efa_rdm_pke.h index 7291a36c466..3bd0e51390d 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.h +++ b/prov/efa/src/rdm/efa_rdm_pke.h @@ -195,7 +195,7 @@ struct efa_rdm_pke { _Alignas(EFA_RDM_PKE_ALIGNMENT) char wiredata[0]; }; -#if defined(static_assert) && defined(__x86_64__) +#if defined(static_assert) static_assert(sizeof (struct efa_rdm_pke) % EFA_RDM_PKE_ALIGNMENT == 0, "efa_rdm_pke alignment check"); #endif @@ -237,5 +237,8 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, int pke_cnt); +ssize_t efa_rdm_pke_user_recvv(struct efa_rdm_pke **pke_vec, + int pke_cnt, uint64_t flags); + int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry); #endif diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.c b/prov/efa/src/rdm/efa_rdm_pke_cmd.c index 97741ebbd27..b8baf5c2935 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.c +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.c @@ -112,14 +112,18 @@ int efa_rdm_pke_fill_data(struct efa_rdm_pke *pkt_entry, /* The data_offset will be non-zero when the long CTS RTM packet * is sent to continue a runting read transfer after the * receiver has run out of memory registrations */ - assert((data_offset == 0 || ope->internal_flags & EFA_RDM_OPE_READ_NACK) && data_size == -1); + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_longcts_msgrtm(pkt_entry, ope); break; case EFA_RDM_LONGCTS_TAGRTM_PKT: /* The data_offset will be non-zero when the long CTS RTM packet * is sent to continue a runting read transfer after the * receiver has run out of memory registrations */ - assert((data_offset == 0 || ope->internal_flags & EFA_RDM_OPE_READ_NACK) && data_size == -1); + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_longcts_tagrtm(pkt_entry, ope); break; case EFA_RDM_LONGREAD_MSGRTM_PKT: @@ -187,11 +191,21 @@ int efa_rdm_pke_fill_data(struct efa_rdm_pke *pkt_entry, ret = efa_rdm_pke_init_dc_medium_tagrtm(pkt_entry, ope, data_offset, data_size); break; case EFA_RDM_DC_LONGCTS_MSGRTM_PKT: - assert(data_offset == 0 && data_size == -1); + /* The data_offset will be non-zero when the DC long CTS RTM packet + * is sent to continue a runting read transfer after the + * receiver has run out of memory registrations */ + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_dc_longcts_msgrtm(pkt_entry, ope); break; case EFA_RDM_DC_LONGCTS_TAGRTM_PKT: - assert(data_offset == 0 && data_size == -1); + /* The data_offset will be non-zero when the DC long CTS tagged RTM packet + * is sent to continue a runting read transfer after the + * receiver has run out of memory registrations */ + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_dc_longcts_tagrtm(pkt_entry, ope); break; case EFA_RDM_DC_EAGER_RTW_PKT: @@ -439,9 +453,9 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) memset(&ep_addr_str, 0, sizeof(ep_addr_str)); memset(&peer_addr_str, 0, sizeof(peer_addr_str)); buflen = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &buflen); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &buflen); buflen = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, 
pkt_entry->addr, peer_addr_str, &buflen); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, pkt_entry->addr, peer_addr_str, &buflen); EFA_WARN(FI_LOG_CQ, "While sending a handshake packet, an error occurred." " Our address: %s, peer address: %s\n", @@ -698,7 +712,7 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) memset(&ep_addr_str, 0, sizeof(ep_addr_str)); buflen = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &buflen); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &buflen); EFA_WARN(FI_LOG_CQ, "Packet receive error from non TX/RX packet. Our address: %s\n", ep_addr_str); @@ -737,7 +751,7 @@ fi_addr_t efa_rdm_pke_insert_addr(struct efa_rdm_pke *pkt_entry, void *raw_addr) char self_raw_addr_str[OFI_ADDRSTRLEN]; size_t buflen = OFI_ADDRSTRLEN; - efa_rdm_ep_raw_addr_str(ep, self_raw_addr_str, &buflen); + efa_base_ep_raw_addr_str(&ep->base_ep, self_raw_addr_str, &buflen); EFA_WARN(FI_LOG_CQ, "Host %s received a packet with invalid protocol version %d.\n" "This host can only support protocol version %d and above.\n", diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c index c990eacedc9..b1b7be31460 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c +++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c @@ -690,6 +690,7 @@ void efa_rdm_pke_handle_read_nack_recv(struct efa_rdm_pke *pkt_entry) { struct efa_rdm_read_nack_hdr *nack_hdr; struct efa_rdm_ope *txe; + bool delivery_complete_requested; efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight -= 1; @@ -700,18 +701,34 @@ void efa_rdm_pke_handle_read_nack_recv(struct efa_rdm_pke *pkt_entry) efa_rdm_pke_release_rx(pkt_entry); txe->internal_flags |= EFA_RDM_OPE_READ_NACK; - if (txe->op == ofi_op_tagged) { - EFA_WARN(FI_LOG_EP_CTRL, + delivery_complete_requested = txe->fi_flags & FI_DELIVERY_COMPLETE; + + if (txe->op == ofi_op_write) { + EFA_INFO(FI_LOG_EP_CTRL, + "Sender fallback to emulated long CTS write " + "protocol because p2p is not available\n"); + efa_rdm_ope_post_send_or_queue( + txe, delivery_complete_requested ? + EFA_RDM_DC_LONGCTS_RTW_PKT : + EFA_RDM_LONGCTS_RTW_PKT); + } else if (txe->op == ofi_op_tagged) { + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS tagged " "protocol because memory registration limit " "was reached on the receiver\n"); - efa_rdm_ope_post_send_or_queue(txe, EFA_RDM_LONGCTS_TAGRTM_PKT); + efa_rdm_ope_post_send_or_queue( + txe, delivery_complete_requested ? + EFA_RDM_DC_LONGCTS_TAGRTM_PKT : + EFA_RDM_LONGCTS_TAGRTM_PKT); } else { - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS untagged " "protocol because memory registration limit " "was reached on the receiver\n"); - efa_rdm_ope_post_send_or_queue(txe, EFA_RDM_LONGCTS_MSGRTM_PKT); + efa_rdm_ope_post_send_or_queue( + txe, delivery_complete_requested ? 
+ EFA_RDM_DC_LONGCTS_MSGRTM_PKT : + EFA_RDM_LONGCTS_MSGRTM_PKT); } } diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.c b/prov/efa/src/rdm/efa_rdm_pke_rtm.c index 4d8dc735e4b..a96494f02a4 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.c @@ -857,14 +857,12 @@ ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry) struct efa_rdm_ep *ep; struct efa_rdm_ope *rxe; struct efa_rdm_pke *cur, *nxt; - struct efa_rdm_peer *peer; int pkt_type; ssize_t ret, err; uint64_t msg_id; ep = pkt_entry->ep; rxe = pkt_entry->ope; - peer = rxe->peer; pkt_type = efa_rdm_pke_get_base_hdr(pkt_entry)->type; ret = 0; @@ -883,20 +881,9 @@ ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry) efa_rdm_tracepoint(runtread_read_posted, rxe->msg_id, (size_t) rxe->cq_entry.op_context, rxe->total_len); - err = efa_rdm_ope_post_remote_read_or_queue(rxe); - if (err) { - if (err == -FI_ENOMR) { - if (efa_rdm_peer_support_read_nack(peer)) - /* Only set the flag here. The NACK - * packet is sent after all runting read - * RTM packets have been received */ - rxe->internal_flags |= EFA_RDM_OPE_READ_NACK; - else - ret = -FI_EAGAIN; - } else { - return err; - } - } + err = efa_rdm_pke_post_remote_read_or_nack(ep, pkt_entry, rxe); + if (err) + return err; } } @@ -912,7 +899,7 @@ ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry) if (efa_rdm_ope_mulreq_total_data_size(rxe, pkt_type) == rxe->bytes_received_via_mulreq) { if (rxe->internal_flags & EFA_RDM_OPE_READ_NACK) { - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Receiver sending long read NACK " "packet because memory registration " "limit was reached on the receiver\n"); @@ -1198,12 +1185,10 @@ ssize_t efa_rdm_pke_proc_matched_longread_rtm(struct efa_rdm_pke *pkt_entry) struct efa_rdm_longread_rtm_base_hdr *rtm_hdr; struct fi_rma_iov *read_iov; struct efa_rdm_ep *ep; - struct efa_rdm_peer *peer; int err; rxe = pkt_entry->ope; ep = rxe->ep; - peer = rxe->peer; rtm_hdr = efa_rdm_pke_get_longread_rtm_base_hdr(pkt_entry); read_iov = (struct fi_rma_iov *)(pkt_entry->wiredata + efa_rdm_pke_get_req_hdr_size(pkt_entry)); @@ -1216,24 +1201,8 @@ ssize_t efa_rdm_pke_proc_matched_longread_rtm(struct efa_rdm_pke *pkt_entry) efa_rdm_tracepoint(longread_read_posted, rxe->msg_id, (size_t) rxe->cq_entry.op_context, rxe->total_len); - err = efa_rdm_ope_post_remote_read_or_queue(rxe); - if (err == -FI_ENOMR) { - if (efa_rdm_peer_support_read_nack(peer)) { - EFA_WARN(FI_LOG_EP_CTRL, "Receiver sending long read " - "NACK packet because memory " - "registration limit was " - "reached on the receiver\n"); - efa_rdm_rxe_map_insert(&ep->rxe_map, pkt_entry, rxe); - rxe->internal_flags |= EFA_RDM_OPE_READ_NACK; - err = efa_rdm_ope_post_send_or_queue( - rxe, EFA_RDM_READ_NACK_PKT); - } else { - /* Peer does not support the READ_NACK packet. 
So we - * return EAGAIN and hope that the app runs progress - * again which will free some MR registrations */ - err = -FI_EAGAIN; - } - } + err = efa_rdm_pke_post_remote_read_or_nack(ep, pkt_entry, rxe); + efa_rdm_pke_release_rx(pkt_entry); return err; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.c b/prov/efa/src/rdm/efa_rdm_pke_rtw.c index c7dc43f2490..5872302136f 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.c @@ -12,6 +12,7 @@ #include "efa_rdm_rma.h" #include "efa_rdm_ope.h" #include "efa_rdm_pke.h" +#include "efa_rdm_pke_rtw.h" #include "efa_rdm_pke_utils.h" #include "efa_rdm_protocol.h" #include "efa_rdm_pke_req.h" @@ -348,6 +349,18 @@ void efa_rdm_pke_handle_longcts_rtw_send_completion(struct efa_rdm_pke *pkt_entr { struct efa_rdm_ope *txe; + /** + * A zero-payload longcts rtw pkt currently should only happen when it's + * used for the READ NACK protocol. In this case, this pkt doesn't + * contribute to the send completion, and the associated tx entry + * may be released earlier as the CTSDATA pkts have already kicked off + * and finished the send. + */ + if (pkt_entry->payload_size == 0) { + assert(efa_rdm_pke_get_rtw_base_hdr(pkt_entry)->flags & EFA_RDM_REQ_READ_NACK); + return; + } + txe = pkt_entry->ope; txe->bytes_acked += pkt_entry->payload_size; if (txe->total_len == txe->bytes_acked) @@ -557,14 +570,14 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) memcpy(rxe->rma_iov, read_iov, rxe->rma_iov_count * sizeof(struct fi_rma_iov)); + err = efa_rdm_pke_post_remote_read_or_nack(rxe->ep, pkt_entry, rxe); + efa_rdm_pke_release_rx(pkt_entry); - err = efa_rdm_ope_post_remote_read_or_queue(rxe); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RDMA post read or queue failed.\n"); efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST); efa_rdm_rxe_release(rxe); - efa_rdm_pke_release_rx(pkt_entry); } } diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.c b/prov/efa/src/rdm/efa_rdm_pke_utils.c index 49600a01707..a19b275d9e1 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_utils.c +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.c @@ -41,7 +41,7 @@ ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, size_t data_size) { int tx_iov_index, ret; - bool p2p_available; + bool mr_p2p_available; bool use_inline_buf; size_t tx_iov_offset, copied; struct efa_mr *iov_mr; @@ -62,14 +62,14 @@ ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, assert(tx_iov_index < ope->iov_count); assert(tx_iov_offset < ope->iov[tx_iov_index].iov_len); iov_mr = ope->desc[tx_iov_index]; - p2p_available = false; + mr_p2p_available = false; use_inline_buf = false; if (iov_mr) { ret = efa_rdm_ep_use_p2p(pke->ep, iov_mr); if (ret < 0) return ret; - p2p_available = ret; + mr_p2p_available = ret; } else if (!efa_mr_is_hmem(iov_mr) && payload_offset + data_size <= efa_rdm_ep_domain(pke->ep)->device->efa_attr.inline_buf_size) { use_inline_buf = true; @@ -85,7 +85,7 @@ ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, * a copy from the user buffer to the internal bounce buffer is needed. 
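The eligibility rule spelled out in the comment above reduces to a small predicate. The following is an illustrative restatement, not code from this patch; the helper name is invented and the parameters mirror the locals of efa_rdm_pke_init_payload_from_ope():

/*
 * Restates the zero-copy check: the packet payload may point directly
 * into the user buffer only when the whole chunk lies in a single iov
 * and either the device inline buffer is used, or p2p to the
 * registered memory is available and FI_INJECT is not set (FI_INJECT
 * lets the application reuse its buffer immediately on return, which
 * forces a copy into the bounce buffer).
 */
static bool payload_can_borrow_user_buffer(size_t tx_iov_offset, size_t data_size,
					   size_t iov_len, bool use_inline_buf,
					   bool mr_p2p_available, uint64_t fi_flags)
{
	return tx_iov_offset + data_size <= iov_len &&
	       (use_inline_buf || (mr_p2p_available && !(fi_flags & FI_INJECT)));
}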
*/ if (tx_iov_offset + data_size <= ope->iov[tx_iov_index].iov_len && - (use_inline_buf || (p2p_available && !(ope->fi_flags & FI_INJECT)))) { + (use_inline_buf || (mr_p2p_available && !(ope->fi_flags & FI_INJECT)))) { pke->payload = (char *)ope->iov[tx_iov_index].iov_base + tx_iov_offset; pke->payload_size = data_size; pke->payload_mr = ope->desc[tx_iov_index]; @@ -250,15 +250,16 @@ int efa_rdm_pke_get_available_copy_methods(struct efa_rdm_ep *ep, bool *restrict gdrcopy_available) { int ret; - bool p2p_available; + bool mr_p2p_available; + assert(efa_mr); ret = efa_rdm_ep_use_p2p(ep, efa_mr); if (ret < 0) { return ret; } - p2p_available = ret; - *local_read_available = p2p_available && efa_rdm_ep_support_rdma_read(ep); + mr_p2p_available = ret; + *local_read_available = mr_p2p_available && efa_rdm_ep_support_rdma_read(ep); *cuda_memcpy_available = ep->cuda_api_permitted; *gdrcopy_available = efa_mr->peer.flags & OFI_HMEM_DATA_DEV_REG_HANDLE; diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.h b/prov/efa/src/rdm/efa_rdm_pke_utils.h index 529749d9258..c7363965dc1 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_utils.h +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.h @@ -94,6 +94,78 @@ efa_rdm_pke_copy_from_hmem_iov(struct efa_mr *iov_mr, struct efa_rdm_pke *pke, return copied; } +/** + * @brief This function either posts RDMA read, or sends a NACK packet when p2p + * is not available or memory registration limit was reached on the receiver. + * + * @param[in] ep endpoint + * @param[in] pkt_entry packet entry + * @param[in] rxe RX entry + * + * @return 0 on success, or a negative error code. + */ +static inline int +efa_rdm_pke_post_remote_read_or_nack(struct efa_rdm_ep *ep, + struct efa_rdm_pke *pkt_entry, + struct efa_rdm_ope *rxe) +{ + int err = 0; + int pkt_type; + int p2p_avail; + + pkt_type = efa_rdm_pke_get_base_hdr(pkt_entry)->type; + err = efa_rdm_ep_use_p2p(ep, rxe->desc[0]); + if (err < 0) + return err; + + p2p_avail = err; + if (p2p_avail) { + err = efa_rdm_ope_post_remote_read_or_queue(rxe); + } else if (efa_rdm_peer_support_read_nack(rxe->peer)) { + EFA_INFO(FI_LOG_EP_CTRL, + "Receiver sending long read " + "NACK packet because P2P is not available, " + "unable to post RDMA read.\n"); + goto send_nack; + } else { + EFA_INFO(FI_LOG_EP_CTRL, "P2P is not available, " + "unable to post RDMA read.\n"); + return -FI_EOPNOTSUPP; + } + + if (err == -FI_ENOMR) { + if (efa_rdm_peer_support_read_nack(rxe->peer)) { + EFA_INFO(FI_LOG_EP_CTRL, "Receiver sending long read " + "NACK packet because memory " + "registration limit was " + "reached on the receiver.\n"); + goto send_nack; + } else { + /* Peer does not support the READ_NACK packet. So we + * return EAGAIN and hope that the app runs progress + * again which will free some MR registrations */ + return -FI_EAGAIN; + } + } + + return err; + +send_nack: + rxe->internal_flags |= EFA_RDM_OPE_READ_NACK; + /* Only set the flag for runting read. 
The NACK + * packet is sent after all runting read + * RTM packets have been received */ + if (efa_rdm_pkt_type_is_runtread(pkt_type)) { + return 0; + } + + if (efa_rdm_pkt_type_is_rtm(pkt_type)) { + efa_rdm_rxe_map_insert(&ep->rxe_map, pkt_entry, rxe); + } + + return efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_READ_NACK_PKT); +} + size_t efa_rdm_pke_get_payload_offset(struct efa_rdm_pke *pkt_entry); ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index abcec6d091e..975cbd44e94 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -16,18 +16,7 @@ #define EFA_RDM_PROTOCOL_VERSION (4) -/* raw address format. (section 1.4) */ -#define EFA_GID_LEN 16 -struct efa_ep_addr { - uint8_t raw[EFA_GID_LEN]; - uint16_t qpn; - uint16_t pad; - uint32_t qkey; - struct efa_ep_addr *next; -}; - -#define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr) /* * Extra Feature/Request Flags (section 2.1) @@ -40,8 +29,14 @@ struct efa_ep_addr { #define EFA_RDM_EXTRA_FEATURE_RDMA_WRITE BIT_ULL(5) #define EFA_RDM_EXTRA_FEATURE_READ_NACK BIT_ULL(6) #define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP BIT_ULL(7) -#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 8 -#define EFA_RDM_MAX_NUM_EXINFO (256) +#define EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV BIT_ULL(8) +#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 9 +/* + * The length of 64-bit extra_info array used in efa_rdm_ep + * and efa_rdm_peer + * 4 means 64*4=256 bits of extra features or requests + */ +#define EFA_RDM_MAX_NUM_EXINFO (4) /* * Packet type ID of each packet type (section 1.3) @@ -109,7 +104,7 @@ struct efa_ep_addr { #define EFA_RDM_RUNT_PKT_END 148 #define EFA_RDM_EXTRA_REQ_PKT_END 148 -#if defined(static_assert) && defined(__x86_64__) +#if defined(static_assert) #define EFA_RDM_ENSURE_HEADER_SIZE(hdr, size) \ static_assert(sizeof (struct hdr) == (size), #hdr " size check") #else diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 5ebecc3f97c..87267f6d8ae 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -11,6 +11,7 @@ #include "efa_rdm_rma.h" #include "efa_rdm_pke_cmd.h" #include "efa_cntr.h" +#include "efa_rdm_tracepoint.h" int efa_rdm_rma_verified_copy_iov(struct efa_rdm_ep *ep, struct efa_rma_iov *rma, size_t count, uint32_t flags, @@ -117,7 +118,8 @@ ssize_t efa_rdm_rma_post_efa_emulated_read(struct efa_rdm_ep *ep, struct efa_rdm ssize_t efa_rdm_rma_post_read(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) { bool use_device_read = false; - ssize_t ret; + int use_p2p; + ssize_t ret, err; /* * A handshake is required to choose the correct protocol (whether to use device read). @@ -127,7 +129,14 @@ ssize_t efa_rdm_rma_post_read(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) if (!(txe->peer->is_self) && !(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)) return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); - if (efa_rdm_interop_rdma_read(ep, txe->peer)) { + /* Check p2p support. Cannot use device read when p2p is not available. 
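A note on the EFA_RDM_MAX_NUM_EXINFO change earlier in this hunk: the constant now counts 64-bit words rather than individual bits, so the same 256 feature/request flags remain representable. A sketch of the indexing arithmetic this implies, assuming the arrays are laid out as uint64_t extra_info[EFA_RDM_MAX_NUM_EXINFO] (the helper name is invented for illustration):

/*
 * Feature/request flag i lives in word i / 64 at bit i % 64, so the
 * 4-word array covers the same 256 flags the old definition counted
 * in bits. Bit 8 (EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV) still
 * lands in extra_info[0].
 */
static inline bool efa_rdm_extra_info_test(const uint64_t *extra_info,
					   unsigned int feature_idx)
{
	assert(feature_idx < EFA_RDM_MAX_NUM_EXINFO * 64);
	return (extra_info[feature_idx / 64] >> (feature_idx % 64)) & 1;
}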
*/ + err = efa_rdm_ep_use_p2p(ep, txe->desc[0]); + if (err < 0) + return err; + + use_p2p = err; + + if (use_p2p && efa_rdm_interop_rdma_read(ep, txe->peer)) { /* RDMA read interoperability check also checks domain.use_device_rdma, * so we do not check it here */ @@ -137,11 +146,6 @@ ssize_t efa_rdm_rma_post_read(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) return -FI_EOPNOTSUPP; } - /* - * Not going to check efa_ep->hmem_p2p_opt here, if the remote side - * gave us a valid MR we should just honor the request even if p2p is - * disabled. - */ if (use_device_read) { ret = efa_rdm_ope_prepare_to_post_read(txe); if (ret) @@ -171,6 +175,9 @@ ssize_t efa_rdm_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uin void **tmp_desc; struct util_srx_ctx *srx_ctx; + efa_rdm_tracepoint(read_begin_msg_context, + (size_t) msg->context, (size_t) msg->addr); + EFA_DBG(FI_LOG_EP_DATA, "read iov_len: %lu flags: %lx\n", ofi_total_iov_len(msg->msg_iov, msg->iov_count), @@ -186,7 +193,7 @@ ssize_t efa_rdm_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uin if (err) return err; - assert(msg->iov_count <= efa_rdm_ep->tx_iov_limit); + assert(msg->iov_count <= efa_rdm_ep->base_ep.info->tx_attr->iov_limit); efa_perfset_start(efa_rdm_ep, perf_efa_tx); ofi_genlock_lock(srx_ctx->lock); @@ -289,7 +296,7 @@ ssize_t efa_rdm_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_rma_size); + assert(len <= efa_rdm_ep->base_ep.max_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -351,8 +358,9 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) { ssize_t err; bool delivery_complete_requested; - int ctrl_type, iface; + int ctrl_type, iface, use_p2p; size_t max_eager_rtw_data_size; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; /* * A handshake is required to choose the correct protocol (whether to use device write/read). @@ -363,6 +371,24 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); if (efa_rdm_rma_should_write_using_rdma(ep, txe, txe->peer)) { + /** + * Unsolicited write recv is a feature that makes rdma-write with + * imm not consume an rx buffer on the responder side, and this + * feature requires consistent support status on both sides. + */ + if ((txe->fi_flags & FI_REMOTE_CQ_DATA) && + (efa_rdm_ep_support_unsolicited_write_recv(ep) != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) { + (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, err_msg, "", EFA_ERROR_MSG_BUFFER_LENGTH); + EFA_WARN(FI_LOG_EP_DATA, + "Inconsistent support status detected on unsolicited write recv.\n" + "My support status: %d, peer support status: %d. 
%s.\n" + "This is usually caused by inconsistent efa driver, libfabric, or rdma-core versions.\n" + "Please use consistent software versions on both hosts, or disable the unsolicited write " + "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", + efa_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), + err_msg); + return -FI_EOPNOTSUPP; + } efa_rdm_ope_prepare_to_post_write(txe); return efa_rdm_ope_post_remote_write(txe); } @@ -388,11 +414,18 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) max_eager_rtw_data_size = efa_rdm_txe_max_req_data_capacity(ep, txe, EFA_RDM_EAGER_RTW_PKT); } + err = efa_rdm_ep_use_p2p(ep, txe->desc[0]); + if (err < 0) + return err; + + use_p2p = err; + iface = txe->desc[0] ? ((struct efa_mr*) txe->desc[0])->peer.iface : FI_HMEM_SYSTEM; - if (txe->total_len >= efa_rdm_ep_domain(ep)->hmem_info[iface].min_read_write_size && - efa_rdm_interop_rdma_read(ep, txe->peer) && - (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(ep)))) { + if (use_p2p && + txe->total_len >= g_efa_hmem_info[iface].min_read_write_size && + efa_rdm_interop_rdma_read(ep, txe->peer) && + (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(ep)))) { err = efa_rdm_ope_post_send(txe, EFA_RDM_LONGREAD_RTW_PKT); if (err != -FI_ENOMEM) return err; @@ -420,6 +453,9 @@ static inline ssize_t efa_rdm_generic_writemsg(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *txe; struct util_srx_ctx *srx_ctx; + efa_rdm_tracepoint(write_begin_msg_context, + (size_t) msg->context, (size_t) msg->addr); + efa_perfset_start(efa_rdm_ep, perf_efa_tx); EFA_DBG(FI_LOG_EP_DATA, @@ -470,7 +506,7 @@ ssize_t efa_rdm_rma_writemsg(struct fid_ep *ep, if (err) return err; - assert(msg->iov_count <= efa_rdm_ep->tx_iov_limit); + assert(msg->iov_count <= efa_rdm_ep->base_ep.info->tx_attr->iov_limit); peer = efa_rdm_ep_get_peer(efa_rdm_ep, msg->addr); assert(peer); @@ -550,7 +586,7 @@ ssize_t efa_rdm_rma_write(struct fid_ep *ep, const void *buf, size_t len, void * int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_rma_size); + assert(len <= efa_rdm_ep->base_ep.max_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -585,7 +621,7 @@ ssize_t efa_rdm_rma_writedata(struct fid_ep *ep, const void *buf, size_t len, int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_rma_size); + assert(len <= efa_rdm_ep->base_ep.max_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -632,7 +668,7 @@ ssize_t efa_rdm_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->base_ep.inject_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -669,7 +705,7 @@ ssize_t efa_rdm_rma_inject_writedata(struct fid_ep *ep, const void *buf, size_t int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->base_ep.inject_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; diff --git a/prov/efa/src/rdm/efa_rdm_srx.c b/prov/efa/src/rdm/efa_rdm_srx.c index 47919dc5667..4efbe01da72 100644 --- a/prov/efa/src/rdm/efa_rdm_srx.c +++ b/prov/efa/src/rdm/efa_rdm_srx.c @@ -151,7 
+151,7 @@ int efa_rdm_peer_srx_construct(struct efa_rdm_ep *ep) { int ret; ret = util_ep_srx_context(&efa_rdm_ep_domain(ep)->util_domain, - ep->rx_size, EFA_RDM_IOV_LIMIT, + ep->base_ep.info->rx_attr->size, EFA_RDM_IOV_LIMIT, ep->min_multi_recv_size, &efa_rdm_srx_update_mr, &efa_rdm_ep_domain(ep)->srx_lock, diff --git a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h index a11e8c3889c..24e2edec270 100644 --- a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h +++ b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h @@ -65,28 +65,38 @@ LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, x_entry, EFA_RDM_TP_PROV, LTTNG_UST_TP_ARGS(X_ENTRY_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, runtread_read_posted, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) -#define MSG_ARGS \ +#define RDM_MSG_ARGS \ size_t, msg_ctx, \ size_t, addr -#define MSG_FIELDS \ +#define RDM_MSG_FIELDS \ lttng_ust_field_integer_hex(size_t, msg_ctx, msg_ctx) \ lttng_ust_field_integer_hex(size_t, addr, addr) LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_RDM_TP_PROV, msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS), - LTTNG_UST_TP_FIELDS(MSG_FIELDS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS), + LTTNG_UST_TP_FIELDS(RDM_MSG_FIELDS)) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, send_begin_msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, send_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, recv_begin_msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, recv_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, + read_begin_msg_context, + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, read_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, + write_begin_msg_context, + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, write_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + #define CQ_ENTRY_ARGS \ int, tag, \ size_t, addr @@ -99,6 +109,11 @@ LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_RDM_TP_PROV, x_entry_cq_entry, LTTNG_UST_TP_ARGS(X_ENTRY_ARGS, CQ_ENTRY_ARGS), LTTNG_UST_TP_FIELDS(X_ENTRY_FIELDS CQ_ENTRY_FIELDS)) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, x_entry_cq_entry, EFA_RDM_TP_PROV, + poll_cq_ope, + LTTNG_UST_TP_ARGS(X_ENTRY_ARGS, CQ_ENTRY_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, poll_cq_ope, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, x_entry_cq_entry, EFA_RDM_TP_PROV, send_end, LTTNG_UST_TP_ARGS(X_ENTRY_ARGS, CQ_ENTRY_ARGS)) diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index 868509162b5..c9d65061e1b 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -97,54 +97,74 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) } } +/** + * @brief Construct a message that contains the local and peer information, + * including the efa address and the host id. + * + * @param ep EFA RDM endpoint + * @param addr Remote peer fi_addr_t + * @param msg the ptr of the msg to be constructed (needs to be allocated already!) 
+ * @param base_msg ptr to the base msg that will show at the beginning of msg + * @param msg_len the length of the message + * @return int 0 on success, negative integer on failure + */ +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len) +{ + char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; + char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + size_t len = 0; + int ret; + struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); + + len = sizeof(ep_addr_str); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &len); + len = sizeof(peer_addr_str); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, addr, peer_addr_str, &len); + + if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { + strcpy(local_host_id_str, "N/A"); + } + + if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { + strcpy(peer_host_id_str, "N/A"); + } + + ret = snprintf(msg, msg_len, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", + base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + + if (ret < 0 || ret > msg_len - 1) { + return -FI_EINVAL; + } + + if (strlen(msg) >= msg_len) { + return -FI_ENOBUFS; + } + + return FI_SUCCESS; +} + /** * @brief Write the error message and return its byte length * @param[in] ep EFA RDM endpoint * @param[in] addr Remote peer fi_addr_t - * @param[in] err FI_* error code(must be positive) * @param[in] prov_errno EFA provider * error code(must be positive) - * @param[out] buf Pointer to the address of error data written by this function + * @param[out] err_msg Pointer to the address of error message written by this function * @param[out] buflen Pointer to the returned error data size * @return A status code. 0 if the error data was written successfully, otherwise a negative FI error code. 
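For the consuming side of this buffer: the message written by efa_rdm_write_error_msg() surfaces through the err_data of fi_cq_readerr(), which is how the unit tests later in this patch read it back. A sketch of the standard application pattern, using only the public libfabric API (cq and the variable names are illustrative):

/*
 * When fi_cq_read() returns -FI_EAVAIL, read the error entry with a
 * caller-provided err_data buffer sized to the provider's message
 * length, then render it with fi_cq_strerror().
 */
struct fi_cq_err_entry err_entry = {0};
char err_data[EFA_ERROR_MSG_BUFFER_LENGTH];
char buf[256];
ssize_t ret;

err_entry.err_data = err_data;
err_entry.err_data_size = sizeof(err_data);

ret = fi_cq_readerr(cq, &err_entry, 0);
if (ret > 0)
	fprintf(stderr, "cq error: %s\n",
		fi_cq_strerror(cq, err_entry.prov_errno, err_entry.err_data,
			       buf, sizeof(buf)));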
*/ -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int err, int prov_errno, void **buf, size_t *buflen) +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen) { - char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; - char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - const char *base_msg = efa_strerror(prov_errno); - size_t len = 0; - struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); - - *buf = NULL; - *buflen = 0; - - len = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); - len = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); - - if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { - strcpy(local_host_id_str, "N/A"); - } - - if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { - strcpy(peer_host_id_str, "N/A"); - } - - int ret = snprintf(ep->err_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", - base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + const char *base_msg = efa_strerror(prov_errno); + int ret; - if (ret < 0 || ret > EFA_RDM_ERROR_MSG_BUFFER_LENGTH - 1) { - return -FI_EINVAL; - } + *buflen = 0; - if (strlen(ep->err_msg) >= EFA_RDM_ERROR_MSG_BUFFER_LENGTH) { - return -FI_ENOBUFS; - } + ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, err_msg, base_msg, EFA_ERROR_MSG_BUFFER_LENGTH); + if (ret) + return ret; - *buf = ep->err_msg; - *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + *buflen = EFA_ERROR_MSG_BUFFER_LENGTH; - return 0; + return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index 1b2fc1da0a2..123fda9c59f 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -10,7 +10,7 @@ #define EFA_RDM_MSG_PREFIX_SIZE (sizeof(struct efa_rdm_pke) + sizeof(struct efa_rdm_eager_msgrtm_hdr) + EFA_RDM_REQ_OPT_RAW_ADDR_HDR_SIZE) -#if defined(static_assert) && defined(__x86_64__) +#if defined(static_assert) static_assert(EFA_RDM_MSG_PREFIX_SIZE % 8 == 0, "message prefix size alignment check"); #endif @@ -19,7 +19,9 @@ bool efa_rdm_get_use_device_rdma(uint32_t fabric_api_version); void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int err, int prov_errno, void **buf, size_t *buflen); +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len); + +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen); #ifdef ENABLE_EFA_POISONING static inline void efa_rdm_poison_mem_region(void *ptr, size_t size) @@ -30,10 +32,5 @@ static inline void efa_rdm_poison_mem_region(void *ptr, size_t size) } #endif -static inline -bool efa_rdm_use_unsolicited_write_recv() -{ - return efa_env.use_unsolicited_write_recv && efa_device_support_unsolicited_write_recv(); -} #endif /* _EFA_RDM_UTIL_H */ diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index 9ca730d0b6e..dd6f813a059 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ 
-19,7 +19,7 @@ void test_av_insert_duplicate_raw_addr(struct efa_resource **state) fi_addr_t addr1, addr2; int err, num_addr; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_check_mock; err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -54,7 +54,7 @@ void test_av_insert_duplicate_gid(struct efa_resource **state) fi_addr_t addr1, addr2; int err, num_addr; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_check_mock; err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); diff --git a/prov/efa/test/efa_unit_test_cntr.c b/prov/efa/test/efa_unit_test_cntr.c index aeb44d51195..d9d4852d2f2 100644 --- a/prov/efa/test/efa_unit_test_cntr.c +++ b/prov/efa/test/efa_unit_test_cntr.c @@ -10,7 +10,7 @@ * @return int the length of the ibv_cq_poll_list */ static -int test_efa_rdm_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid) +int test_efa_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid) { int i = 0; struct dlist_entry *item; @@ -30,14 +30,12 @@ int test_efa_rdm_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid) * * @param state struct efa_resource that is managed by the framework */ -void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +static +void test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep_impl(struct efa_resource *resource) { - struct efa_resource *resource = *state; struct fid_cntr *cntr; struct fi_cntr_attr cntr_attr = {0}; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); - assert_int_equal(fi_cntr_open(resource->domain, &cntr_attr, &cntr, NULL), 0); /* TODO: expand this test to all flags */ @@ -46,7 +44,7 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resou assert_int_equal(fi_enable(resource->ep), 0); /* efa_unit_test_resource_construct binds single OFI CQ as both tx/rx cq of ep */ - assert_int_equal(test_efa_rdm_cntr_get_ibv_cq_poll_list_length(cntr), 1); + assert_int_equal(test_efa_cntr_get_ibv_cq_poll_list_length(cntr), 1); /* ep must be closed before cq/av/eq... */ fi_close(&resource->ep->fid); @@ -55,21 +53,35 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resou fi_close(&cntr->fid); } +void test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep_impl(resource); +} + +void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep_impl(resource); +} + /** * @brief Check the length of ibv_cq_poll_list in cntr when separate tx/rx cq is bind to 1 ep. 
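The body of the renamed length helper is elided by the hunk context above; it presumably just walks the counter's poll list with ofi's dlist_foreach(), along these lines (a sketch under that assumption, not this patch's code):

/*
 * Assumed shape of the elided helper: count the entries on the cntr's
 * ibv_cq_poll_list; the container_of() layout follows the surrounding
 * test code.
 */
static int test_efa_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid)
{
	int i = 0;
	struct dlist_entry *item;
	struct efa_cntr *efa_cntr;

	efa_cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid);
	dlist_foreach(&efa_cntr->ibv_cq_poll_list, item)
		i++;

	return i;
}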
* * @param state struct efa_resource that is managed by the framework */ -void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +static +void test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep_impl(struct efa_resource *resource) { - struct efa_resource *resource = *state; struct fid_cq *txcq, *rxcq; struct fi_cq_attr cq_attr = {0}; struct fid_cntr *cntr; struct fi_cntr_attr cntr_attr = {0}; - efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM); - assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &txcq, NULL), 0); assert_int_equal(fi_ep_bind(resource->ep, &txcq->fid, FI_SEND), 0); @@ -85,7 +97,7 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_r assert_int_equal(fi_enable(resource->ep), 0); - assert_int_equal(test_efa_rdm_cntr_get_ibv_cq_poll_list_length(cntr), 2); + assert_int_equal(test_efa_cntr_get_ibv_cq_poll_list_length(cntr), 2); /* ep must be closed before cq/av/eq... */ fi_close(&resource->ep->fid); @@ -95,7 +107,23 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_r fi_close(&cntr->fid); } -void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) +void test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep_impl(resource); +} + +void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep_impl(resource); +} + +void test_efa_rdm_cntr_post_initial_rx_pkts(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_rdm_ep *efa_rdm_ep; @@ -104,7 +132,7 @@ void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) struct efa_cntr *efa_cntr; uint64_t cnt; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* At this time, rx pkts have not been grown or posted */ @@ -121,7 +149,8 @@ void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) efa_cntr = container_of(cntr, struct efa_cntr, util_cntr.cntr_fid); - assert_false(efa_cntr->initial_rx_to_all_eps_posted); + /* cntr read needs to scan the ep list since an ep is bound */ + assert_true(efa_cntr->need_to_scan_ep_list); cnt = fi_cntr_read(cntr); /* No completion should be read */ @@ -132,7 +161,8 @@ void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); assert_int_equal(efa_rdm_ep->efa_rx_pkts_held, 0); - assert_true(efa_cntr->initial_rx_to_all_eps_posted); + /* scan is done */ + assert_false(efa_cntr->need_to_scan_ep_list); /* ep must be closed before cq/av/eq... */ fi_close(&resource->ep->fid); resource->ep = NULL; diff --git a/prov/efa/test/efa_unit_test_common.c b/prov/efa/test/efa_unit_test_common.c index 930a686aa5c..13bb1882465 100644 --- a/prov/efa/test/efa_unit_test_common.c +++ b/prov/efa/test/efa_unit_test_common.c @@ -2,6 +2,7 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates.
All rights reserved. */ #include "efa_unit_tests.h" +#include "efa_cq.h" #include "efa_rdm_pke_utils.h" #include "efa_rdm_pke_nonreq.h" #include "efa_rdm_pke_req.h" @@ -35,7 +36,23 @@ void efa_unit_test_construct_tmsg(struct fi_msg_tagged *tmsg, struct iovec *iov, tmsg->ignore = ignore; } -struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) +void efa_unit_test_construct_msg_rma(struct fi_msg_rma *msg, struct iovec *iov, + void **desc, size_t iov_count, + fi_addr_t addr, struct fi_rma_iov *rma_iov, + size_t rma_iov_count, void *context, + uint64_t data) +{ + msg->msg_iov = iov; + msg->desc = desc; + msg->iov_count = iov_count; + msg->addr = addr; + msg->rma_iov = rma_iov; + msg->rma_iov_count = rma_iov_count; + msg->context = context; + msg->data = data; +} + +struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type, char *prov_name) { struct fi_info *hints; @@ -43,10 +60,11 @@ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) if (!hints) return NULL; - hints->fabric_attr->prov_name = strdup("efa"); + hints->fabric_attr->prov_name = strdup(prov_name); hints->ep_attr->type = ep_type; - hints->domain_attr->mr_mode |= FI_MR_LOCAL | FI_MR_ALLOCATED; + /* Use a minimal caps that efa / efa-direct should always support */ + hints->domain_attr->mr_mode = MR_MODE_BITS; if (ep_type == FI_EP_DGRAM) { hints->mode |= FI_MSG_PREFIX; } @@ -54,15 +72,17 @@ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) return hints; } +/* TODO: remove use_efa_direct after we have efa_direct implemented in fi_info */ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, uint32_t fi_version, struct fi_info *hints, - bool enable_ep, bool open_cq) + bool enable_ep, bool open_cq, char* prov_name) { int ret = 0; struct fi_av_attr av_attr = {0}; struct fi_cq_attr cq_attr = {0}; struct fi_eq_attr eq_attr = {0}; + struct efa_domain *efa_domain; ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &resource->info); if (ret) @@ -76,6 +96,17 @@ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, if (ret) goto err; + /* + * TODO: Remove this function pointer override when we have it assigned + * for efa-direct correctly. 
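For context on how a test drives the new prov_name parameter end to end, a condensed sketch (assuming EFA_DIRECT_PROV_NAME expands to the efa-direct fabric name; the API-version split mirrors the construct helpers below, which query efa-direct only at FI_VERSION(2, 0)):

/*
 * Select the provider purely through the hints' fabric name, then
 * resolve it with fi_getinfo() at the matching API version.
 */
struct fi_info *hints, *info;

hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_DIRECT_PROV_NAME);
assert_non_null(hints);
assert_int_equal(fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0ULL,
			    hints, &info), 0);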
+ */ + if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) { + efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); + + efa_domain->util_domain.domain_fid.ops->endpoint = efa_ep_open; + efa_domain->util_domain.domain_fid.ops->cq_open = efa_cq_open; + } + ret = fi_endpoint(resource->domain, resource->info, &resource->ep, NULL); if (ret) goto err; @@ -115,13 +146,19 @@ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, assert_int_equal(ret, 0); } -void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type) +void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name) { - resource->hints = efa_unit_test_alloc_hints(ep_type); + + /* TODO use prov_name here when efa-direct fi_info is implemented */ + resource->hints = efa_unit_test_alloc_hints(ep_type, EFA_PROV_NAME); if (!resource->hints) goto err; - efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), - resource->hints, true, true); + if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(2, 0), + resource->hints, true, true, prov_name); + else + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), + resource->hints, true, true, prov_name); return; err: @@ -132,13 +169,19 @@ void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_ } void efa_unit_test_resource_construct_ep_not_enabled(struct efa_resource *resource, - enum fi_ep_type ep_type) + enum fi_ep_type ep_type, char *prov_name) { - resource->hints = efa_unit_test_alloc_hints(ep_type); + /* TODO use prov_name here when efa-direct fi_info is implemented */ + resource->hints = efa_unit_test_alloc_hints(ep_type, EFA_PROV_NAME); if (!resource->hints) goto err; - efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), - resource->hints, false, true); + + if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(2, 0), + resource->hints, false, true, prov_name); + else + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), + resource->hints, false, true, prov_name); return; err: @@ -149,13 +192,19 @@ void efa_unit_test_resource_construct_ep_not_enabled(struct efa_resource *resour } void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(struct efa_resource *resource, - enum fi_ep_type ep_type) + enum fi_ep_type ep_type, char *prov_name) { - resource->hints = efa_unit_test_alloc_hints(ep_type); + /* TODO use prov_name here when efa-direct fi_info is implemented */ + resource->hints = efa_unit_test_alloc_hints(ep_type, EFA_PROV_NAME); if (!resource->hints) goto err; - efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), - resource->hints, false, false); + + if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(2, 0), + resource->hints, false, false, prov_name); + else + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), + resource->hints, false, false, prov_name); return; err: @@ -165,6 +214,39 @@ void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(struct efa_resour fail(); } +/** + * @brief Construct RDM ep type resources with shm disabled + */ +void efa_unit_test_resource_construct_rdm_shm_disabled(struct efa_resource *resource) +{ + int ret; + 
bool shm_permitted = false; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); + if (!resource->hints) + goto err; + + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), + resource->hints, false, true, EFA_PROV_NAME); + + ret = fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, + FI_OPT_SHARED_MEMORY_PERMITTED, &shm_permitted, + sizeof(shm_permitted)); + if (ret) + goto err; + + ret = fi_enable(resource->ep); + if (ret) + goto err; + + return; +err: + efa_unit_test_resource_destruct(resource); + + /* Fail test early if the resource struct fails to initialize */ + fail(); +} + /** * @brief Clean up test resources. * Note: Resources should be destroyed in order. diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index eb8ebe1ae5a..82dcd38952f 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -2,9 +2,8 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_unit_tests.h" -#include "dgram/efa_dgram_ep.h" -#include "dgram/efa_dgram_cq.h" #include "rdm/efa_rdm_cq.h" +#include "efa_av.h" /** * @brief implementation of test cases for fi_cq_read() works with empty device CQ for given endpoint type @@ -20,22 +19,12 @@ void test_impl_cq_read_empty_cq(struct efa_resource *resource, enum fi_ep_type e struct ibv_cq_ex *ibv_cqx; struct fi_cq_data_entry cq_entry; int ret; + struct efa_base_ep *efa_base_ep; - efa_unit_test_resource_construct(resource, ep_type); - - if (ep_type == FI_EP_DGRAM) { - struct efa_dgram_ep *efa_dgram_ep; - - efa_dgram_ep = container_of(resource->ep, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - ibv_cqx = efa_dgram_ep->rcq->ibv_cq_ex; - } else { - struct efa_rdm_ep *efa_rdm_ep; - - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert(efa_rdm_ep->base_ep.util_ep.rx_cq); - ibv_cqx = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq)->ibv_cq.ibv_cq_ex; - } + efa_unit_test_resource_construct(resource, ep_type, EFA_PROV_NAME); + efa_base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_cqx = container_of(efa_base_ep->util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; /* ibv_start_poll to return ENOENT means device CQ is empty */ @@ -102,21 +91,16 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, struct efa_rdm_peer *peer; struct efa_rdm_cq *efa_rdm_cq; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->host_id = local_host_id; ibv_qpx = efa_rdm_ep->base_ep.qp->ibv_qp_ex; - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); - ibv_cqx = efa_rdm_cq->ibv_cq.ibv_cq_ex; - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); @@ -153,14 +137,14 @@ 
static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, will_return(efa_mock_ibv_end_poll_check_mock, NULL); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND); will_return(efa_mock_ibv_read_vendor_err_return_mock, vendor_error); - will_return(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); ret = fi_cq_read(resource->cq, &cq_entry, 1); /* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_id_vec */ assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); assert_int_equal(ret, -FI_EAVAIL); /* Allocate memory to read CQ error */ - cq_err_entry.err_data_size = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + cq_err_entry.err_data_size = EFA_ERROR_MSG_BUFFER_LENGTH; cq_err_entry.err_data = malloc(cq_err_entry.err_data_size); assert_non_null(cq_err_entry.err_data); @@ -232,6 +216,23 @@ void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); } +/** + * @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns + * an unreachable remote error for send. + * + * When a send operation fails, fi_cq_read() should return -FI_EAVAIL, which means an error is available. + * The user should then call fi_cq_readerr() to get an error CQ entry that contains the error code. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_rdm_cq_read_bad_send_status_unreachable_receiver(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + test_rdm_cq_read_bad_send_status(resource, + 0x1234567812345678, 0x8765432187654321, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE); +} + +/** + * @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns + * an invalid qpn error for send.
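The efa_mock_ibv_* functions wired in above follow cmocka's will_return()/mock() pairing; they are presumably one-liners of this shape (a sketch, not this patch's code):

/*
 * will_return(fn, v) queues v; the mock pops it with mock() when the
 * CQ poll path calls the overridden ibv_cq_ex function pointer.
 */
static enum ibv_wc_opcode efa_mock_ibv_read_opcode_return_mock(struct ibv_cq_ex *current)
{
	return mock();
}

static uint32_t efa_mock_ibv_read_qp_num_return_mock(struct ibv_cq_ex *current)
{
	return mock();
}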
@@ -284,9 +285,10 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) struct fi_eq_err_entry eq_err_entry; int ret; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* @@ -302,13 +304,14 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) assert_non_null(pkt_entry); efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; will_return(efa_mock_ibv_start_poll_return_mock, 0); will_return(efa_mock_ibv_end_poll_check_mock, NULL); @@ -317,13 +320,21 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) * therefore use will_return_always() */ will_return_always(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); - will_return_always(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return_always(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); /* the recv error will not propagate to the application cq because it's an EFA internal error * and not related to any application recv. Currently we can only read the error from eq. */ - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; - efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; + ibv_cqx->wr_id = (uintptr_t)pkt_entry; + ibv_cqx->status = IBV_WC_GENERAL_ERR; + +#if HAVE_CAPS_UNSOLICITED_WRITE_RECV + if (efa_use_unsolicited_write_recv()) { + efadv_cq_from_ibv_cq_ex(ibv_cqx)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; + will_return(efa_mock_efadv_wc_is_unsolicited, false); + } +#endif + ret = fi_cq_read(resource->cq, &cq_entry, 1); assert_int_equal(ret, -FI_EAGAIN); @@ -333,6 +344,101 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) assert_int_equal(eq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); } +/** + * @brief verify that fi_cq_read/fi_eq_readerr works properly when rdma-core returns a bad status for + * recv rdma with imm. + * + * When getting a wc error of op code IBV_WC_RECV_RDMA_WITH_IMM, libfabric cannot find the + * corresponding application operation to write a cq error. + * It will write an EQ error instead.
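From the application's viewpoint, the behavior being verified is that an error with no owning operation surfaces on the EQ rather than the CQ, roughly as follows (cq and eq are the application's handles; handle_internal_error is a hypothetical callback):

/*
 * A completion error that maps to no user operation leaves fi_cq_read()
 * returning -FI_EAGAIN; the detail is retrieved from the event queue.
 */
struct fi_cq_data_entry cq_entry;
struct fi_eq_err_entry eq_err_entry;
ssize_t ret;

ret = fi_cq_read(cq, &cq_entry, 1);
if (ret == -FI_EAGAIN) {
	ret = fi_eq_readerr(eq, &eq_err_entry, 0);
	if (ret == sizeof(eq_err_entry))
		/* eq_err_entry.prov_errno holds the EFA status code */
		handle_internal_error(eq_err_entry.prov_errno);
}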
+ * + * @param[in] state struct efa_resource that is managed by the framework + * @param[in] use_unsolicited_recv whether to use unsolicited write recv + */ +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(struct efa_resource **state, bool use_unsolicited_recv) +{ + struct efa_rdm_ep *efa_rdm_ep; + struct efa_resource *resource = *state; + struct fi_cq_data_entry cq_entry; + struct fi_eq_err_entry eq_err_entry; + int ret; + struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; + + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; + + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + + will_return(efa_mock_ibv_start_poll_return_mock, 0); + will_return(efa_mock_ibv_end_poll_check_mock, NULL); + /* efa_mock_ibv_read_opcode_return_mock() will be called once in release mode, + * but twice in debug mode, because an assertion also calls ibv_read_opcode(); + * therefore use will_return_always() + */ + will_return_always(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV_RDMA_WITH_IMM); + will_return_always(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); + will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_FLUSHED); + + g_efa_unit_test_mocks.efa_device_support_unsolicited_write_recv = &efa_mock_efa_device_support_unsolicited_write_recv; + +#if HAVE_CAPS_UNSOLICITED_WRITE_RECV + if (use_unsolicited_recv) { + efadv_cq_from_ibv_cq_ex(ibv_cqx)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; + will_return(efa_mock_efa_device_support_unsolicited_write_recv, true); + will_return(efa_mock_efadv_wc_is_unsolicited, true); + ibv_cqx->wr_id = 0; + } else { + /* + * A solicited write recv consumes an internal rx pkt + */ + will_return(efa_mock_efa_device_support_unsolicited_write_recv, false); + struct efa_rdm_pke *pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(pkt_entry); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); + ibv_cqx->wr_id = (uintptr_t)pkt_entry; + } +#else + /* + * Always test with solicited recv + */ + will_return(efa_mock_efa_device_support_unsolicited_write_recv, false); + struct efa_rdm_pke *pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(pkt_entry); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); + ibv_cqx->wr_id = (uintptr_t)pkt_entry; +#endif + /* the recv rdma with imm will not propagate to the application cq because it's an EFA internal error + * and not related to any application operations. Currently we can only read the error from eq.
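The unsolicited/solicited split that the two branches above exercise comes down to whether the completion owns an rx packet entry. A hedged sketch of the corresponding poll-side decision, assuming rdma-core's efadv_wc_is_unsolicited() helper and the surrounding ibv_cqx handle:

/*
 * For IBV_WC_RECV_RDMA_WITH_IMM: an unsolicited completion consumed no
 * posted rx buffer, so there is no packet entry behind wr_id; a
 * solicited one must release the rx packet entry it consumed.
 */
if (efadv_wc_is_unsolicited(efadv_cq_from_ibv_cq_ex(ibv_cqx))) {
	/* no rx buffer consumed; wr_id carries no packet entry */
} else {
	struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) ibv_cqx->wr_id;

	efa_rdm_pke_release_rx(pkt_entry);
}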
+ */ + ibv_cqx->status = IBV_WC_GENERAL_ERR; + ret = fi_cq_read(resource->cq, &cq_entry, 1); + assert_int_equal(ret, -FI_EAGAIN); + + ret = fi_eq_readerr(resource->eq, &eq_err_entry, 0); + assert_int_equal(ret, sizeof(eq_err_entry)); + assert_int_not_equal(eq_err_entry.err, FI_SUCCESS); + assert_int_equal(eq_err_entry.prov_errno, EFA_IO_COMP_STATUS_FLUSHED); +} + +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_unsolicited_recv(struct efa_resource **state) +{ + test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(state, true); +} + +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_solicited_recv(struct efa_resource **state) +{ + test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(state, false); +} + /** * @brief verify that fi_cq_read/fi_cq_readerr works properly when ibv_start_poll failed. * @@ -347,13 +453,16 @@ void test_ibv_cq_ex_read_failed_poll(struct efa_resource **state) struct fi_cq_err_entry cq_err_entry; int ret; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; + + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; will_return(efa_mock_ibv_start_poll_return_mock, EFAULT); will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); @@ -389,7 +498,7 @@ void test_rdm_cq_create_error_handling(struct efa_resource **state) } efa_device_construct(&efa_device, 0, ibv_device_list[0]); - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); assert_int_equal(fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info), 0); assert_int_equal(fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL), 0); @@ -423,7 +532,7 @@ int test_efa_rdm_cq_get_ibv_cq_poll_list_length(struct fid_cq *cq_fid) { struct efa_rdm_cq *cq; - cq = container_of(cq_fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(cq_fid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); return efa_unit_test_get_dlist_length(&cq->ibv_cq_poll_list); } @@ -437,7 +546,7 @@ void test_efa_rdm_cq_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resourc { struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* efa_unit_test_resource_construct binds single OFI CQ as both tx/rx cq of ep */ assert_int_equal(test_efa_rdm_cq_get_ibv_cq_poll_list_length(resource->cq), 1); @@ -454,7 +563,7 @@ void test_efa_rdm_cq_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_res struct fid_cq *txcq, *rxcq; struct fi_cq_attr cq_attr = {0}; - efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM); + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, 
FI_EP_RDM, EFA_PROV_NAME);
 	assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &txcq, NULL), 0);
@@ -483,16 +592,17 @@ void test_efa_rdm_cq_post_initial_rx_pkts(struct efa_resource **state)
 	struct efa_rdm_ep *efa_rdm_ep;
 	struct efa_rdm_cq *efa_rdm_cq;
-	efa_unit_test_resource_construct(resource, FI_EP_RDM);
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME);
 	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-	efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid);
+	efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid);
 	/* At this time, rx pkts are not growed and posted */
 	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0);
 	assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, 0);
 	assert_int_equal(efa_rdm_ep->efa_rx_pkts_held, 0);
-	assert_false(efa_rdm_cq->initial_rx_to_all_eps_posted);
+	/* cq read needs to scan the ep list since an ep is bound */
+	assert_true(efa_rdm_cq->need_to_scan_ep_list);
 	fi_cq_read(resource->cq, NULL, 0);
 	/* At this time, rx pool size number of rx pkts are posted */
@@ -500,7 +610,8 @@
 	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0);
 	assert_int_equal(efa_rdm_ep->efa_rx_pkts_held, 0);
-	assert_true(efa_rdm_cq->initial_rx_to_all_eps_posted);
+	/* scan is done */
+	assert_false(efa_rdm_cq->need_to_scan_ep_list);
 }
 #if HAVE_EFADV_CQ_EX
 /**
@@ -526,6 +637,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc
 	struct efa_unit_test_buff recv_buff;
 	int ret;
 	struct efa_rdm_cq *efa_rdm_cq;
+	struct ibv_cq_ex *ibv_cqx;
 	/*
 	 * Always use mocked efadv_create_cq instead of the real one.
@@ -541,10 +653,11 @@
 		expect_function_call(efa_mock_efadv_create_cq_set_eopnotsupp_and_return_null);
 	}
-	efa_unit_test_resource_construct(resource, FI_EP_RDM);
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME);
 	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-	efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid);
+	efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid);
+	ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex;
 	/* Construct a minimal recv buffer */
 	efa_unit_test_buff_construct(&recv_buff, resource, efa_rdm_ep->min_multi_recv_size);
@@ -583,19 +696,19 @@
 	efa_unit_test_eager_msgrtm_pkt_construct(pkt_entry, &pkt_attr);
 	/* Setup CQ */
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock;
-	efa_rdm_cq->ibv_cq.ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock;
+	ibv_cqx->wr_id = (uintptr_t)pkt_entry;
+
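+	/* Mocking pattern used throughout this file, shown as a sketch: each
+	 * ibv_cq_ex callback is swapped for a cmocka mock, and will_return()
+	 * queues the value that mock hands back, e.g.
+	 *     ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock;
+	 *     will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV);
+	 */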
ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_slid = &efa_mock_ibv_read_slid_return_mock; + ibv_cqx->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + ibv_cqx->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; + ibv_cqx->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; if (support_efadv_cq) { - efadv_cq = efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex); + efadv_cq = efadv_cq_from_ibv_cq_ex(ibv_cqx); assert_non_null(efadv_cq); efadv_cq->wc_read_sgid = &efa_mock_efadv_wc_read_sgid_return_zero_code_and_expect_next_poll_and_set_gid; @@ -612,7 +725,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc will_return(efa_mock_ibv_read_slid_return_mock, 0xffff); // slid=0xffff(-1) indicates an unknown AH will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size); will_return_maybe(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); - will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return_maybe(efa_mock_ibv_read_wc_flags_return_mock, 0); will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); @@ -696,3 +809,264 @@ void test_ibv_cq_ex_read_ignore_removed_peer() skip(); } #endif + +static void test_efa_cq_read(struct efa_resource *resource, fi_addr_t *addr, + int ibv_wc_opcode, int status, int vendor_error, + struct efa_context *ctx) +{ + int ret; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_ep_addr raw_addr; + struct ibv_cq_ex *ibv_cqx; + struct ibv_qp_ex *ibv_qpx; + struct efa_base_ep *base_ep; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_qpx = base_ep->qp->ibv_qp_ex; + + ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, NULL /* context */); + assert_int_equal(ret, 1); + + ibv_qpx->wr_start = &efa_mock_ibv_wr_start_no_op; + /* this mock will save the send work request (wr) in a global list */ + ibv_qpx->wr_send = &efa_mock_ibv_wr_send_save_wr; + ibv_qpx->wr_set_sge_list = &efa_mock_ibv_wr_set_sge_list_no_op; + ibv_qpx->wr_set_ud_addr = &efa_mock_ibv_wr_set_ud_addr_no_op; + ibv_qpx->wr_complete = &efa_mock_ibv_wr_complete_no_op; + + base_ep->qp->ibv_qp->context->ops.post_recv = &efa_mock_ibv_post_recv; + will_return_maybe(efa_mock_ibv_post_recv, 0); + + if (ibv_wc_opcode == IBV_WC_RECV) { + ibv_cqx = container_of(base_ep->util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ctx->completion_flags = FI_RECV | FI_MSG; + will_return(efa_mock_ibv_start_poll_return_mock, 0); + ibv_cqx->status = status; + } else { + ibv_cqx = container_of(base_ep->util_ep.tx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; + /* this mock will set ibv_cq_ex->wr_id to the wr_id of the head of global send_wr, + * and set ibv_cq_ex->status to mock value */ + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status; + ctx->completion_flags = FI_SEND | FI_MSG; + 
will_return(efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status, status);
+	}
+	ctx->addr = *addr;
+	ibv_cqx->wr_id = (uintptr_t) ctx;
+
+	ibv_cqx->next_poll = &efa_mock_ibv_next_poll_return_mock;
+	ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock;
+	ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock;
+	ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock;
+	ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock;
+	will_return_maybe(efa_mock_ibv_end_poll_check_mock, NULL);
+	will_return_maybe(efa_mock_ibv_next_poll_return_mock, 0);
+	will_return_maybe(efa_mock_ibv_read_opcode_return_mock, ibv_wc_opcode);
+	will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, base_ep->qp->qp_num);
+	will_return_maybe(efa_mock_ibv_read_vendor_err_return_mock, vendor_error);
+#if HAVE_EFADV_CQ_EX
+	ibv_cqx->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock;
+	ibv_cqx->read_slid = &efa_mock_ibv_read_slid_return_mock;
+	ibv_cqx->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock;
+	ibv_cqx->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock;
+	will_return_maybe(efa_mock_ibv_read_byte_len_return_mock, 4096);
+	will_return_maybe(efa_mock_ibv_read_slid_return_mock, efa_av_addr_to_conn(base_ep->av, *addr)->ah->ahn);
+	will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn);
+	will_return_maybe(efa_mock_ibv_read_wc_flags_return_mock, 0);
+#endif
+#if HAVE_CAPS_UNSOLICITED_WRITE_RECV
+	if (efa_use_unsolicited_write_recv()) {
+		efadv_cq_from_ibv_cq_ex(ibv_cqx)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited;
+		will_return_maybe(efa_mock_efadv_wc_is_unsolicited, false);
+	}
+#endif
+}
+
+/**
+ * @brief test EFA CQ's fi_cq_read() works properly when rdma-core returns a
+ * success status for a send operation.
+ */
+void test_efa_cq_read_send_success(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_unit_test_buff send_buff;
+	struct efa_base_ep *base_ep;
+	struct efa_context *efa_context;
+	struct fi_context2 ctx;
+	struct fi_cq_data_entry cq_entry;
+	fi_addr_t addr;
+	int ret;
+
+	test_efa_cq_read(resource, &addr, IBV_WC_SEND, IBV_WC_SUCCESS, 0,
+			 (struct efa_context *) &ctx);
+	efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */);
+
+	assert_int_equal(g_ibv_submitted_wr_id_cnt, 0);
+	ret = fi_send(resource->ep, send_buff.buff, send_buff.size,
+		      fi_mr_desc(send_buff.mr), addr, &ctx);
+	assert_int_equal(ret, 0);
+	assert_int_equal(g_ibv_submitted_wr_id_cnt, 1);
+
+	base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+	efa_context = (struct efa_context *) base_ep->qp->ibv_qp_ex->wr_id;
+	assert_true(efa_context->completion_flags & FI_SEND);
+	assert_true(efa_context->completion_flags & FI_MSG);
+	assert_true(efa_context->addr == addr);
+
+	ret = fi_cq_read(resource->cq, &cq_entry, 1);
+	/* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status(), which pulled one send_wr from g_ibv_submitted_wr_id_vec */
+	assert_int_equal(g_ibv_submitted_wr_id_cnt, 0);
+	assert_int_equal(ret, 1);
+
+	efa_unit_test_buff_destruct(&send_buff);
+}
+
+/**
+ * @brief test EFA CQ's fi_cq_read() works properly when rdma-core returns a
+ * success status for a recv operation.
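+ *
+ * Rough flow, as a sketch (the CQE is mocked, so nothing touches the wire):
+ *
+ *     fi_recv(ep, buf, len, desc, addr, &ctx);
+ *     ret = fi_cq_read(cq, &cq_entry, 1);    <- consumes the mocked CQE
+ *     assert_int_equal(ret, 1);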
+ */
+void test_efa_cq_read_recv_success(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_unit_test_buff recv_buff;
+	struct efa_base_ep *base_ep;
+	struct efa_context *efa_context;
+	struct fi_cq_data_entry cq_entry;
+	struct fi_context2 ctx;
+	fi_addr_t addr;
+	int ret;
+
+	test_efa_cq_read(resource, &addr, IBV_WC_RECV, IBV_WC_SUCCESS, 0,
+			 (struct efa_context *) &ctx);
+	efa_unit_test_buff_construct(&recv_buff, resource, 4096 /* buff_size */);
+
+	ret = fi_recv(resource->ep, recv_buff.buff, recv_buff.size,
+		      fi_mr_desc(recv_buff.mr), addr, &ctx);
+	assert_int_equal(ret, 0);
+
+	base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+	efa_context = (struct efa_context *) base_ep->efa_recv_wr_vec[base_ep->recv_wr_index].wr.wr_id;
+	assert_true(efa_context->completion_flags & FI_RECV);
+	assert_true(efa_context->completion_flags & FI_MSG);
+	assert_true(efa_context->addr == addr);
+
+	ret = fi_cq_read(resource->cq, &cq_entry, 1);
+	assert_int_equal(ret, 1);
+
+	efa_unit_test_buff_destruct(&recv_buff);
+}
+
+static void efa_cq_check_cq_err_entry(struct efa_resource *resource, int vendor_error) {
+	struct fi_cq_err_entry cq_err_entry = {0};
+	const char *strerror;
+	int ret;
+
+	/* Allocate memory to read CQ error */
+	cq_err_entry.err_data_size = EFA_ERROR_MSG_BUFFER_LENGTH;
+	cq_err_entry.err_data = malloc(cq_err_entry.err_data_size);
+	assert_non_null(cq_err_entry.err_data);
+
+	ret = fi_cq_readerr(resource->cq, &cq_err_entry, 0);
+	assert_true(cq_err_entry.err_data_size > 0);
+	strerror = fi_cq_strerror(resource->cq, cq_err_entry.prov_errno,
+				  cq_err_entry.err_data, NULL, 0);
+
+	assert_int_equal(ret, 1);
+	assert_int_not_equal(cq_err_entry.err, FI_SUCCESS);
+	assert_int_equal(cq_err_entry.prov_errno, vendor_error);
+	assert_true(strlen(strerror) > 0);
+
+	/* release the error-data buffer allocated above */
+	free(cq_err_entry.err_data);
+}
+
+/**
+ * @brief test EFA CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns a bad status for send.
+ *
+ * When the send operation fails, fi_cq_read() should return -FI_EAVAIL, which means an error is available.
+ * The user should then call fi_cq_readerr() to get an error CQ entry that contains the error code.
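+ *
+ * The pattern under test, as a minimal sketch (error-buffer sizing is up
+ * to the application):
+ *
+ *     ret = fi_cq_read(cq, &entry, 1);
+ *     if (ret == -FI_EAVAIL) {
+ *             struct fi_cq_err_entry err = {0};
+ *             fi_cq_readerr(cq, &err, 0);
+ *             printf("%s\n", fi_cq_strerror(cq, err.prov_errno,
+ *                                           err.err_data, NULL, 0));
+ *     }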
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_efa_cq_read_send_failure(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_unit_test_buff send_buff;
+	struct efa_base_ep *base_ep;
+	struct efa_context *efa_context;
+	struct fi_cq_data_entry cq_entry;
+	struct fi_context2 ctx;
+	fi_addr_t addr;
+	int ret;
+
+	test_efa_cq_read(resource, &addr, IBV_WC_SEND, IBV_WC_GENERAL_ERR,
+			 EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE, (struct efa_context *) &ctx);
+	efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */);
+
+	assert_int_equal(g_ibv_submitted_wr_id_cnt, 0);
+	ret = fi_send(resource->ep, send_buff.buff, send_buff.size,
+		      fi_mr_desc(send_buff.mr), addr, &ctx);
+	assert_int_equal(ret, 0);
+	assert_int_equal(g_ibv_submitted_wr_id_cnt, 1);
+
+	base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+	efa_context = (struct efa_context *) base_ep->qp->ibv_qp_ex->wr_id;
+	assert_true(efa_context->completion_flags & FI_SEND);
+	assert_true(efa_context->completion_flags & FI_MSG);
+	assert_true(efa_context->addr == addr);
+
+	ret = fi_cq_read(resource->cq, &cq_entry, 1);
+	/* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status(), which pulled one send_wr from g_ibv_submitted_wr_id_vec */
+	assert_int_equal(g_ibv_submitted_wr_id_cnt, 0);
+	assert_int_equal(ret, -FI_EAVAIL);
+
+	efa_cq_check_cq_err_entry(resource,
+				  EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
+
+	efa_unit_test_buff_destruct(&send_buff);
+}
+
+/**
+ * @brief test EFA CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns a bad status for recv.
+ *
+ * When the recv operation fails, fi_cq_read() should return -FI_EAVAIL, which means an error is available.
+ * The user should then call fi_cq_readerr() to get an error CQ entry that contains the error code.
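+ *
+ * Note on context plumbing, which the asserts below rely on: the provider
+ * stashes a struct efa_context pointer in the verbs wr_id, so a completion
+ * can be mapped back to the posting call, roughly (cqe_wr_id stands for the
+ * wr_id read from the CQE):
+ *
+ *     struct efa_context *c = (struct efa_context *) cqe_wr_id;
+ *     assert_true(c->completion_flags & FI_RECV);
+ *     assert_true(c->addr == addr);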
+ * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_efa_cq_read_recv_failure(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff recv_buff; + struct efa_base_ep *base_ep; + struct efa_context *efa_context; + struct fi_cq_data_entry cq_entry; + struct fi_context2 ctx; + fi_addr_t addr; + int ret; + + test_efa_cq_read(resource, &addr, IBV_WC_RECV, IBV_WC_GENERAL_ERR, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE, (struct efa_context *) &ctx); + efa_unit_test_buff_construct(&recv_buff, resource, 4096 /* buff_size */); + + ret = fi_recv(resource->ep, recv_buff.buff, recv_buff.size, + fi_mr_desc(recv_buff.mr), addr, &ctx); + assert_int_equal(ret, 0); + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + efa_context = (struct efa_context *) base_ep->efa_recv_wr_vec[base_ep->recv_wr_index].wr.wr_id; + assert_true(efa_context->completion_flags & FI_RECV); + assert_true(efa_context->completion_flags & FI_MSG); + assert_true(efa_context->addr == addr); + + ret = fi_cq_read(resource->cq, &cq_entry, 1); + assert_int_equal(ret, -FI_EAVAIL); + + efa_cq_check_cq_err_entry(resource, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + + efa_unit_test_buff_destruct(&recv_buff); +} diff --git a/prov/efa/test/efa_unit_test_domain.c b/prov/efa/test/efa_unit_test_domain.c index ccfa1c53149..29a21d29fb9 100644 --- a/prov/efa/test/efa_unit_test_domain.c +++ b/prov/efa/test/efa_unit_test_domain.c @@ -10,7 +10,7 @@ void test_efa_domain_open_ops_wrong_name(struct efa_resource **state) int ret; struct fi_efa_ops_domain *efa_domain_ops; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); ret = fi_open_ops(&resource->domain->fid, "arbitrary name", 0, (void **)&efa_domain_ops, NULL); assert_int_equal(ret, -FI_EINVAL); @@ -61,7 +61,7 @@ void test_efa_domain_open_ops_mr_query(struct efa_resource **state) { struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* set recv_ic_id as 0 */ g_efa_unit_test_mocks.efadv_query_mr = &efa_mock_efadv_query_mr_recv_ic_id_0; @@ -114,7 +114,7 @@ void test_efa_domain_open_ops_mr_query(struct efa_resource **state) { struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); test_efa_domain_open_ops_mr_query_common( resource, diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index de7454567c1..1c12b5913dd 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -37,7 +37,7 @@ void test_efa_rdm_ep_host_id(struct efa_resource **state, bool file_exists, char efa_env.host_id_file = host_id_file; } - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -109,27 +109,22 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin struct efa_rdm_ep *efa_rdm_ep; struct efa_rdm_pke *pkt_entry; uint64_t actual_peer_host_id = UINT64_MAX; - int ret; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; g_efa_unit_test_mocks.local_host_id = local_host_id; g_efa_unit_test_mocks.peer_host_id = peer_host_id; assert_false(actual_peer_host_id == 
g_efa_unit_test_mocks.peer_host_id); - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; efa_rdm_ep->host_id = g_efa_unit_test_mocks.local_host_id; - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(ret, 0); - efa_rdm_ep->shm_ep = NULL; - } - /* Create and register a fake peer */ assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); @@ -173,18 +168,18 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin expect_function_call(efa_mock_ibv_wr_send_verify_handshake_pkt_local_host_id_and_save_wr); /* Setup CQ */ - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_SUCCESS; - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; + ibv_cqx->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_slid = &efa_mock_ibv_read_slid_return_mock; + ibv_cqx->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + ibv_cqx->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->status = IBV_WC_SUCCESS; + ibv_cqx->wr_id = (uintptr_t)pkt_entry; expect_function_call(efa_mock_ibv_next_poll_check_function_called_and_return_mock); /* Receive handshake packet */ @@ -192,7 +187,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin will_return(efa_mock_ibv_next_poll_check_function_called_and_return_mock, ENOENT); will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); - will_return(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return(efa_mock_ibv_read_wc_flags_return_mock, 0); will_return(efa_mock_ibv_read_slid_return_mock, 
efa_rdm_ep_get_peer_ahn(efa_rdm_ep, peer_addr)); will_return(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); @@ -204,7 +199,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin */ will_return(efa_mock_ibv_end_poll_check_mock, NULL); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND); - will_return(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return(efa_mock_ibv_read_vendor_err_return_mock, FI_EFA_ERR_OTHER); will_return(efa_mock_ibv_start_poll_return_mock, IBV_WC_SUCCESS); @@ -217,8 +212,8 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin * We need to poll the CQ twice explicitly to point the CQE * to the saved send wr in handshake */ - efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)g_ibv_submitted_wr_id_vec[0]; + ibv_cqx->status = IBV_WC_GENERAL_ERR; + ibv_cqx->wr_id = (uintptr_t)g_ibv_submitted_wr_id_vec[0]; /* Progress the send wr to clean up outstanding tx ops */ cq_read_send_ret = fi_cq_read(resource->cq, &cq_entry, 1); @@ -277,7 +272,7 @@ void test_efa_rdm_ep_pkt_pool_flags(struct efa_resource **state) { struct efa_resource *resource = *state; efa_env.huge_page_setting = EFA_ENV_HUGE_PAGE_DISABLED; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); check_ep_pkt_pool_flags(resource->ep, OFI_BUFPOOL_NONSHARED); } @@ -295,7 +290,7 @@ void test_efa_rdm_ep_pkt_pool_page_alignment(struct efa_resource **state) struct efa_rdm_ep *efa_rdm_ep; struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_env.huge_page_setting = EFA_ENV_HUGE_PAGE_DISABLED; ret = fi_endpoint(resource->domain, resource->info, &ep, NULL); @@ -326,7 +321,7 @@ void test_efa_rdm_read_copy_pkt_pool_128_alignment(struct efa_resource **state) struct efa_resource *resource = *state; struct efa_domain *efa_domain = NULL; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* rx_readcopy_pkt_pool is only created when application requested FI_HMEM */ efa_domain = container_of(resource->domain, struct efa_domain, @@ -363,14 +358,14 @@ void test_efa_rdm_pke_get_available_copy_methods_align128(struct efa_resource ** struct efa_resource *resource = *state; bool local_read_available, gdrcopy_available, cuda_memcpy_available; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_mr.peer.iface = FI_HMEM_CUDA; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; /* p2p is available */ - efa_rdm_ep_domain(efa_rdm_ep)->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = true; + g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = true; efa_rdm_ep->hmem_p2p_opt = FI_HMEM_P2P_ENABLED; /* RDMA read is supported */ @@ -411,7 +406,8 @@ void test_efa_rdm_ep_dc_atomic_queue_before_handshake(struct efa_resource **stat int buf[1] = {0}, err, numaddr; struct efa_rdm_ope *txe; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); /* create a fake peer */ err = 
fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -434,12 +430,7 @@ void test_efa_rdm_ep_dc_atomic_queue_before_handshake(struct efa_resource **stat msg.op = FI_SUM; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC @@ -480,7 +471,8 @@ void test_efa_rdm_ep_dc_send_queue_before_handshake(struct efa_resource **state) int err, numaddr; struct efa_rdm_ope *txe; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -498,12 +490,7 @@ void test_efa_rdm_ep_dc_send_queue_before_handshake(struct efa_resource **state) msg.desc = NULL; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC @@ -545,7 +532,8 @@ void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(struct efa_resource ** int err, numaddr; int i; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -563,12 +551,7 @@ void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(struct efa_resource ** msg.desc = NULL; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC @@ -610,11 +593,11 @@ void test_efa_rdm_ep_rma_queue_before_handshake(struct efa_resource **state, int struct efa_rdm_ope *txe; struct efa_rdm_peer *peer; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); resource->hints->caps |= FI_MSG | FI_TAGGED | FI_RMA; resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); /* ensure we don't have RMA capability. 
*/
 	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
@@ -678,6 +661,79 @@ void test_efa_rdm_ep_read_queue_before_handshake(struct efa_resource **state)
 	test_efa_rdm_ep_rma_queue_before_handshake(state, ofi_op_read_req);
 }
+/**
+ * @brief When the local endpoint supports unsolicited write recv but the peer doesn't,
+ * fi_writedata (which uses rdma-write with imm) should fail with -FI_EOPNOTSUPP
+ *
+ * @param state struct efa_resource that is managed by the framework
+ */
+void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_ep_addr raw_addr = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t peer_addr;
+	int num_addr;
+	const int buf_len = 8;
+	char buf[8] = {0};
+	int err;
+	uint64_t rma_addr, rma_key;
+	struct efa_rdm_peer *peer;
+
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
+	resource->hints->caps |= FI_MSG | FI_TAGGED | FI_RMA;
+	resource->hints->domain_attr->mr_mode |= MR_MODE_BITS;
+	efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 22),
+						    resource->hints, true, true, EFA_PROV_NAME);
+
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	/**
+	 * TODO: It's better to mock this function
+	 * so we can test on platforms that don't
+	 * support rdma-write.
+	 */
+	if (!(efa_rdm_ep_support_rdma_write(efa_rdm_ep)))
+		skip();
+
+	/* Make local ep support unsolicited write recv */
+	efa_rdm_ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV;
+
+	/* create a fake peer */
+	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+	assert_int_equal(err, 0);
+	raw_addr.qpn = 1;
+	raw_addr.qkey = 0x1234;
+	num_addr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL);
+	assert_int_equal(num_addr, 1);
+
+	/* create a fake rma_key and address. fi_writedata should return before
+	 * they are needed. */
+	rma_key = 0x1234;
+	rma_addr = (uint64_t) &buf;
+
+	/*
+	 * Fake a peer that has completed the handshake and
+	 * does not support unsolicited write recv
+	 */
+	peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr);
+	peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED;
+	peer->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_RDMA_WRITE;
+	peer->extra_info[0] &= ~EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV;
+	/* make sure shm is not used */
+	peer->is_local = false;
+
+	err = fi_writedata(resource->ep, buf, buf_len,
+			   NULL, /* desc, not required */
+			   0x1234,
+			   peer_addr,
+			   rma_addr,
+			   rma_key,
+			   NULL); /* context */
+	assert_int_equal(err, -FI_EOPNOTSUPP);
+}
+
 /**
  * @brief verify that when shm was used to send a small message (<4k), no copy was performed.
* @@ -694,7 +750,7 @@ void test_efa_rdm_ep_send_with_shm_no_copy(struct efa_resource **state) char buff[8] = {0}; int err; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -733,16 +789,16 @@ void test_efa_rdm_ep_rma_without_caps(struct efa_resource **state) int err; uint64_t rma_addr, rma_key; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); resource->hints->caps |= FI_MSG | FI_TAGGED; resource->hints->caps &= ~FI_RMA; resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); /* ensure we don't have RMA capability. */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_int_equal( efa_rdm_ep->user_info->caps & FI_RMA, 0); + assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_RMA, 0); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -784,16 +840,16 @@ void test_efa_rdm_ep_atomic_without_caps(struct efa_resource **state) int err; uint64_t rma_addr, rma_key; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); resource->hints->caps |= FI_MSG | FI_TAGGED; resource->hints->caps &= ~FI_ATOMIC; resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); /* ensure we don't have ATOMIC capability. 
*/ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_int_equal( efa_rdm_ep->user_info->caps & FI_ATOMIC, 0); + assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_ATOMIC, 0); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -836,13 +892,10 @@ void test_efa_rdm_ep_getopt(struct efa_resource **state, size_t opt_len, int exp FI_OPT_EFA_EMULATED_READ, FI_OPT_EFA_EMULATED_WRITE, FI_OPT_EFA_EMULATED_ATOMICS, - FI_OPT_EFA_USE_DEVICE_RDMA, - FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES, - FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES }; size_t num_opt_names = sizeof(opt_names) / sizeof(int); - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); for (i = 0; i < num_opt_names; i++) { opt_len_temp = opt_len; @@ -867,20 +920,13 @@ void test_efa_rdm_ep_setopt_shared_memory_permitted(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_rdm_ep *ep; - bool optval = false; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, - FI_OPT_SHARED_MEMORY_PERMITTED, &optval, - sizeof(optval)), - 0); - - assert_int_equal(fi_enable(resource->ep), 0); - assert_null(ep->shm_ep); } @@ -895,7 +941,7 @@ void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(struct efa_reso { struct efa_resource *resource = *state; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); /* fi_setopt should always succeed */ assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, @@ -947,37 +993,36 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, bool cuda_p2p_supported, bool expected_use_zcpy_rx) { - struct efa_domain *efa_domain; struct efa_rdm_ep *ep; size_t max_msg_size = 1000; - size_t inject_size = 0; + size_t inject_msg_size = 0; + size_t inject_rma_size = 0; bool shm_permitted = false; + ofi_hmem_disable_p2p = cuda_p2p_disabled; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, false, true); - - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); + resource->hints, false, true, EFA_PROV_NAME); /* System memory P2P should always be enabled */ - assert_true(efa_domain->hmem_info[FI_HMEM_SYSTEM].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_SYSTEM].p2p_disabled_by_user); - assert_true(efa_domain->hmem_info[FI_HMEM_SYSTEM].p2p_supported_by_device); + assert_true(g_efa_hmem_info[FI_HMEM_SYSTEM].initialized); + assert_true(g_efa_hmem_info[FI_HMEM_SYSTEM].p2p_supported_by_device); /** * We want to be able to run this test on any platform: * 1. Fake CUDA support. * 2. Disable all other hmem ifaces. 
*/
-	efa_domain->hmem_info[FI_HMEM_CUDA].initialized = true;
-	efa_domain->hmem_info[FI_HMEM_CUDA].p2p_disabled_by_user = cuda_p2p_disabled;
-	efa_domain->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = cuda_p2p_supported;
+	g_efa_hmem_info[FI_HMEM_CUDA].initialized = true;
+	g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = cuda_p2p_supported;
-	efa_domain->hmem_info[FI_HMEM_NEURON].initialized = false;
-	efa_domain->hmem_info[FI_HMEM_SYNAPSEAI].initialized = false;
+	g_efa_hmem_info[FI_HMEM_NEURON].initialized = false;
+	g_efa_hmem_info[FI_HMEM_SYNAPSEAI].initialized = false;
 	ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+	if (cuda_p2p_supported)
+		ep->hmem_p2p_opt = FI_HMEM_P2P_ENABLED;
+
 	/* Set sufficiently small max_msg_size */
 	assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, &max_msg_size, sizeof max_msg_size), 0);
@@ -986,19 +1031,30 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource,
 	assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_SHARED_MEMORY_PERMITTED, &shm_permitted, sizeof shm_permitted), 0);
-	assert_true(ep->max_msg_size == max_msg_size);
+	assert_true(ep->base_ep.max_msg_size == max_msg_size);
 	/* Enable EP */
 	assert_int_equal(fi_enable(resource->ep), 0);
 	assert_true(ep->use_zcpy_rx == expected_use_zcpy_rx);
+
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE,
+				   &inject_msg_size, &(size_t){sizeof inject_msg_size}), 0);
+	assert_int_equal(ep->base_ep.inject_msg_size, inject_msg_size);
+
 	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE,
-			 &inject_size, &(size_t){sizeof inject_size}), 0);
-	assert_int_equal(ep->inject_size, inject_size);
-	if (expected_use_zcpy_rx)
-		assert_int_equal(inject_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size);
-	else
-		assert_int_equal(inject_size, resource->info->tx_attr->inject_size);
+			 &inject_rma_size, &(size_t){sizeof inject_rma_size}), 0);
+	assert_int_equal(ep->base_ep.inject_rma_size, inject_rma_size);
+
+	if (expected_use_zcpy_rx) {
+		assert_int_equal(inject_msg_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size);
+		assert_int_equal(inject_rma_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size);
+	} else {
+		assert_int_equal(inject_msg_size, resource->info->tx_attr->inject_size);
+		assert_int_equal(inject_rma_size, resource->info->tx_attr->inject_size);
+	}
+	/* restore global variable */
+	ofi_hmem_disable_p2p = 0;
 }
 /**
@@ -1012,7 +1068,7 @@ void test_efa_rdm_ep_user_zcpy_rx_disabled(struct efa_resource **state)
 {
 	struct efa_resource *resource = *state;
-	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM);
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
 	assert_non_null(resource->hints);
 	resource->hints->mode = FI_MSG_PREFIX;
@@ -1022,19 +1078,19 @@ void test_efa_rdm_ep_user_zcpy_rx_disabled(struct efa_resource **state)
 }
 /**
- * @brief Verify zcpy_rx is enabled if CUDA P2P is explictly disabled
+ * @brief Verify zcpy_rx is disabled if CUDA P2P is explicitly disabled
 */
-void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_happy(struct efa_resource **state)
+void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_disabled(struct efa_resource **state)
 {
 	struct efa_resource *resource = *state;
-	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM);
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
 	assert_non_null(resource->hints);
 	resource->hints->mode = FI_MSG_PREFIX;
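+	/* impl args in the call below, for reference: (resource,
+	 * cuda_p2p_disabled, cuda_p2p_supported, expected_use_zcpy_rx);
+	 * p2p is disabled by the user here, so zero-copy rx is expected
+	 * to be off */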
 	resource->hints->caps = FI_MSG;
-	test_efa_rdm_ep_use_zcpy_rx_impl(resource, true, false, true);
+	test_efa_rdm_ep_use_zcpy_rx_impl(resource, true, false, false);
 }
 /**
@@ -1044,7 +1100,7 @@ void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(struct efa_resource **state
 {
 	struct efa_resource *resource = *state;
-	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM);
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
 	assert_non_null(resource->hints);
 	resource->hints->tx_attr->msg_order = FI_ORDER_SAS;
@@ -1062,7 +1118,7 @@ void test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy(struct efa_resource **
 {
 	struct efa_resource *resource = *state;
-	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM);
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
 	assert_non_null(resource->hints);
 	resource->hints->mode = FI_MSG_PREFIX;
@@ -1078,7 +1134,7 @@ void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local(struct efa_resource
 {
 	struct efa_resource *resource = *state;
-	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM);
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
 	assert_non_null(resource->hints);
 	resource->hints->caps = FI_MSG;
@@ -1092,7 +1148,7 @@ void test_efa_rdm_ep_close_discard_posted_recv(struct efa_resource **state)
 	struct efa_resource *resource = *state;
 	char buf[16];
-	efa_unit_test_resource_construct(resource, FI_EP_RDM);
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME);
 	/* Post recv and then close ep */
 	assert_int_equal(fi_recv(resource->ep, (void *) buf, 16, NULL, FI_ADDR_UNSPEC, NULL), 0);
@@ -1112,7 +1168,7 @@ void test_efa_rdm_ep_zcpy_recv_cancel(struct efa_resource **state)
 	struct fi_context cancel_context = {0};
 	struct efa_unit_test_buff recv_buff;
-	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM);
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
 	assert_non_null(resource->hints);
 	resource->hints->caps = FI_MSG;
@@ -1135,6 +1191,46 @@ void test_efa_rdm_ep_zcpy_recv_cancel(struct efa_resource **state)
 	free(recv_buff.buff);
 }
+/**
+ * @brief When the user posts more fi_recv than the rx size, we should return -FI_EAGAIN
+ * and make sure no rx entry is leaked
+ */
+void test_efa_rdm_ep_zcpy_recv_eagain(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_unit_test_buff recv_buff;
+	int i;
+	struct efa_rdm_ep *efa_rdm_ep;
+
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
+	assert_non_null(resource->hints);
+
+	resource->hints->caps = FI_MSG;
+
+	/* enable zero-copy recv mode in ep */
+	test_efa_rdm_ep_use_zcpy_rx_impl(resource, false, true, true);
+
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	/* Construct a recv buffer with mr */
+	efa_unit_test_buff_construct(&recv_buff, resource, 16);
+
+	for (i = 0; i < efa_rdm_ep->base_ep.info->rx_attr->size; i++)
+		assert_int_equal(fi_recv(resource->ep, recv_buff.buff, recv_buff.size, fi_mr_desc(recv_buff.mr), FI_ADDR_UNSPEC, NULL), 0);
+
+	/* we should have rx-size number of rx entries before and after the extra recv post */
+	assert_true(efa_unit_test_get_dlist_length(&efa_rdm_ep->rxe_list) == efa_rdm_ep->base_ep.info->rx_attr->size);
+	assert_int_equal(fi_recv(resource->ep, recv_buff.buff, recv_buff.size, fi_mr_desc(recv_buff.mr), FI_ADDR_UNSPEC, NULL), -FI_EAGAIN);
+	assert_true(efa_unit_test_get_dlist_length(&efa_rdm_ep->rxe_list) == efa_rdm_ep->base_ep.info->rx_attr->size);
+
+	/**
+	 * the buf is still posted to rdma-core, so unregistering mr can
+	 * return non-zero. Currently we ignore this failure.
+	 */
+	(void) fi_close(&recv_buff.mr->fid);
+	free(recv_buff.buff);
+}
+
 /**
  * @brief when efa_rdm_ep_post_handshake_error failed due to pkt pool exhaustion,
  * make sure both txe is cleaned
@@ -1152,8 +1248,10 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res
 	int err, numaddr;
 	struct efa_rdm_pke **pkt_entry_vec;
 	int i;
+	size_t tx_size;
-	efa_unit_test_resource_construct(resource, FI_EP_RDM);
+	/* disable shm to force using efa device to send */
+	efa_unit_test_resource_construct_rdm_shm_disabled(resource);
 	/* create a fake peer */
 	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
@@ -1164,12 +1262,8 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res
 	assert_int_equal(numaddr, 1);
 	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-	/* close shm_ep to force efa_rdm_ep to use efa device to send */
-	if (efa_rdm_ep->shm_ep) {
-		err = fi_close(&efa_rdm_ep->shm_ep->fid);
-		assert_int_equal(err, 0);
-		efa_rdm_ep->shm_ep = NULL;
-	}
+	tx_size = efa_rdm_ep->base_ep.info->tx_attr->size;
+
 	/* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think
 	 * a REQ packet has been sent to the peer (so no need to send again)
 	 * handshake has not been received, so we do not know whether the peer support DC
@@ -1178,11 +1272,11 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res
 	peer->flags = EFA_RDM_PEER_REQ_SENT;
 	peer->is_local = false;
-	pkt_entry_vec = calloc(efa_rdm_ep->tx_size, sizeof(struct efa_rdm_pke *));
+	pkt_entry_vec = calloc(tx_size, sizeof(struct efa_rdm_pke *));
 	assert_non_null(pkt_entry_vec);
 	/* Exhaust the tx pkt pool */
-	for (i = 0; i < efa_rdm_ep->tx_size; i++) {
+	for (i = 0; i < tx_size; i++) {
 		pkt_entry_vec[i] = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_tx_pkt_pool, EFA_RDM_PKE_FROM_EFA_TX_POOL);
 		assert_non_null(pkt_entry_vec[i]);
 	}
@@ -1192,8 +1286,369 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res
 	assert_int_equal(efa_rdm_ep_post_handshake(efa_rdm_ep, peer), -FI_EAGAIN);
 	assert_true(dlist_empty(&efa_rdm_ep->txe_list));
-	for (i = 0; i < efa_rdm_ep->tx_size; i++)
+	for (i = 0; i < tx_size; i++)
 		efa_rdm_pke_release_tx(pkt_entry_vec[i]);
 	free(pkt_entry_vec);
 }
+
+static
+void test_efa_rdm_ep_rx_refill_impl(struct efa_resource **state, int threshold, int rx_size)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_rdm_pke *pkt_entry;
+	int i;
+	size_t threshold_orig;
+
+	if (threshold < 4 || rx_size < 4) {
+		fprintf(stderr, "Too small threshold or rx_size for this test\n");
+		fail();
+	}
+
+	threshold_orig = efa_env.internal_rx_refill_threshold;
+
+	efa_env.internal_rx_refill_threshold = threshold;
+
+	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME);
+	assert_non_null(resource->hints);
+	resource->hints->rx_attr->size = rx_size;
+	efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14),
+						    resource->hints, true, true, EFA_PROV_NAME);
+
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+	assert_int_equal(efa_rdm_ep_get_rx_pool_size(efa_rdm_ep), rx_size);
+
+	/* Grow the rx pool and post rx pkts */
+	efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep);
+	assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, efa_rdm_ep_get_rx_pool_size(efa_rdm_ep));
+
+
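+	/* Refill rule exercised below, in sketch form:
+	 *     if (efa_rx_pkts_to_post >= MIN(threshold, rx_size))
+	 *             refill the rx pkt pool;
+	 *     else
+	 *             wait for more rx pkts to be released;
+	 */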
assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0);
+	for (i = 0; i < 4; i++) {
+		pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i);
+		assert_non_null(pkt_entry);
+		efa_rdm_pke_release_rx(pkt_entry);
+	}
+	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 4);
+
+	efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep);
+
+	/**
+	 * efa_rx_pkts_to_post < FI_EFA_RX_REFILL_THRESHOLD
+	 * pkts should NOT be refilled
+	 */
+	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 4);
+	assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size);
+
+	/* releasing more pkts to reach the threshold or rx_size */
+	for (i = 4; i < MIN(rx_size, threshold); i++) {
+		pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i);
+		assert_non_null(pkt_entry);
+		efa_rdm_pke_release_rx(pkt_entry);
+	}
+
+	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, i);
+
+	efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep);
+
+	/**
+	 * efa_rx_pkts_to_post == min(FI_EFA_RX_REFILL_THRESHOLD, FI_EFA_RX_SIZE)
+	 * pkts should be refilled
+	 */
+	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0);
+	assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size + i);
+
+	/* recover the original value */
+	efa_env.internal_rx_refill_threshold = threshold_orig;
+}
+
+void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(struct efa_resource **state)
+{
+	test_efa_rdm_ep_rx_refill_impl(state, 8, 64);
+}
+
+void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(struct efa_resource **state)
+{
+	test_efa_rdm_ep_rx_refill_impl(state, 128, 64);
+}
+
+/**
+ * @brief when unsolicited write recv is supported (by device + env),
+ * efa_rdm_ep_support_unsolicited_write_recv
+ * should return true, otherwise it should return false
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_efa_rdm_ep_support_unsolicited_write_recv(struct efa_resource **state)
+{
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_resource *resource = *state;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME);
+
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	assert_int_equal(efa_use_unsolicited_write_recv(),
+			 efa_rdm_ep_support_unsolicited_write_recv(efa_rdm_ep));
+}
+
+/**
+ * @brief Test the default operational sizes for efa_rdm_ep
+ *
+ * @param state
+ */
+void test_efa_rdm_ep_default_sizes(struct efa_resource **state)
+{
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_resource *resource = *state;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME);
+
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	/* sizes shared with base_ep */
+	assert_int_equal(efa_rdm_ep->base_ep.max_msg_size, resource->info->ep_attr->max_msg_size);
+	assert_int_equal(efa_rdm_ep->base_ep.max_rma_size, resource->info->ep_attr->max_msg_size);
+	assert_int_equal(efa_rdm_ep->base_ep.inject_msg_size, resource->info->tx_attr->inject_size);
+	assert_int_equal(efa_rdm_ep->base_ep.inject_rma_size, resource->info->tx_attr->inject_size);
+	assert_int_equal(efa_rdm_ep->base_ep.rnr_retry, EFA_RDM_DEFAULT_RNR_RETRY);
+
+	/* efa_rdm_ep's own fields */
+	assert_int_equal(efa_rdm_ep->max_tagged_size, resource->info->ep_attr->max_msg_size);
+	assert_int_equal(efa_rdm_ep->max_atomic_size, resource->info->ep_attr->max_msg_size);
+	assert_int_equal(efa_rdm_ep->inject_tagged_size, resource->info->tx_attr->inject_size);
+	assert_int_equal(efa_rdm_ep->inject_atomic_size, resource->info->tx_attr->inject_size);
+}
+
+/**
+ * @brief Test the fi_endpoint API for efa_ep,
+ * for the rdm ep type (the dgram ep type should
+ * have the same logic)
+ * @param state
+ */
+void test_efa_ep_open(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_base_ep *efa_ep;
+	struct efa_domain *efa_domain;
+
+	efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME);
+
+	efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+	efa_domain = container_of(resource->domain, struct efa_domain,
+				  util_domain.domain_fid);
+
+	/* Check the default size limits */
+	assert_true(efa_ep->max_msg_size == efa_domain->device->ibv_port_attr.max_msg_sz);
+	assert_true(efa_ep->max_rma_size == efa_domain->device->max_rdma_size);
+	assert_true(efa_ep->inject_msg_size == efa_domain->device->efa_attr.inline_buf_size);
+	/* TODO: update inject_rma_size to inline size after firmware
+	 * supports inline rdma write */
+	assert_true(efa_ep->inject_rma_size == 0);
+	assert_int_equal(efa_ep->rnr_retry, EFA_RNR_INFINITE_RETRY);
+}
+
+/**
+ * @brief Test the fi_cancel API for efa_ep
+ * (for the rdm ep type, because dgram logic should be the same).
+ * It should return -FI_ENOSYS as the device doesn't support it.
+ * @param state
+ */
+void test_efa_ep_cancel(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	int ret;
+
+	efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME);
+
+	ret = fi_cancel((struct fid *)resource->ep, NULL);
+	assert_int_equal(ret, -FI_ENOSYS);
+}
+
+/**
+ * @brief Test the fi_getopt API for efa_ep
+ *
+ * @param state
+ */
+void test_efa_ep_getopt(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	int optval_int;
+	bool optval_bool;
+	size_t optval_size_t;
+	size_t optlen;
+	struct efa_base_ep *efa_ep;
+
+	efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME);
+
+	efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+
+	optlen = sizeof(optval_int);
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_FI_HMEM_P2P, &optval_int, &optlen), 0);
+	assert_int_equal(optval_int, FI_HMEM_P2P_REQUIRED);
+
+	optlen = sizeof(optval_bool);
+
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_EMULATED_READ, &optval_bool, &optlen), 0);
+	assert_false(optval_bool);
+
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_EMULATED_WRITE, &optval_bool, &optlen), 0);
+	assert_false(optval_bool);
+
+	optlen = sizeof(optval_size_t);
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &optval_size_t, &optlen), 0);
+	assert_int_equal(optval_size_t, efa_ep->rnr_retry);
+
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, &optval_size_t, &optlen), 0);
+	assert_int_equal(optval_size_t, efa_ep->max_msg_size);
+
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_RMA_SIZE, &optval_size_t, &optlen), 0);
+	assert_int_equal(optval_size_t, efa_ep->max_rma_size);
+
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, &optval_size_t, &optlen), 0);
+	assert_int_equal(optval_size_t, efa_ep->inject_msg_size);
+
+	assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, &optval_size_t, &optlen), 0);
+	assert_int_equal(optval_size_t, efa_ep->inject_rma_size);
+}
+
+/**
+ * @brief Test the fi_setopt API for efa_ep.
+ * When RMA is requested, FI_OPT_EFA_USE_DEVICE_RDMA
+ * cannot be set to false
+ * @param state
+ */
+void test_efa_ep_setopt_use_device_rdma(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	bool optval;
+	struct efa_base_ep *efa_ep;
+
+	efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME);
+
+	efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+
+	/* Hard code RMA caps in ep->info for local testing purposes */
+	efa_ep->info->caps |= FI_RMA;
+
+	/* Disabling rdma is not allowed when the user requests FI_RMA */
+	optval = false;
+	assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_USE_DEVICE_RDMA, &optval, sizeof(optval)), -FI_EOPNOTSUPP);
+}
+
+/**
+ * @brief Test the fi_setopt API for efa_ep.
+ * FI_OPT_FI_HMEM_P2P cannot be set to FI_HMEM_P2P_DISABLED
+ * @param state
+ */
+void test_efa_ep_setopt_hmem_p2p(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	int optval;
+	int optvals[] = {
+		FI_HMEM_P2P_DISABLED,
+		FI_HMEM_P2P_ENABLED,
+		FI_HMEM_P2P_PREFERRED,
+		FI_HMEM_P2P_REQUIRED,
+	};
+	size_t num_optvals = sizeof(optvals) / sizeof(int);
+	int i, expected_return;
+
+	efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME);
+
+	/* FI_HMEM_P2P_DISABLED is not allowed */
+	for (i = 0; i < num_optvals; i++) {
+		optval = optvals[i];
+		expected_return = (optval == FI_HMEM_P2P_DISABLED) ? -FI_EOPNOTSUPP : FI_SUCCESS;
+		assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_FI_HMEM_P2P, &optval, sizeof(optval)), expected_return);
+	}
+}
+
+/**
+ * @brief Test the fi_setopt API for efa_ep with FI_OPT_EFA_RNR_RETRY
+ * @param state
+ */
+void test_efa_ep_setopt_rnr_retry(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	size_t optval;
+	struct efa_base_ep *efa_ep;
+
+	efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME);
+
+	efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+	assert_false(efa_ep->efa_qp_enabled);
+
+	optval = 7;
+	assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &optval, sizeof(optval)), FI_SUCCESS);
+	assert_int_equal(efa_ep->rnr_retry, optval);
+
+	/* fake the qp enabled status to allow local testing */
+	efa_ep->efa_qp_enabled = true;
+	/* fi_setopt should fail when it's called after ep enable */
+	assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &optval, sizeof(optval)), -FI_EINVAL);
+	/* recover */
+	efa_ep->efa_qp_enabled = false;
+}
+
+/**
+ * @brief Test the fi_setopt API for efa_ep with FI_OPT_*_SIZE
+ * @param state
+ */
+void test_efa_ep_setopt_sizes(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	size_t optval;
+	struct efa_base_ep *efa_ep;
+
+	efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME);
+
+	efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid);
+
+	size_t size_thresholds[] = {
+		[FI_OPT_MAX_MSG_SIZE] = (size_t) efa_ep->domain->device->ibv_port_attr.max_msg_sz,
+		[FI_OPT_MAX_RMA_SIZE] = (size_t) efa_ep->domain->device->max_rdma_size,
+		[FI_OPT_INJECT_MSG_SIZE] = (size_t) efa_ep->domain->device->efa_attr.inline_buf_size,
+		[FI_OPT_INJECT_RMA_SIZE] = (size_t) 0,
+	};
+	int optnames[] = {
+		FI_OPT_MAX_MSG_SIZE,
+		FI_OPT_MAX_RMA_SIZE,
+		FI_OPT_INJECT_MSG_SIZE,
+		FI_OPT_INJECT_RMA_SIZE,
+	};
+	size_t num_optnames = sizeof(optnames) / sizeof(int);
+ size_t num_optnames = sizeof(optnames) / sizeof(int); + int i, optname; + + for (i = 0; i < num_optnames; i++) { + optname = optnames[i]; + + /* setting optval <= the threshold is allowed */ + optval = size_thresholds[optname] / 2; + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, optname, &optval, sizeof(optval)), FI_SUCCESS); + + /* setting optval > the threshold is NOT allowed */ + optval = size_thresholds[optname] + 10; + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, optname, &optval, sizeof(optval)), -FI_EINVAL); + } +} + +/** + * @brief Test the fi_ep_bind and fi_enable API for efa_ep + * + * @param state + */ +void test_efa_ep_bind_and_enable(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_base_ep *efa_ep; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + + assert_true(efa_ep->efa_qp_enabled); + /* efa-direct should not have a user recv qp */ + assert_true(efa_ep->user_recv_qp == NULL); +} \ No newline at end of file diff --git a/prov/efa/test/efa_unit_test_hmem.c b/prov/efa/test/efa_unit_test_hmem.c index 55734af286a..2b278bddfba 100644 --- a/prov/efa/test/efa_unit_test_hmem.c +++ b/prov/efa/test/efa_unit_test_hmem.c @@ -7,8 +7,7 @@ #if HAVE_NEURON /** * @brief Verify when neuron_alloc failed (return null), - * efa_domain_open, which call efa_hmem_info_update_neuron - * when HAVE_NEURON=1, will still return 0 but leave + * efa_hmem_info_initialize will still return 0 but leave * efa_hmem_info[FI_HMEM_NEURON].initialized and * efa_hmem_info[FI_HMEM_NEURON].p2p_supported_by_device as false. * @@ -18,36 +17,30 @@ void test_efa_hmem_info_update_neuron(struct efa_resource **state) { int ret; struct efa_resource *resource = *state; - struct efa_domain *efa_domain; uint32_t efa_device_caps_orig; bool neuron_initialized_orig; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); assert_int_equal(ret, 0); - ret = fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL); - assert_int_equal(ret, 0); - neuron_initialized_orig = hmem_ops[FI_HMEM_NEURON].initialized; hmem_ops[FI_HMEM_NEURON].initialized = true; efa_device_caps_orig = g_device_list[0].device_caps; g_device_list[0].device_caps |= EFADV_DEVICE_ATTR_CAPS_RDMA_READ; g_efa_unit_test_mocks.neuron_alloc = &efa_mock_neuron_alloc_return_null; - ret = fi_domain(resource->fabric, resource->info, &resource->domain, NULL); + ret = efa_hmem_info_initialize(); /* recover the modified global variables before doing check */ hmem_ops[FI_HMEM_NEURON].initialized = neuron_initialized_orig; g_device_list[0].device_caps = efa_device_caps_orig; assert_int_equal(ret, 0); - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); - assert_false(efa_domain->hmem_info[FI_HMEM_NEURON].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); + assert_false(g_efa_hmem_info[FI_HMEM_NEURON].initialized); + assert_false(g_efa_hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); } /** @@ -60,19 +53,17 @@ void test_efa_hmem_info_disable_p2p_neuron(struct efa_resource **state) { int ret; struct efa_resource *resource = *state; - struct efa_domain *efa_domain; uint32_t efa_device_caps_orig; bool neuron_initialized_orig; - 
resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + ofi_hmem_disable_p2p = 1; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); assert_int_equal(ret, 0); - ret = fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL); - assert_int_equal(ret, 0); - neuron_initialized_orig = hmem_ops[FI_HMEM_NEURON].initialized; hmem_ops[FI_HMEM_NEURON].initialized = true; efa_device_caps_orig = g_device_list[0].device_caps; @@ -80,8 +71,7 @@ void test_efa_hmem_info_disable_p2p_neuron(struct efa_resource **state) /* neuron_alloc should not be called when p2p is disabled. efa_mock_neuron_alloc_return_mock will fail the test when it is called. */ g_efa_unit_test_mocks.neuron_alloc = efa_mock_neuron_alloc_return_mock; - ofi_hmem_disable_p2p = 1; - ret = fi_domain(resource->fabric, resource->info, &resource->domain, NULL); + ret = efa_hmem_info_initialize(); /* recover the modified global variables before doing check */ ofi_hmem_disable_p2p = 0; @@ -89,11 +79,8 @@ void test_efa_hmem_info_disable_p2p_neuron(struct efa_resource **state) hmem_ops[FI_HMEM_NEURON].initialized = neuron_initialized_orig; assert_int_equal(ret, 0); - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); - assert_true(efa_domain->hmem_info[FI_HMEM_NEURON].p2p_disabled_by_user); - assert_true(efa_domain->hmem_info[FI_HMEM_NEURON].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); + assert_true(g_efa_hmem_info[FI_HMEM_NEURON].initialized); + assert_false(g_efa_hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); } #else void test_efa_hmem_info_update_neuron() @@ -118,36 +105,30 @@ void test_efa_hmem_info_disable_p2p_cuda(struct efa_resource **state) { int ret; struct efa_resource *resource = *state; - struct efa_domain *efa_domain; bool cuda_initialized_orig; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + ofi_hmem_disable_p2p = 1; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); assert_int_equal(ret, 0); - ret = fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL); - assert_int_equal(ret, 0); - cuda_initialized_orig = hmem_ops[FI_HMEM_CUDA].initialized; hmem_ops[FI_HMEM_CUDA].initialized = true; /* ofi_cudaMalloc should not be called when p2p is disabled. efa_mock_ofi_cudaMalloc_return_mock will fail the test when it is called. 
*/ g_efa_unit_test_mocks.ofi_cudaMalloc = efa_mock_ofi_cudaMalloc_return_mock; - ofi_hmem_disable_p2p = 1; - ret = fi_domain(resource->fabric, resource->info, &resource->domain, NULL); + ret = efa_hmem_info_initialize(); /* recover the modified global variables before doing check */ ofi_hmem_disable_p2p = 0; hmem_ops[FI_HMEM_CUDA].initialized = cuda_initialized_orig; assert_int_equal(ret, 0); - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); - assert_true(efa_domain->hmem_info[FI_HMEM_CUDA].p2p_disabled_by_user); - assert_true(efa_domain->hmem_info[FI_HMEM_CUDA].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device); + assert_true(g_efa_hmem_info[FI_HMEM_CUDA].initialized); + assert_false(g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device); } #else void test_efa_hmem_info_disable_p2p_cuda() diff --git a/prov/efa/test/efa_unit_test_info.c b/prov/efa/test/efa_unit_test_info.c index 6a53ea381a8..febb386f4f3 100644 --- a/prov/efa/test/efa_unit_test_info.c +++ b/prov/efa/test/efa_unit_test_info.c @@ -15,7 +15,7 @@ void test_info_open_ep_with_wrong_info() struct fid_ep *ep = NULL; int err; - hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); err = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, hints, &info); assert_int_equal(err, 0); @@ -113,7 +113,7 @@ void test_info_tx_rx_msg_order_rdm_order_none(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); test_info_tx_rx_msg_order_from_hints(resource->hints, 0); @@ -123,7 +123,7 @@ void test_info_tx_rx_msg_order_rdm_order_sas(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->msg_order = FI_ORDER_SAS; @@ -135,7 +135,7 @@ void test_info_tx_rx_msg_order_dgram_order_none(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); assert_non_null(resource->hints); test_info_tx_rx_msg_order_from_hints(resource->hints, 0); @@ -149,7 +149,7 @@ void test_info_tx_rx_msg_order_dgram_order_sas(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->msg_order = FI_ORDER_SAS; @@ -157,11 +157,94 @@ void test_info_tx_rx_msg_order_dgram_order_sas(struct efa_resource **state) test_info_tx_rx_msg_order_from_hints(resource->hints, -FI_ENODATA); } +/** + * @brief Verify max order size is set correctly according to hints + * + * @param hints hints + * @param expected_ret expected rc of fi_getinfo + * @param expected_size expected value of max_order_*_size. Ignored when expected_ret is non-zero. 
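+ * @note info starts as NULL so the unconditional fi_freeinfo(info) below + * stays safe when fi_getinfo fails and leaves info untouched.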
+ */ +static void +test_info_max_order_size_from_hints(struct fi_info *hints, int expected_ret, size_t expected_size) +{ + struct fi_info *info = NULL; + int err; + + err = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, NULL, 0ULL, hints, &info); + + assert_int_equal(err, expected_ret); + + if (expected_ret == FI_SUCCESS) { + assert_true(info->ep_attr->max_order_raw_size == expected_size); + assert_true(info->ep_attr->max_order_war_size == expected_size); + assert_true(info->ep_attr->max_order_waw_size == expected_size); + } + + fi_freeinfo(info); +} + +/** + * DGRAM ep type doesn't support FI_ATOMIC; fi_getinfo should return + * -FI_ENODATA when FI_ATOMIC is requested in hints. + */ +void test_info_max_order_size_dgram_with_atomic(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); + assert_non_null(resource->hints); + + resource->hints->caps = FI_ATOMIC; + + test_info_max_order_size_from_hints(resource->hints, -FI_ENODATA, 0); +} + +/** + * RDM ep type supports FI_ATOMIC. When FI_ORDER_ATOMIC_* is NOT requested, + * max_order_*_size should be 0 + */ +void test_info_max_order_size_rdm_with_atomic_no_order(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); + assert_non_null(resource->hints); + + resource->hints->caps = FI_ATOMIC; + resource->hints->domain_attr->mr_mode |= FI_MR_VIRT_ADDR | FI_MR_PROV_KEY; + + test_info_max_order_size_from_hints(resource->hints, FI_SUCCESS, 0); +} + +/** + * RDM ep type supports FI_ATOMIC. When FI_ORDER_ATOMIC_* is requested, + * max_order_*_size should be the max atomic size derived from mtu and headers + */ +void test_info_max_order_size_rdm_with_atomic_order(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + size_t max_atomic_size = g_device_list[0].rdm_info->ep_attr->max_msg_size + - sizeof(struct efa_rdm_rta_hdr) + - g_device_list[0].rdm_info->src_addrlen + - EFA_RDM_IOV_LIMIT * sizeof(struct fi_rma_iov); + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); + assert_non_null(resource->hints); + + resource->hints->caps = FI_ATOMIC; + resource->hints->domain_attr->mr_mode |= FI_MR_VIRT_ADDR | FI_MR_PROV_KEY; + resource->hints->tx_attr->msg_order |= FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW; + resource->hints->rx_attr->msg_order = resource->hints->tx_attr->msg_order; + + test_info_max_order_size_from_hints(resource->hints, FI_SUCCESS, max_atomic_size); +} + void test_info_tx_rx_op_flags_rdm(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->op_flags = FI_DELIVERY_COMPLETE; @@ -173,7 +256,7 @@ void test_info_tx_rx_size_rdm(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->size = 16; @@ -234,7 +317,7 @@ void test_info_check_shm_info_hmem() { struct fi_info *hints; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->caps |= FI_HMEM; test_info_check_shm_info_from_hints(hints); @@ -247,7 
+330,7 @@ void test_info_check_shm_info_op_flags() { struct fi_info *hints; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->tx_attr->op_flags |= FI_COMPLETION; hints->rx_attr->op_flags |= FI_COMPLETION; @@ -262,7 +345,7 @@ void test_info_check_shm_info_threading() { struct fi_info *hints; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->domain_attr->threading = FI_THREAD_DOMAIN; test_info_check_shm_info_from_hints(hints); @@ -280,7 +363,7 @@ void test_info_check_hmem_cuda_support_on_api_lt_1_18() if (!hmem_ops[FI_HMEM_CUDA].initialized) skip(); - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->caps |= FI_HMEM; hints->domain_attr->mr_mode |= FI_MR_HMEM; @@ -319,7 +402,7 @@ void test_info_check_hmem_cuda_support_on_api_ge_1_18() if (!hmem_ops[FI_HMEM_CUDA].initialized) skip(); - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->caps |= FI_HMEM; hints->domain_attr->mr_mode |= FI_MR_HMEM; @@ -346,7 +429,7 @@ void test_info_check_no_hmem_support_when_not_requested() struct fi_info *hints, *info = NULL; int err; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); err = fi_getinfo(FI_VERSION(1,6), NULL, NULL, 0, hints, &info); assert_int_equal(err, 0); @@ -384,7 +467,7 @@ void test_use_device_rdma( const int env_val, unsetenv("FI_EFA_USE_DEVICE_RDMA"); } - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); ret = fi_getinfo(api_version, NULL, NULL, 0ULL, hints, &info); assert_int_equal(ret, 0); @@ -448,7 +531,7 @@ static int get_first_nic_name(char **name) { char *nic_name = NULL; struct fi_info *hints, *info; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, hints, &info); fi_freeinfo(hints); if (ret) @@ -483,7 +566,7 @@ static void test_efa_nic_selection(const char *filter, const char *expect_first_ struct fi_info *hints, *info; efa_env.iface = (char *) filter; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, hints, &info); fi_freeinfo(hints); if (expect_first_name) { diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index ee97098d001..d05ded33e0f 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -88,6 +88,12 @@ void efa_mock_ibv_wr_send_verify_handshake_pkt_local_host_id_and_save_wr(struct return efa_mock_ibv_wr_send_save_wr(qp); } +void efa_mock_ibv_wr_send_imm_save_wr(struct ibv_qp_ex *qp, __be32 imm_data) +{ + g_ibv_submitted_wr_id_vec[g_ibv_submitted_wr_id_cnt] = (void *)qp->wr_id; + g_ibv_submitted_wr_id_cnt++; +} + void efa_mock_ibv_wr_set_inline_data_list_no_op(struct ibv_qp_ex *qp, size_t num_buf, const struct ibv_data_buf *buf_list) @@ -182,6 +188,11 @@ uint32_t efa_mock_ibv_read_wc_flags_return_mock(struct ibv_cq_ex *current) return mock(); } +bool efa_mock_efadv_wc_is_unsolicited(struct efadv_cq *efadv_cq) +{ + return mock(); +} + int g_ofi_copy_from_hmem_iov_call_counter; ssize_t efa_mock_ofi_copy_from_hmem_iov_inc_counter(void *dest, size_t size, enum fi_hmem_iface hmem_iface, uint64_t device, 
@@ -197,6 +208,32 @@ int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope) return mock(); } +bool efa_mock_efa_device_support_unsolicited_write_recv() +{ + return mock(); +} + +int efa_mock_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + return mock(); +} + +void efa_mock_ibv_wr_rdma_read_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr) +{ + g_ibv_submitted_wr_id_vec[g_ibv_submitted_wr_id_cnt] = (void *)qp->wr_id; + g_ibv_submitted_wr_id_cnt++; +} + +void efa_mock_ibv_wr_rdma_write_imm_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, + __be32 imm_data) +{ + g_ibv_submitted_wr_id_vec[g_ibv_submitted_wr_id_cnt] = (void *) qp->wr_id; + g_ibv_submitted_wr_id_cnt++; +} + struct efa_unit_test_mocks g_efa_unit_test_mocks = { .local_host_id = 0, .peer_host_id = 0, @@ -213,6 +250,7 @@ struct efa_unit_test_mocks g_efa_unit_test_mocks = { #endif .ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov, .efa_rdm_pke_read = __real_efa_rdm_pke_read, + .efa_device_support_unsolicited_write_recv = __real_efa_device_support_unsolicited_write_recv, .ibv_is_fork_initialized = __real_ibv_is_fork_initialized, #if HAVE_EFADV_QUERY_MR .efadv_query_mr = __real_efadv_query_mr, @@ -347,6 +385,11 @@ int __wrap_efa_rdm_pke_read(struct efa_rdm_ope *ope) return g_efa_unit_test_mocks.efa_rdm_pke_read(ope); } +bool __wrap_efa_device_support_unsolicited_write_recv(void) +{ + return g_efa_unit_test_mocks.efa_device_support_unsolicited_write_recv(); +} + enum ibv_fork_status __wrap_ibv_is_fork_initialized(void) { return g_efa_unit_test_mocks.ibv_is_fork_initialized(); diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index ec9af71b7ec..3c256a24075 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -72,6 +72,10 @@ uint32_t efa_mock_ibv_read_qp_num_return_mock(struct ibv_cq_ex *current); uint32_t efa_mock_ibv_read_wc_flags_return_mock(struct ibv_cq_ex *current); +bool efa_mock_efadv_wc_is_unsolicited(struct efadv_cq *efadv_cq); + +void efa_mock_ibv_wr_send_imm_save_wr(struct ibv_qp_ex *qp, __be32 imm_data); + ssize_t __real_ofi_copy_from_hmem_iov(void *dest, size_t size, enum fi_hmem_iface hmem_iface, uint64_t device, const struct iovec *hmem_iov, @@ -85,8 +89,22 @@ ssize_t efa_mock_ofi_copy_from_hmem_iov_inc_counter(void *dest, size_t size, int __real_efa_rdm_pke_read(struct efa_rdm_ope *ope); +bool __real_efa_device_support_unsolicited_write_recv(); + int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope); +bool efa_mock_efa_device_support_unsolicited_write_recv(void); + +int efa_mock_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +void efa_mock_ibv_wr_rdma_read_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); + +void efa_mock_ibv_wr_rdma_write_imm_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, + __be32 imm_data); + struct efa_unit_test_mocks { uint64_t local_host_id; @@ -118,6 +136,8 @@ struct efa_unit_test_mocks int (*efa_rdm_pke_read)(struct efa_rdm_ope *ope); + bool (*efa_device_support_unsolicited_write_recv)(void); + enum ibv_fork_status (*ibv_is_fork_initialized)(void); #if HAVE_EFADV_QUERY_MR diff --git a/prov/efa/test/efa_unit_test_mr.c b/prov/efa/test/efa_unit_test_mr.c index 71ccb8e7a35..5516d4f325e 100644 --- a/prov/efa/test/efa_unit_test_mr.c +++ b/prov/efa/test/efa_unit_test_mr.c @@ -11,7 +11,7 @@ void test_efa_mr_reg_counters(struct 
efa_resource **state) char *buf; struct fid_mr *mr; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); assert_true(efa_domain->ibv_mr_reg_ct == 0); diff --git a/prov/efa/test/efa_unit_test_msg.c b/prov/efa/test/efa_unit_test_msg.c new file mode 100644 index 00000000000..b0df253fbeb --- /dev/null +++ b/prov/efa/test/efa_unit_test_msg.c @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All + * rights reserved. */ + +#include "efa_unit_tests.h" +#include "ofi_util.h" + + +static void test_efa_msg_recv_prep(struct efa_resource *resource, + fi_addr_t *addr) +{ + struct ibv_qp *ibv_qp; + struct efa_ep_addr raw_addr; + struct efa_base_ep *base_ep; + size_t raw_addr_len = sizeof(raw_addr); + int ret; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_qp = base_ep->qp->ibv_qp; + ibv_qp->context->ops.post_recv = &efa_mock_ibv_post_recv; + will_return(efa_mock_ibv_post_recv, 0); + + ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, + NULL /* context */); + assert_int_equal(ret, 1); +} + +void test_efa_msg_fi_recv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + void *desc; + + test_efa_msg_recv_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(send_buff.mr); + + ret = fi_recv(resource->ep, send_buff.buff, send_buff.size, desc, addr, + NULL /* context */); + assert_int_equal(ret, 0); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_recvv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct iovec iov; + fi_addr_t addr; + int ret; + void *desc; + + test_efa_msg_recv_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + + ret = fi_recvv(resource->ep, &iov, &desc, 1, addr, NULL /* context */); + assert_int_equal(ret, 0); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_recvmsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + void *desc; + struct iovec iov; + struct fi_msg msg = {0}; + + test_efa_msg_recv_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + efa_unit_test_construct_msg(&msg, &iov, 1, addr, NULL, 0, &desc); + + ret = fi_recvmsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + + efa_unit_test_buff_destruct(&send_buff); +} + +static void test_efa_msg_send_prep(struct efa_resource *resource, + fi_addr_t *addr) +{ + struct ibv_qp_ex *ibv_qpx; + struct efa_ep_addr raw_addr; + struct efa_base_ep *base_ep; + size_t raw_addr_len = sizeof(raw_addr); + int ret; + + 
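/* construct an efa-direct RDM endpoint first; the wr_* mocks installed below are set on its ibv qp */ + 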
efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_qpx = base_ep->qp->ibv_qp_ex; + + ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, + NULL /* context */); + assert_int_equal(ret, 1); + + ibv_qpx->wr_start = &efa_mock_ibv_wr_start_no_op; + /* this mock will save the send work request (wr) in a global list */ + ibv_qpx->wr_send = &efa_mock_ibv_wr_send_save_wr; + ibv_qpx->wr_send_imm = &efa_mock_ibv_wr_send_imm_save_wr; + ibv_qpx->wr_set_inline_data_list = &efa_mock_ibv_wr_set_inline_data_list_no_op; + ibv_qpx->wr_set_sge_list = &efa_mock_ibv_wr_set_sge_list_no_op; + ibv_qpx->wr_set_ud_addr = &efa_mock_ibv_wr_set_ud_addr_no_op; + ibv_qpx->wr_complete = &efa_mock_ibv_wr_complete_no_op; +} + +void test_efa_msg_fi_send(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + void *desc; + int ret; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(send_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_send(resource->ep, send_buff.buff, send_buff.size, desc, addr, + NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_sendv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + struct iovec iov; + void *desc; + int ret; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_sendv(resource->ep, &iov, &desc, 1, addr, NULL); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_sendmsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + struct iovec iov; + void *desc; + int ret; + struct fi_msg msg = {0}; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + + efa_unit_test_construct_msg(&msg, &iov, 1, addr, NULL, 0, &desc); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_sendmsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_senddata(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + void *desc; + int ret; + uint64_t data = 0x1234567890ABCDEF; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(send_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_senddata(resource->ep, send_buff.buff, send_buff.size, desc, + data, addr, NULL); + assert_int_equal(ret, 
0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_inject(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 32); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_inject(resource->ep, send_buff.buff, send_buff.size, addr); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_injectdata(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + uint64_t data = 0x1234567890ABCDEF; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 32); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_injectdata(resource->ep, send_buff.buff, send_buff.size, data, + addr); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} diff --git a/prov/efa/test/efa_unit_test_ope.c b/prov/efa/test/efa_unit_test_ope.c index d5229cbcc18..701e2bb8c68 100644 --- a/prov/efa/test/efa_unit_test_ope.c +++ b/prov/efa/test/efa_unit_test_ope.c @@ -65,7 +65,7 @@ void test_efa_rdm_ope_prepare_to_post_send_with_no_enough_tx_pkts(struct efa_res struct efa_resource *resource = *state; struct efa_rdm_ep *efa_rdm_ep; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->efa_outstanding_tx_ops = efa_rdm_ep->efa_max_outstanding_tx_ops - 1; @@ -88,7 +88,7 @@ void test_efa_rdm_ope_prepare_to_post_send_host_memory(struct efa_resource **sta int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* data size should be aligned and evenly distributed. * alignment for host memory is 8 byte by default. 
@@ -137,7 +137,7 @@ void test_efa_rdm_ope_prepare_to_post_send_host_memory_align128(struct efa_resou int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = true; @@ -186,7 +186,7 @@ void test_efa_rdm_ope_prepare_to_post_send_cuda_memory(struct efa_resource **sta int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* default alignment of cuda memory is 64 bytes */ msg_length = 12000; @@ -211,7 +211,7 @@ void test_efa_rdm_ope_prepare_to_post_send_cuda_memory_align128(struct efa_resou int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = true; @@ -243,7 +243,7 @@ void test_efa_rdm_ope_post_write_0_byte(struct efa_resource **state) fi_addr_t addr; int ret, err; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); @@ -314,7 +314,7 @@ void test_efa_rdm_rxe_post_local_read_or_queue_cleanup_txe(struct efa_resource * */ g_efa_unit_test_mocks.efa_rdm_pke_read = &efa_mock_efa_rdm_pke_read_return_mock; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); diff --git a/prov/efa/test/efa_unit_test_pke.c b/prov/efa/test/efa_unit_test_pke.c index d52ccf76cc3..e7fda0365a1 100644 --- a/prov/efa/test/efa_unit_test_pke.c +++ b/prov/efa/test/efa_unit_test_pke.c @@ -24,7 +24,7 @@ void test_efa_rdm_pke_handle_longcts_rtm_send_completion(struct efa_resource **s int err, numaddr; struct efa_rdm_ope *txe; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); diff --git a/prov/efa/test/efa_unit_test_rdm_peer.c b/prov/efa/test/efa_unit_test_rdm_peer.c index 1170ef9b999..da909ed4905 100644 --- a/prov/efa/test/efa_unit_test_rdm_peer.c +++ b/prov/efa/test/efa_unit_test_rdm_peer.c @@ -81,7 +81,7 @@ void test_efa_rdm_peer_reorder_expected_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 0; exp_msg_id = 0; @@ -96,7 +96,7 @@ void test_efa_rdm_peer_reorder_smaller_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 1; exp_msg_id = 10; @@ -110,7 +110,7 @@ void test_efa_rdm_peer_reorder_larger_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - 
efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 10; exp_msg_id = 0; @@ -125,7 +125,7 @@ void test_efa_rdm_peer_reorder_overflow_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 16384; exp_msg_id = 0; @@ -192,7 +192,7 @@ void test_efa_rdm_peer_move_overflow_pke_to_recvwin(struct efa_resource **state) struct efa_rdm_peer *peer; struct efa_rdm_pke *pkt_entry; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* overflow_pke_list has a pkt entry with msg_id 18000. * After calling efa_rdm_peer_move_overflow_pke_to_recvwin when exp_msg_id = 16384, @@ -213,7 +213,7 @@ void test_efa_rdm_peer_keep_pke_in_overflow_list(struct efa_resource **state) { struct efa_rdm_peer_overflow_pke_list_entry *overflow_pke_list_entry; struct dlist_entry *tmp; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* overflow_pke_list has a pkt entry with msg_id 33000. * After calling efa_rdm_peer_move_overflow_pke_to_recvwin when exp_msg_id = 16384, @@ -269,7 +269,7 @@ void test_efa_rdm_peer_append_overflow_pke_to_recvwin(struct efa_resource **stat struct efa_rdm_ep *efa_rdm_ep; int ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); diff --git a/prov/efa/test/efa_unit_test_rma.c b/prov/efa/test/efa_unit_test_rma.c new file mode 100644 index 00000000000..fd5818657ba --- /dev/null +++ b/prov/efa/test/efa_unit_test_rma.c @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All + * rights reserved. 
*/ + +#include "efa_unit_tests.h" +#include "ofi_util.h" + +extern struct fi_ops_rma efa_rma_ops; + +static void test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) +{ + struct ibv_qp_ex *ibv_qpx; + struct efa_ep_addr raw_addr; + struct efa_base_ep *base_ep; + size_t raw_addr_len = sizeof(raw_addr); + int ret; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + /* Add rma caps explicitly to ep->info to allow local testing */ + base_ep->info->caps |= FI_RMA; + ibv_qpx = base_ep->qp->ibv_qp_ex; + ibv_qpx->wr_start = &efa_mock_ibv_wr_start_no_op; + /* these mocks save the work request (wr) id in a global list */ + ibv_qpx->wr_rdma_read = &efa_mock_ibv_wr_rdma_read_save_wr; + ibv_qpx->wr_rdma_write = &efa_mock_ibv_wr_rdma_write_save_wr; + ibv_qpx->wr_rdma_write_imm = &efa_mock_ibv_wr_rdma_write_imm_save_wr; + ibv_qpx->wr_set_sge_list = &efa_mock_ibv_wr_set_sge_list_no_op; + ibv_qpx->wr_set_ud_addr = &efa_mock_ibv_wr_set_ud_addr_no_op; + ibv_qpx->wr_complete = &efa_mock_ibv_wr_complete_no_op; + + ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, + NULL /* context */); + assert_int_equal(ret, 1); +} + +void test_efa_rma_read(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t src_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &src_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_read(resource->ep, local_buff.buff, local_buff.size, desc, + src_addr, remote_addr, remote_key, NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_readv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + fi_addr_t src_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &src_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_readv(resource->ep, &iov, &desc, 1, src_addr, remote_addr, + remote_key, NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_readmsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + struct fi_msg_rma msg = {0}; + struct fi_rma_iov rma_iov; + fi_addr_t src_addr; + void *desc; + int ret; + + test_efa_rma_prep(resource, &src_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, &desc, 1, src_addr, + 
&rma_iov, 1, NULL, 0); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_readmsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_write(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t dest_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_write(resource->ep, local_buff.buff, local_buff.size, desc, + dest_addr, remote_addr, remote_key, NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writev(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + fi_addr_t dest_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_writev(resource->ep, &iov, &desc, 1, dest_addr, remote_addr, + remote_key, NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writemsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + struct fi_msg_rma msg = {0}; + struct fi_rma_iov rma_iov; + fi_addr_t dest_addr; + void *desc; + int ret; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, &desc, 1, dest_addr, &rma_iov, + 1, NULL, 0); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_writemsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writedata(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t dest_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_writedata(resource->ep, local_buff.buff, local_buff.size, desc, + 0, dest_addr, remote_addr, remote_key, + NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_inject_write(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + 
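/* remote_addr and remote_key below are arbitrary test values; the inject call is expected to fail with -FI_ENOSYS before any transfer */ + 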
fi_addr_t dest_addr; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + + ret = fi_inject_write(resource->ep, local_buff.buff, local_buff.size, + dest_addr, remote_addr, remote_key); + assert_int_equal(ret, -FI_ENOSYS); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_inject_writedata(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t dest_addr; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + + ret = fi_inject_writedata(resource->ep, local_buff.buff, + local_buff.size, 0, dest_addr, remote_addr, + remote_key); + assert_int_equal(ret, -FI_ENOSYS); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writemsg_with_inject(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + struct fi_msg_rma msg = {0}; + struct fi_rma_iov rma_iov; + fi_addr_t dest_addr; + void *desc; + int ret; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, &desc, 1, dest_addr, &rma_iov, + 1, NULL, 0); + + ret = fi_writemsg(resource->ep, &msg, FI_INJECT); + assert_int_equal(ret, -FI_ENOSYS); + + efa_unit_test_buff_destruct(&local_buff); +} diff --git a/prov/efa/test/efa_unit_test_rnr.c b/prov/efa/test/efa_unit_test_rnr.c index 411cc030dd2..bca4dd627b8 100644 --- a/prov/efa/test/efa_unit_test_rnr.c +++ b/prov/efa/test/efa_unit_test_rnr.c @@ -21,7 +21,8 @@ void test_efa_rnr_queue_and_resend(struct efa_resource **state) fi_addr_t peer_addr; int ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); /* Create and register a fake peer */ ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -40,13 +41,6 @@ void test_efa_rnr_queue_and_resend(struct efa_resource **state) efa_rdm_ep->base_ep.qp->ibv_qp_ex->wr_complete = &efa_mock_ibv_wr_complete_no_op; assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(ret, 0); - efa_rdm_ep->shm_ep = NULL; - } - ret = fi_send(resource->ep, send_buff.buff, send_buff.size, fi_mr_desc(send_buff.mr), peer_addr, NULL /* context */); assert_int_equal(ret, 0); assert_false(dlist_empty(&efa_rdm_ep->txe_list)); diff --git a/prov/efa/test/efa_unit_test_runt.c b/prov/efa/test/efa_unit_test_runt.c index ab7537061c0..5a49d0775ac 100644 --- a/prov/efa/test/efa_unit_test_runt.c +++ b/prov/efa/test/efa_unit_test_runt.c @@ -27,12 +27,10 @@ void test_efa_rdm_peer_get_runt_size_impl( struct efa_mr mock_mr; struct efa_rdm_ope mock_txe; size_t runt_size; - struct efa_domain *efa_domain; int ret; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, 
base_ep.util_ep.ep_fid); - efa_domain = efa_rdm_ep_domain(efa_rdm_ep); - efa_domain->hmem_info[iface].runt_size = total_runt_size; + g_efa_hmem_info[iface].runt_size = total_runt_size; /* insert a fake peer */ ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -63,7 +61,7 @@ void test_efa_rdm_peer_get_runt_size_no_enough_runt(struct efa_resource **state) size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1001; @@ -81,7 +79,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_smaller_than_alignment(struct e size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; @@ -99,7 +97,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_exceeding_total_len(struct efa_ size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 0; @@ -117,7 +115,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_normal(struct efa_resource **st size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 10000; @@ -137,7 +135,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_128_multiple_alignment(struct e size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -160,7 +158,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_non_128_multiple_alignment(stru size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -183,7 +181,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_smaller_than_128_alignment(stru size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -204,7 +202,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_exceeding_total_len_128_alignme size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -224,7 +222,7 @@ void test_efa_rdm_peer_get_runt_size_host_memory_smaller_than_alignment(struct e size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - 
efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; @@ -242,7 +240,7 @@ void test_efa_rdm_peer_get_runt_size_host_memory_exceeding_total_len(struct efa_ size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 1111; peer_num_runt_bytes_in_flight = 0; @@ -260,7 +258,7 @@ void test_efa_rdm_peer_get_runt_size_host_memory_normal(struct efa_resource **st size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 10000; @@ -296,13 +294,11 @@ void test_efa_rdm_peer_select_readbase_rtm_impl( fi_addr_t addr; struct efa_mr mock_mr; struct efa_rdm_ope mock_txe; - struct efa_domain *efa_domain; int readbase_rtm; int ret; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_domain = efa_rdm_ep_domain(efa_rdm_ep); - efa_domain->hmem_info[iface].runt_size = total_runt_size; + g_efa_hmem_info[iface].runt_size = total_runt_size; /* insert a fake peer */ ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -334,7 +330,7 @@ void test_efa_rdm_peer_select_readbase_rtm_no_runt(struct efa_resource **state) size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; @@ -351,7 +347,7 @@ void test_efa_rdm_peer_select_readbase_rtm_do_runt(struct efa_resource **state) size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; diff --git a/prov/efa/test/efa_unit_test_send.c b/prov/efa/test/efa_unit_test_send.c index b3ed1a7873c..3b811e12222 100644 --- a/prov/efa/test/efa_unit_test_send.c +++ b/prov/efa/test/efa_unit_test_send.c @@ -20,7 +20,7 @@ void test_efa_rdm_msg_send_to_local_peer_with_null_desc(struct efa_resource **st struct fi_msg msg = {0}; struct fi_msg_tagged tmsg = {0}; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); diff --git a/prov/efa/test/efa_unit_test_srx.c b/prov/efa/test/efa_unit_test_srx.c index 733faa67d57..57ce6402b70 100644 --- a/prov/efa/test/efa_unit_test_srx.c +++ b/prov/efa/test/efa_unit_test_srx.c @@ -18,21 +18,19 @@ void test_efa_srx_min_multi_recv_size(struct efa_resource **state) struct util_srx_ctx *srx_ctx; size_t min_multi_recv_size_new; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); - /* - * After ep is enabled, the srx->min_multi_recv_size should be - * exactly the same with ep->min_multi_recv_size - */ - assert_true(efa_rdm_ep->min_multi_recv_size == srx_ctx->min_multi_recv_size); /* Set a new 
min_multi_recv_size via setopt*/ min_multi_recv_size_new = 1024; assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, &min_multi_recv_size_new, sizeof(min_multi_recv_size_new)), 0); + /* Enable EP */ + assert_int_equal(fi_enable(resource->ep), FI_SUCCESS); + /* Check whether srx->min_multi_recv_size is set correctly */ + srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); assert_true(srx_ctx->min_multi_recv_size == min_multi_recv_size_new); } @@ -44,7 +42,7 @@ void test_efa_srx_cq(struct efa_resource **state) struct efa_rdm_ep *efa_rdm_ep; struct util_srx_ctx *srx_ctx; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); @@ -59,7 +57,7 @@ void test_efa_srx_lock(struct efa_resource **state) struct util_srx_ctx *srx_ctx; struct efa_domain *efa_domain; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 883130d2320..93991120fd4 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -61,6 +61,7 @@ static int efa_unit_test_mocks_teardown(void **state) #endif .ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov, .efa_rdm_pke_read = __real_efa_rdm_pke_read, + .efa_device_support_unsolicited_write_recv = __real_efa_device_support_unsolicited_write_recv, .ibv_is_fork_initialized = __real_ibv_is_fork_initialized, }; @@ -105,22 +106,31 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_atomic_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_disabled, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_disable_p2p_zcpy_rx_happy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_disable_p2p_zcpy_rx_disabled, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_close_discard_posted_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_eagain, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size, 
efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_support_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_default_sizes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unreachable_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_message_too_long, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_status, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_unsolicited_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_solicited_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_recover_forgotten_peer_ah, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_ignore_removed_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_fallback_to_ibv_create_cq_ex_cq_read_ignore_forgotton_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -129,6 +139,9 @@ int main(void) cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_max_order_size_dgram_with_atomic, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_max_order_size_rdm_with_atomic_no_order, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_max_order_size_rdm_with_atomic_order, efa_unit_test_mocks_setup, 
efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_op_flags_rdm, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_size_rdm, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_check_shm_info_hmem, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -189,7 +202,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_cq_post_initial_rx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_cntr_post_initial_rx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_post_initial_rx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_reorder_expected_msg_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_reorder_smaller_msg_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_reorder_larger_msg_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -198,6 +211,39 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_peer_keep_pke_in_overflow_list, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_append_overflow_pke_to_recvwin, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_pke_handle_longcts_rtm_send_completion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_recvv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_recvmsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_send, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_sendv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_sendmsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_senddata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_inject, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_injectdata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_read, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_readv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_readmsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_write, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writev, efa_unit_test_mocks_setup, 
efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writemsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_inject_write, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_inject_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writemsg_with_inject, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_send_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_recv_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_send_failure, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_recv_failure, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_open, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_getopt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_use_device_rdma, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_hmem_p2p, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_rnr_retry, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_sizes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_bind_and_enable, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), }; cmocka_set_message_output(CM_OUTPUT_XML); diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 0182f135569..bfe0b4c0aee 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -22,6 +22,9 @@ extern struct efa_mock_ibv_send_wr_list g_ibv_send_wr_list; extern struct efa_unit_test_mocks g_efa_unit_test_mocks; extern struct efa_env efa_env; +#define EFA_DIRECT_PROV_NAME "efa-direct" +#define EFA_PROV_NAME "efa" + struct efa_resource { struct fi_info *hints; struct fi_info *info; @@ -33,17 +36,19 @@ struct efa_resource { struct fid_cq *cq; }; -struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type); +struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type, char *prov_name); -void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type); +void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name); void efa_unit_test_resource_construct_ep_not_enabled( - struct efa_resource *resource, enum fi_ep_type ep_type); + struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name); void 
efa_unit_test_resource_construct_no_cq_and_ep_not_enabled( - struct efa_resource *resource, enum fi_ep_type ep_type); + struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name); void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, uint32_t fi_version, struct fi_info *hints, - bool enable_ep, bool open_cq); + bool enable_ep, bool open_cq, char *prov_name); + +void efa_unit_test_resource_construct_rdm_shm_disabled(struct efa_resource *resource); void efa_unit_test_resource_destruct(struct efa_resource *resource); @@ -58,6 +63,12 @@ void efa_unit_test_construct_tmsg(struct fi_msg_tagged *tmsg, struct iovec *iov, void **desc, uint64_t tag, uint64_t ignore); +void efa_unit_test_construct_msg_rma(struct fi_msg_rma *msg, struct iovec *iov, + void **desc, size_t iov_count, + fi_addr_t addr, struct fi_rma_iov *rma_iov, + size_t rma_iov_count, void *context, + uint64_t data); + void new_temp_file(char *template, size_t len); struct efa_unit_test_buff { @@ -118,22 +129,31 @@ void test_efa_rdm_ep_setopt_shared_memory_permitted(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(); void test_efa_rdm_ep_user_zcpy_rx_disabled(); -void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_happy(); +void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_disabled(); void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(); void test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy(); void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local(); void test_efa_rdm_ep_close_discard_posted_recv(); void test_efa_rdm_ep_zcpy_recv_cancel(); +void test_efa_rdm_ep_zcpy_recv_eagain(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); +void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(); +void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(); +void test_efa_rdm_ep_support_unsolicited_write_recv(); +void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(); +void test_efa_rdm_ep_default_sizes(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll(); void test_rdm_cq_create_error_handling(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id(); +void test_rdm_cq_read_bad_send_status_unreachable_receiver(); void test_rdm_cq_read_bad_send_status_invalid_qpn(); void test_rdm_cq_read_bad_send_status_message_too_long(); void test_ibv_cq_ex_read_bad_recv_status(); +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_unsolicited_recv(); +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_solicited_recv(); void test_ibv_cq_ex_read_recover_forgotten_peer_ah(); void test_rdm_fallback_to_ibv_create_cq_ex_cq_read_ignore_forgotton_peer(); void test_ibv_cq_ex_read_ignore_removed_peer(); @@ -142,6 +162,9 @@ void test_info_tx_rx_msg_order_rdm_order_none(); void test_info_tx_rx_msg_order_rdm_order_sas(); void test_info_tx_rx_msg_order_dgram_order_none(); void test_info_tx_rx_msg_order_dgram_order_sas(); +void test_info_max_order_size_dgram_with_atomic(); +void test_info_max_order_size_rdm_with_atomic_no_order(); +void test_info_max_order_size_rdm_with_atomic_order(); void test_info_tx_rx_op_flags_rdm(); void test_info_tx_rx_size_rdm(); void test_info_check_shm_info_hmem(); @@ -202,7 +225,7 @@ void test_efa_rdm_cq_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); void test_efa_rdm_cq_post_initial_rx_pkts(); 
void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(); void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); -void test_efa_cntr_post_initial_rx_pkts(); +void test_efa_rdm_cntr_post_initial_rx_pkts(); void test_efa_rdm_peer_reorder_expected_msg_id(); void test_efa_rdm_peer_reorder_smaller_msg_id(); void test_efa_rdm_peer_reorder_larger_msg_id(); @@ -211,6 +234,39 @@ void test_efa_rdm_peer_move_overflow_pke_to_recvwin(); void test_efa_rdm_peer_keep_pke_in_overflow_list(); void test_efa_rdm_peer_append_overflow_pke_to_recvwin(); void test_efa_rdm_pke_handle_longcts_rtm_send_completion(); +void test_efa_msg_fi_recv(); +void test_efa_msg_fi_recvv(); +void test_efa_msg_fi_recvmsg(); +void test_efa_msg_fi_send(); +void test_efa_msg_fi_sendv(); +void test_efa_msg_fi_sendmsg(); +void test_efa_msg_fi_senddata(); +void test_efa_msg_fi_inject(); +void test_efa_msg_fi_injectdata(); +void test_efa_rma_read(); +void test_efa_rma_readv(); +void test_efa_rma_readmsg(); +void test_efa_rma_write(); +void test_efa_rma_writev(); +void test_efa_rma_writemsg(); +void test_efa_rma_writedata(); +void test_efa_rma_inject_write(); +void test_efa_rma_inject_writedata(); +void test_efa_rma_writemsg_with_inject(); +void test_efa_cq_read_send_success(); +void test_efa_cq_read_recv_success(); +void test_efa_cq_read_send_failure(); +void test_efa_cq_read_recv_failure(); +void test_efa_ep_open(); +void test_efa_ep_cancel(); +void test_efa_ep_getopt(); +void test_efa_ep_setopt_use_device_rdma(); +void test_efa_ep_setopt_hmem_p2p(); +void test_efa_ep_setopt_rnr_retry(); +void test_efa_ep_setopt_sizes(); +void test_efa_ep_bind_and_enable(); +void test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(); +void test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); static inline int efa_unit_test_get_dlist_length(struct dlist_entry *head) diff --git a/prov/hook/hook_hmem/src/hook_hmem.c b/prov/hook/hook_hmem/src/hook_hmem.c index 0a9a3c2b84d..44daa4fb621 100644 --- a/prov/hook/hook_hmem/src/hook_hmem.c +++ b/prov/hook/hook_hmem/src/hook_hmem.c @@ -1914,7 +1914,7 @@ static int hook_hmem_domain(struct fid_fabric *fabric, struct fi_info *info, HOOK_HMEM_INI { -#ifdef HAVE_HOOK_HMEM_DL +#if HAVE_HOOK_HMEM_DL ofi_hmem_init(); #endif hook_hmem_fabric_ops = hook_fabric_ops; diff --git a/prov/hook/trace/src/hook_trace.c b/prov/hook/trace/src/hook_trace.c index 5813d47bce3..b5cdce4fb4f 100644 --- a/prov/hook/trace/src/hook_trace.c +++ b/prov/hook/trace/src/hook_trace.c @@ -262,6 +262,8 @@ static void hook_trace_prof_init(void *context) fi_tostr_r(buf,len, info->tx_attr, FI_TYPE_TX_ATTR)); \ FI_TRACE(dom->fabric->hprov, FI_LOG_DOMAIN, "\n%s", \ fi_tostr_r(buf,len, info->rx_attr, FI_TYPE_RX_ATTR)); \ + FI_TRACE(dom->fabric->hprov, FI_LOG_DOMAIN, "\n%s", \ + fi_tostr_r(buf,len, info->domain_attr, FI_TYPE_DOMAIN_ATTR)); \ } while (0); typedef void (*trace_cq_entry_fn)(const struct fi_provider *prov, diff --git a/prov/lnx/Makefile.include b/prov/lnx/Makefile.include new file mode 100644 index 00000000000..cd23049e845 --- /dev/null +++ b/prov/lnx/Makefile.include @@ -0,0 +1,61 @@ +# +# Copyright (c) 2022 ORNL. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + + +if HAVE_LNX +_lnx_files = \ + prov/lnx/src/lnx_cq.c \ + prov/lnx/src/lnx_domain.c \ + prov/lnx/src/lnx_ep.c \ + prov/lnx/src/lnx_init.c \ + prov/lnx/src/lnx_ops.c \ + prov/lnx/src/lnx_av.c + +_lnx_headers = \ + prov/lnx/include/lnx.h + +if HAVE_LNX_DL +pkglib_LTLIBRARIES += liblnx-fi.la +liblnx_fi_la_SOURCES = $(_lnx_files) $(_lnx_headers) +liblnx_fi_la_LIBADD = $(linkback) $(lnx_LIBS) +liblnx_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic +liblnx_fi_la_DEPENDENCIES = $(linkback) +else +src_libfabric_la_SOURCES += $(_lnx_files) $(_lnx_headers) +src_libfabric_la_CPPFLAGS += -I$(top_srcdir)/prov/lnx/include +endif + +prov_install_man_pages += man/man7/fi_lnx.7 + +endif HAVE_LNX + +prov_dist_man_pages += man/man7/fi_lnx.7 diff --git a/prov/lnx/configure.m4 b/prov/lnx/configure.m4 new file mode 100644 index 00000000000..737b62bc46d --- /dev/null +++ b/prov/lnx/configure.m4 @@ -0,0 +1,15 @@ +dnl Configury specific to the libfabric lnx provider + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl +AC_DEFUN([FI_LNX_CONFIGURE],[ + # Determine if we can support the lnx provider + lnx_happy=0 + AS_IF([test x"$enable_lnx" != x"no"], [lnx_happy=1]) + AS_IF([test $lnx_happy -eq 1], [$1], [$2]) +]) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h new file mode 100644 index 00000000000..3d6506891e4 --- /dev/null +++ b/prov/lnx/include/lnx.h @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LNX_H +#define LNX_H + +#define LNX_MAX_LOCAL_EPS 16 +#define LNX_IOV_LIMIT 4 + +#define lnx_ep_rx_flags(lnx_ep) ((lnx_ep)->le_ep.rx_op_flags) + +struct local_prov_ep; + +struct lnx_match_attr { + fi_addr_t lm_addr; + uint64_t lm_tag; + uint64_t lm_ignore; + struct lnx_peer *lm_peer; + struct local_prov_ep *lm_cep; +}; + +struct lnx_peer_cq { + struct lnx_cq *lpc_shared_cq; + struct fid_peer_cq lpc_cq; + struct fid_cq *lpc_core_cq; +}; + +struct lnx_queue { + struct dlist_entry lq_queue; + dlist_func_t *lq_match_func; + ofi_spin_t lq_qlock; +}; + +struct lnx_qpair { + struct lnx_queue lqp_recvq; + struct lnx_queue lqp_unexq; +}; + +struct lnx_peer_srq { + struct lnx_qpair lps_trecv; + struct lnx_qpair lps_recv; +}; + +struct local_prov_ep { + struct dlist_entry entry; + bool lpe_local; + char lpe_fabric_name[FI_NAME_MAX]; + struct fid_fabric *lpe_fabric; + struct fid_domain *lpe_domain; + struct fid_ep *lpe_ep; + struct fid_ep **lpe_txc; + struct fid_ep **lpe_rxc; + struct fid_av *lpe_av; + struct fid_ep *lpe_srx_ep; + struct lnx_peer_cq lpe_cq; + struct fi_info *lpe_fi_info; + struct fid_peer_srx lpe_srx; + struct ofi_bufpool *lpe_recv_bp; + ofi_spin_t lpe_bplock; + struct local_prov *lpe_parent; +}; + +struct lnx_rx_entry { + /* the entry which will be passed to the core provider */ + struct fi_peer_rx_entry rx_entry; + /* iovec to use to point to receive buffers */ + struct iovec rx_iov[LNX_IOV_LIMIT]; + /* desc array to be used to point to the descs passed by the user */ + void *rx_desc[LNX_IOV_LIMIT]; + /* peer we expect messages from. + * This is available if the receive request provided a source address. + * Otherwise it will be NULL + */ + struct lnx_peer *rx_peer; + /* local prov endpoint receiving the message if this entry is + * added to the SUQ + */ + struct local_prov_ep *rx_cep; + /* match information which will be given to us by the core provider */ + struct fi_peer_match_attr rx_match_info; + /* ignore bit passed in by the user */ + uint64_t rx_ignore; + /* which pool this rx_entry came from. It's either from the global + * pool or some core provider pool + */ + bool rx_global; +}; + +OFI_DECLARE_FREESTACK(struct lnx_rx_entry, lnx_recv_fs); + +struct local_prov { + struct dlist_entry lpv_entry; + char lpv_prov_name[FI_NAME_MAX]; + int lpv_ep_count; + struct dlist_entry lpv_prov_eps; +}; + +struct lnx_address_prov { + char lap_prov[FI_NAME_MAX]; + /* an array of addresses of size count. 
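Each address entry is lap_addr_size bytes, so the payload spans lap_addr_count * lap_addr_size bytes in total; this is the stride next_prov() in lnx_av.c uses to walk a packed blob.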
*/ + /* entry 0 is shm if available */ + /* array can't be larger than LNX_MAX_LOCAL_EPS */ + int lap_addr_count; + /* size as specified by the provider */ + int lap_addr_size; + /* payload */ + char lap_addrs[]; +}; + +struct lnx_addresses { + /* used to determine if the address is node local or node remote */ + char la_hostname[FI_NAME_MAX]; + /* number of providers <= LNX_MAX_LOCAL_EPS */ + int la_prov_count; + struct lnx_address_prov la_addr_prov[]; +}; + +struct lnx_local2peer_map { + struct dlist_entry entry; + struct local_prov_ep *local_ep; + int addr_count; + fi_addr_t peer_addrs[LNX_MAX_LOCAL_EPS]; +}; + +struct lnx_peer_prov { + struct dlist_entry entry; + + /* provider name */ + char lpp_prov_name[FI_NAME_MAX]; + + uint64_t lpp_flags; + + /* pointer to the local endpoint information to be used for + * communication with this peer. + * + * If the peer is on-node, then lp_endpoints[0] = shm + * + * if peer is off-node, then there could be up to LNX_MAX_LOCAL_EPS + * local endpoints we can use to reach that peer. + */ + struct local_prov *lpp_prov; + + /* each peer can be reached from any of the local provider endpoints + * on any of the addresses which are given to us. It's an N:N + * relationship + */ + struct dlist_entry lpp_map; +}; + +struct lnx_peer { + /* true if peer can be reached over shared memory, false otherwise */ + bool lp_local; + fi_addr_t lp_fi_addr; + + /* Each provider that we can reach the peer on will have an entry + * below. Each entry will contain all the local provider endpoints we + * can reach the peer through, as well as all the peer addresses on that + * provider. + * + * We can potentially multi-rail between the interfaces on the same + * provider, both local and remote. + * + * Or we can multi-rail across different providers. Although this + * might be more complicated due to the differences in provider + * capabilities. 
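+ * + * A minimal sketch of the resulting layout ("corex" is a hypothetical + * off-node provider used only for illustration): + * + * on-node peer: lp_shm_prov -> lnx_peer_prov "shm" -> lpp_map -> + * lnx_local2peer_map { local_ep, peer_addrs[] } + * off-node peer: lp_provs -> lnx_peer_prov "corex" -> lpp_map -> + * lnx_local2peer_map { local_ep, peer_addrs[] }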
+ */ + struct lnx_peer_prov *lp_shm_prov; + struct dlist_entry lp_provs; +}; + +struct lnx_peer_table { + struct util_av lpt_av; + int lpt_max_count; + struct lnx_domain *lpt_domain; + /* an array of peer entries of type struct lnx_peer */ + struct ofi_bufpool *lpt_entries; +}; + +struct lnx_ctx { + struct dlist_entry ctx_head; + int ctx_idx; + struct lnx_ep *ctx_parent; + struct fid_ep ctx_ep; +}; + +struct lnx_ep { + struct util_ep le_ep; + struct dlist_entry le_tx_ctx; + struct dlist_entry le_rx_ctx; + struct lnx_domain *le_domain; + size_t le_fclass; + struct lnx_peer_table *le_peer_tbl; + struct lnx_peer_srq le_srq; +}; + +struct lnx_srx_context { + struct lnx_ep *srx_lep; + struct local_prov_ep *srx_cep; +}; + +struct lnx_mem_desc_prov { + struct local_prov *prov; + struct fid_mr *core_mr; +}; + +struct lnx_mem_desc { + struct lnx_mem_desc_prov desc[LNX_MAX_LOCAL_EPS]; + int desc_count; +}; + +struct lnx_mr { + struct ofi_mr mr; + struct lnx_mem_desc desc; +}; + +struct lnx_domain { + struct util_domain ld_domain; + struct lnx_fabric *ld_fabric; + bool ld_srx_supported; + struct ofi_mr_cache ld_mr_cache; +}; + +struct lnx_cq { + struct util_cq util_cq; + struct lnx_domain *lnx_domain; +}; + +struct lnx_fabric { + struct util_fabric util_fabric; + /* providers linked by this fabric */ + struct dlist_entry local_prov_table; + /* memory registration buffer pool */ + struct ofi_bufpool *mem_reg_bp; + /* shared memory provider used in this link */ + struct local_prov *shm_prov; + /* peers associated with this link */ + struct lnx_peer_table *lnx_peer_tbl; +}; + +extern struct util_prov lnx_util_prov; +extern struct fi_provider lnx_prov; +extern struct ofi_bufpool *global_recv_bp; +extern ofi_spin_t global_bplock; + +struct fi_info *lnx_get_link_by_dom(char *domain_name); + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info); + +int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context); +int lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab, + void *context); + +void lnx_fini(void); + +int lnx_fabric_close(struct fid *fid); + +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context); + +int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); + +struct lnx_peer * +lnx_av_lookup_addr(struct lnx_peer_table *peer_tbl, fi_addr_t addr); + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context); + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); + +int lnx_cq2ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags); + +int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_queue_msg(struct fi_peer_rx_entry *entry); +int lnx_queue_tag(struct fi_peer_rx_entry *entry); +void lnx_free_entry(struct fi_peer_rx_entry *entry); +void lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)); + +static inline +void lnx_get_core_desc(struct lnx_mem_desc *desc, void **mem_desc) +{ + if (desc && desc->desc[0].core_mr) { + if (mem_desc) + 
*mem_desc = desc->desc[0].core_mr->mem_desc; + return; + } + + *mem_desc = NULL; +} + +static inline +int lnx_create_mr(const struct iovec *iov, fi_addr_t addr, + struct lnx_domain *lnx_dom, struct ofi_mr_entry **mre) +{ + struct ofi_mr *mr; + struct fi_mr_attr attr = {}; + struct fi_mr_attr cur_abi_attr; + struct ofi_mr_info info = {}; + uint64_t flags = 0; + int rc; + + attr.iov_count = 1; + attr.mr_iov = iov; + *mre = ofi_mr_cache_find(&lnx_dom->ld_mr_cache, &attr, 0); + if (*mre) { + mr = (struct ofi_mr *)(*mre)->data; + goto out; + } + + attr.iface = ofi_get_hmem_iface(iov->iov_base, + &attr.device.reserved, &flags); + info.iov = *iov; + info.iface = attr.iface; + rc = ofi_hmem_dev_register(attr.iface, iov->iov_base, iov->iov_len, + (uint64_t *) &attr.hmem_data); + if (rc) + return rc; + + rc = ofi_mr_cache_search(&lnx_dom->ld_mr_cache, &info, mre); + if (rc) { + ofi_hmem_dev_unregister(attr.iface, (uint64_t)attr.hmem_data); + return rc; + } + + mr = (struct ofi_mr *)(*mre)->data; + ofi_mr_update_attr(lnx_dom->ld_domain.fabric->fabric_fid.api_version, + lnx_dom->ld_domain.info_domain_caps, &attr, &cur_abi_attr, 0); + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr.context; + mr->domain = &lnx_dom->ld_domain; + mr->flags = flags; + mr->iface = cur_abi_attr.iface; + mr->device = cur_abi_attr.device.reserved; + mr->hmem_data = cur_abi_attr.hmem_data; + mr->mr_fid.mem_desc = (void*) mr; + +out: + return FI_SUCCESS; +} + +static inline +int lnx_select_send_pathway(struct lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + struct ofi_mr_entry **mre, void **mem_desc, uint64_t *rkey) +{ + int idx = 0; + int rc; + struct lnx_peer_prov *prov; + struct lnx_local2peer_map *lpm; + struct ofi_mr *mr = NULL; + + if (lp->lp_local) { + prov = lp->lp_shm_prov; + } else { + prov = dlist_first_entry_or_null( + &lp->lp_provs, struct lnx_peer_prov, entry); + idx = 1; + } + + /* TODO when we support multi-rail we can have multiple maps */ + lpm = dlist_first_entry_or_null(&prov->lpp_map, + struct lnx_local2peer_map, entry); + *addr = lpm->peer_addrs[0]; + + /* TODO this will need to be expanded to handle Multi-Rail. For now + * the assumption is that local peers can be reached on shm and remote + * peers have only one interface, hence indexing on 0 and 1 + * + * If we did memory registration, then we've already figured out the + * pathway + */ + if (desc && desc->desc[idx].core_mr) { + *cep = dlist_first_entry_or_null( + &desc->desc[idx].prov->lpv_prov_eps, + struct local_prov_ep, entry); + if (mem_desc) + *mem_desc = fi_mr_desc(desc->desc[idx].core_mr); + if (rkey) + *rkey = fi_mr_key(desc->desc[idx].core_mr); + return 0; + } + + *cep = lpm->local_ep; + if (mem_desc) + *mem_desc = NULL; + + if (!lp->lp_local || !mem_desc || (mem_desc && *mem_desc) || + !iov || (iov && iov->iov_base == NULL)) + return 0; + + /* Look up the address in the cache: + * - if it's found then use the cached fid_mr + * - This will include the iface, which is really all we need + * - if it's not then lookup the iface, create the fid_mr and + * cache it. 
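+ * + * Illustrative sequence, restating the flow lnx_create_mr() above + * implements: the first send from a buffer misses ofi_mr_cache_find(), + * so ofi_hmem_dev_register() and ofi_mr_cache_search() create the + * entry and *mem_desc is taken from mr->mr_fid.mem_desc; later sends + * from the same iov hit the cache and reuse the entry.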
+ */ + rc = lnx_create_mr(iov, *addr, lnx_dom, mre); + if (!rc && mre) { + mr = (struct ofi_mr *)(*mre)->data; + *mem_desc = mr->mr_fid.mem_desc; + } + + return rc; +} + +static inline +int lnx_select_recv_pathway(struct lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + struct ofi_mr_entry **mre, void **mem_desc) +{ + /* If the src address is FI_ADDR_UNSPEC, then we'll need to trigger + * all core providers to listen for a receive, since we don't know + * which one will end up getting the message. + * + * For each core provider we're tracking, trigger the recv operation + * on it. + * + * If the src address is specified then we just need to select the + * exact core endpoint to trigger the recv on. + */ + if (!lp) + return -FI_ENOSYS; + + return lnx_select_send_pathway(lp, lnx_dom, desc, cep, addr, iov, + iov_count, mre, mem_desc, NULL); +} + +#endif /* LNX_H */ diff --git a/prov/lnx/src/lnx_av.c b/prov/lnx/src/lnx_av.c new file mode 100644 index 00000000000..60a26f1ea28 --- /dev/null +++ b/prov/lnx/src/lnx_av.c @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +struct lnx_peer * +lnx_av_lookup_addr(struct lnx_peer_table *peer_tbl, fi_addr_t addr) +{ + struct lnx_peer *entry; + + if (addr == FI_ADDR_UNSPEC) + return NULL; + + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); + + entry = ofi_bufpool_get_ibuf(peer_tbl->lpt_entries, addr); + + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); + + if (!entry) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Invalid fi_addr %#lx\n", addr); + + return entry; +} + +static int lnx_peer_av_remove(struct lnx_peer *lp) +{ + int rc, frc = 0; + struct lnx_peer_prov *lpp; + struct lnx_local2peer_map *lpm; + + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry) { + /* if this is a remote peer then we didn't insert its shm address + * into our local shm endpoint, so no need to remove it + */ + if (!strncasecmp(lpp->lpp_prov_name, "shm", 3) && + !lp->lp_local) + continue; + + /* remove these address from all local providers */ + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry) { + if (lpm->addr_count > 0) { + rc = fi_av_remove(lpm->local_ep->lpe_av, lpm->peer_addrs, + lpm->addr_count, lpp->lpp_flags); + if (rc) + frc = rc; + } + } + } + + return frc; +} + +static int lnx_peer_remove(struct lnx_peer_table *tbl, fi_addr_t addr) +{ + struct lnx_peer *lp = NULL; + int rc = 0; + + ofi_genlock_lock(&tbl->lpt_domain->ld_domain.lock); + lp = ofi_bufpool_get_ibuf(tbl->lpt_entries, addr); + if (!lp) + goto out; + + rc = lnx_peer_av_remove(lp); + + ofi_ibuf_free(lp); + +out: + ofi_genlock_unlock(&tbl->lpt_domain->ld_domain.lock); + return rc; +} + +static int lnx_cleanup_avs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_av->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static inline void lnx_free_peer_tbl(struct lnx_peer_table *peer_tbl) +{ + ofi_bufpool_destroy(peer_tbl->lpt_entries); + free(peer_tbl); +} + +int lnx_av_close(struct fid *fid) +{ + int rc; + struct local_prov *entry; + struct lnx_fabric *fabric; + struct lnx_peer_table *peer_tbl; + + peer_tbl = container_of(fid, struct lnx_peer_table, lpt_av.av_fid.fid); + fabric = peer_tbl->lpt_domain->ld_fabric; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_avs(entry); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to close av for %s\n", + entry->lpv_prov_name); + } + } + + ofi_av_close_lightweight(&peer_tbl->lpt_av); + + free(peer_tbl); + + return 0; +} + +static struct fi_ops lnx_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_av_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static int lnx_get_or_create_peer_prov(struct dlist_entry *prov_table, + struct lnx_peer *lp, char *prov_name, + struct lnx_peer_prov **lpp) +{ + bool shm = false; + struct local_prov *entry; + struct lnx_peer_prov *peer_prov; + + if (!strcmp(prov_name, "shm")) { + if (lp->lp_shm_prov) + return 
-FI_ENOENT; + shm = true; + goto insert_prov; + } + + /* check if we already have a peer provider */ + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, peer_prov, entry) { + if (!strncasecmp(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX)) { + *lpp = peer_prov; + return 0; + } + } + +insert_prov: + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + if (!strncasecmp(entry->lpv_prov_name, prov_name, FI_NAME_MAX)) { + peer_prov = calloc(sizeof(*peer_prov), 1); + if (!peer_prov) + return -FI_ENOMEM; + + dlist_init(&peer_prov->entry); + dlist_init(&peer_prov->lpp_map); + + memcpy(peer_prov->lpp_prov_name, prov_name, + FI_NAME_MAX); + + peer_prov->lpp_prov = entry; + + if (shm) + lp->lp_shm_prov = peer_prov; + else + dlist_insert_tail(&peer_prov->entry, &lp->lp_provs); + + *lpp = peer_prov; + return 0; + } + } + + return -FI_ENOENT; +} + +static inline struct lnx_address_prov * +next_prov(struct lnx_address_prov *prov) +{ + uint8_t *ptr; + + ptr = (uint8_t*) prov; + + ptr += (sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size)); + + return (struct lnx_address_prov*)ptr; +} + +static inline size_t +get_lnx_addresses_size(struct lnx_addresses *addrs) +{ + int i; + size_t s = sizeof(*addrs); + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + s += sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size); + prov = next_prov(prov); + } + + return s; +} + +static inline struct lnx_addresses * +next_peer(struct lnx_addresses *addrs) +{ + uint8_t *ptr; + + ptr = (uint8_t*)addrs + get_lnx_addresses_size(addrs); + + return (struct lnx_addresses *)ptr; +} + +static struct lnx_address_prov * +lnx_get_peer_shm_addr(struct lnx_addresses *addrs) +{ + int i; + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + if (!strcmp(prov->lap_prov, "shm")) + return prov; + prov = next_prov(prov); + } + + return NULL; +} + +static int is_local_addr(struct local_prov **shm_prov, struct lnx_addresses *la) +{ + int rc; + char hostname[FI_NAME_MAX]; + struct lnx_address_prov *lap_shm; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? or do we need a better way of + * determining if the address is local? 
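+ * + * Outcome summary, restating the checks below: + * hostname matches and a shm address is present -> 0 (node local) + * no shm address, or hostname differs -> -FI_EOPNOTSUPP + * malformed shm address count -> -FI_EPROTO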
+ */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + lap_shm = lnx_get_peer_shm_addr(la); + if (!lap_shm) + return -FI_EOPNOTSUPP; + + /* Shared memory address not provided or not local */ + if ((lap_shm->lap_addr_count == 0) || + strncasecmp(hostname, la->la_hostname, FI_NAME_MAX)) + return -FI_EOPNOTSUPP; + + /* badly formed address */ + if (*shm_prov && (lap_shm->lap_addr_count > 1 || + lap_shm->lap_addr_count < 0)) + return -FI_EPROTO; + + return 0; +} + +static void +lnx_update_msg_entries(struct lnx_qpair *qp, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_queue *q = &qp->lqp_unexq; + struct lnx_rx_entry *rx_entry; + struct dlist_entry *item; + + ofi_spin_lock(&q->lq_qlock); + dlist_foreach(&q->lq_queue, item) { + rx_entry = (struct lnx_rx_entry *) item; + if (rx_entry->rx_entry.addr == FI_ADDR_UNSPEC) + rx_entry->rx_entry.addr = get_addr(&rx_entry->rx_entry); + } + ofi_spin_unlock(&q->lq_qlock); +} + +void +lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_srx_context *ctxt; + + ctxt = (struct lnx_srx_context *) srx->ep_fid.fid.context; + + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_trecv, get_addr); + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_recv, get_addr); +} + +static int lnx_peer_map_addrs(struct dlist_entry *prov_table, + struct lnx_peer *lp, struct lnx_addresses *la, + uint64_t flags, void *context) +{ + int i, j, rc; + struct lnx_peer_prov *lpp; + struct lnx_address_prov *lap; + struct local_prov_ep *lpe; + struct dlist_entry *eps; + + lap = &la->la_addr_prov[0]; + + for (i = 0; i < la->la_prov_count; i++) { + if (lap->lap_addr_count > LNX_MAX_LOCAL_EPS) + return -FI_EPROTO; + + rc = lnx_get_or_create_peer_prov(prov_table, lp, lap->lap_prov, + &lpp); + if (rc) + return rc; + + lpp->lpp_flags = flags; + + eps = &lpp->lpp_prov->lpv_prov_eps; + dlist_foreach_container(eps, struct local_prov_ep, lpe, + entry) { + struct lnx_local2peer_map *lpm; + + /* if this is a remote peer, don't insert the shm address + * since we will never talk to that peer over shm + */ + if (!strncasecmp(lpe->lpe_fabric_name, "shm", 3) && + !lp->lp_local) + continue; + + lpm = calloc(sizeof(*lpm), 1); + if (!lpm) + return -FI_ENOMEM; + + dlist_init(&lpm->entry); + dlist_insert_tail(&lpm->entry, &lpp->lpp_map); + + lpm->local_ep = lpe; + lpm->addr_count = lap->lap_addr_count; + for (j = 0; j < LNX_MAX_LOCAL_EPS; j++) + lpm->peer_addrs[j] = FI_ADDR_NOTAVAIL; + /* fi_av_insert returns the number of addresses inserted */ + rc = fi_av_insert(lpe->lpe_av, (void*)lap->lap_addrs, + lap->lap_addr_count, + lpm->peer_addrs, flags, context); + if (rc < 0) + return rc; + + /* should only insert the number of addresses indicated */ + assert(rc == lap->lap_addr_count); + } + + lap = next_prov(lap); + } + + return 0; +} + +/* + * count: number of LNX addresses + * addr: an array of addresses + * fi_addr: an out array of fi_addr_t + * + * Each LNX address can have multiple core provider addresses. + * Check the hostname provided in each address to see if it's the same as + * me. If so, then we'll use the SHM address if available. + * + * ASSUMPTION: fi_av_insert() is called exactly once per peer. + * We're not handling multiple av_inserts on the same peer. If that + * happens then we will create multiple peer entries.
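+ * + * Illustrative blob for count == 1 (the hostname and the second + * provider name are hypothetical): + * la_hostname = "node001", la_prov_count = 2 + * la_addr_prov[0]: lap_prov = "shm", lap_addr_count = 1, lap_addrs[] + * la_addr_prov[1]: lap_prov = "corex", lap_addr_count = 1, lap_addrs[] + * For count > 1, next_peer() advances over the variable-sized payload + * to the next struct lnx_addresses.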
+ */ +int lnx_av_insert(struct fid_av *av, const void *addr, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ + int i, rc, idx; + int disable_shm = 0; + struct lnx_peer *lp; + struct dlist_entry *prov_table; + struct lnx_peer_table *peer_tbl; + struct lnx_addresses *la = (struct lnx_addresses *)addr; + + fi_param_get_bool(&lnx_prov, "disable_shm", &disable_shm); + + peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid); + prov_table = &peer_tbl->lpt_domain->ld_fabric->local_prov_table; + + /* each entry represents a separate peer */ + for (i = 0; i < count; i++) { + /* can't have more providers than LNX_MAX_LOCAL_EPS */ + if (la->la_prov_count >= LNX_MAX_LOCAL_EPS || + la->la_prov_count <= 0) + return -FI_EPROTO; + + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); + lp = ofi_ibuf_alloc(peer_tbl->lpt_entries); + if (!lp) { + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); + return -FI_ENOMEM; + } + idx = ofi_buf_index(lp); + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); + + dlist_init(&lp->lp_provs); + + rc = is_local_addr(&peer_tbl->lpt_domain->ld_fabric->shm_prov, + la); + if (!rc) { + lp->lp_local = !disable_shm; + } else if (rc == -FI_EOPNOTSUPP) { + lp->lp_local = false; + } else if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to identify address\n"); + return rc; + } + + rc = lnx_peer_map_addrs(prov_table, lp, la, flags, context); + if (rc) { + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); + ofi_ibuf_free(lp); + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); + return rc; + } + + if (flags & FI_AV_USER_ID) + lp->lp_fi_addr = fi_addr[i]; + else + lp->lp_fi_addr = idx; + + fi_addr[i] = idx; + + la = next_peer(la); + } + + return i; +} + +int lnx_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, + uint64_t flags) +{ + struct lnx_peer_table *peer_tbl; + int frc = 0, rc, i; + + peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid); + + for (i = 0; i < count; i++) { + rc = lnx_peer_remove(peer_tbl, (int)fi_addr[i]); + if (rc) + frc = rc; + } + + return frc; +} + +static const char * +lnx_av_straddr(struct fid_av *av, const void *addr, + char *buf, size_t *len) +{ + /* TODO: implement */ + return NULL; +} + +static int +lnx_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, + size_t *addrlen) +{ + /* TODO: implement */ + return -FI_EOPNOTSUPP; +} + +static struct fi_ops_av lnx_av_ops = { + .size = sizeof(struct fi_ops_av), + .insert = lnx_av_insert, + .remove = lnx_av_remove, + .insertsvc = fi_no_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .lookup = lnx_av_lookup, + .straddr = lnx_av_straddr, +}; + +static void lnx_get_core_av_attr(struct local_prov_ep *ep, + struct fi_av_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->type = ep->lpe_fi_info->domain_attr->av_type; +} + +static int lnx_open_avs(struct local_prov *prov, struct fi_av_attr *attr, + void *context) +{ + int rc = 0; + struct local_prov_ep *ep; + struct fi_av_attr core_attr; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + lnx_get_core_av_attr(ep, &core_attr); + if (ep->lpe_local) + core_attr.count = ep->lpe_fi_info->domain_attr->ep_cnt; + else + core_attr.count = attr->count; + rc = fi_av_open(ep->lpe_domain, &core_attr, + &ep->lpe_av, context); + if (rc) + return rc; + } + + return 0; +} + +int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context) +{ + struct lnx_fabric *fabric; + struct lnx_domain 
*lnx_domain; + struct lnx_peer_table *peer_tbl; + struct local_prov *entry; + size_t table_sz; + int rc = 0; + struct ofi_bufpool_attr pool_attr = { + .size = sizeof(struct lnx_peer), + .flags = OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED, + }; + + if (!attr) + return -FI_EINVAL; + + if (attr->name) + return -FI_ENOSYS; + + if (attr->type != FI_AV_TABLE) + attr->type = FI_AV_TABLE; + + lnx_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid.fid); + fabric = lnx_domain->ld_fabric; + + peer_tbl = calloc(sizeof(*peer_tbl), 1); + if (!peer_tbl) + return -FI_ENOMEM; + + table_sz = attr->count ? attr->count : ofi_universe_size; + table_sz = roundup_power_of_two(table_sz); + pool_attr.chunk_cnt = table_sz; + + rc = ofi_bufpool_create_attr(&pool_attr, &peer_tbl->lpt_entries); + if (rc) { + rc = -FI_ENOMEM; + goto failed; + } + + rc = ofi_av_init_lightweight(&lnx_domain->ld_domain, attr, + &peer_tbl->lpt_av, context); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "failed to initialize AV: %d\n", rc); + goto failed; + } + + peer_tbl->lpt_max_count = table_sz; + peer_tbl->lpt_domain = lnx_domain; + peer_tbl->lpt_av.av_fid.fid.ops = &lnx_av_fi_ops; + peer_tbl->lpt_av.av_fid.ops = &lnx_av_ops; + + assert(fabric->lnx_peer_tbl == NULL); + + /* need this to handle memory registration via fi_mr_regattr(). We need + * to be able to access the peer table to determine which endpoint + * we'll be using based on the source/destination address */ + fabric->lnx_peer_tbl = peer_tbl; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_avs(entry, attr, context); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close; + } + } + + *av = &peer_tbl->lpt_av.av_fid; + + return 0; + +close: + ofi_av_close_lightweight(&peer_tbl->lpt_av); +failed: + lnx_free_peer_tbl(peer_tbl); + return rc; +} + + diff --git a/prov/lnx/src/lnx_cq.c b/prov/lnx/src/lnx_cq.c new file mode 100644 index 00000000000..03b43a593e9 --- /dev/null +++ b/prov/lnx/src/lnx_cq.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +ssize_t lnx_peer_cq_write(struct fid_peer_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write(&lnx_cq->lpc_shared_cq->util_cq, context, + flags, len, buf, data, tag); + + return rc; +} + +ssize_t lnx_peer_cq_writeerr(struct fid_peer_cq *cq, + const struct fi_cq_err_entry *err_entry) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write_error(&lnx_cq->lpc_shared_cq->util_cq, err_entry); + + return rc; +} + +static int lnx_cleanup_cqs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_cq.lpc_core_cq->fid); + if (rc) + frc = rc; + ep->lpe_cq.lpc_core_cq = NULL; + } + + return frc; +} + +static int lnx_cq_close(struct fid *fid) +{ + int rc; + struct lnx_cq *lnx_cq; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(fid, struct lnx_cq, util_cq.cq_fid); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; + + /* close all the open core cqs */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_cqs(entry); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to close domain for %s\n", + entry->lpv_prov_name); + return rc; + } + } + + rc = ofi_cq_cleanup(&lnx_cq->util_cq); + if (rc) + return rc; + + free(lnx_cq); + return 0; +} + +struct fi_ops_cq_owner lnx_cq_write = { + .size = sizeof(lnx_cq_write), + .write = lnx_peer_cq_write, + .writeerr = lnx_peer_cq_writeerr, +}; + +static struct fi_ops lnx_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_cq_close, + .bind = fi_no_bind, + .control = ofi_cq_control, + .ops_open = fi_no_ops_open, +}; + +static void lnx_cq_progress(struct util_cq *cq) +{ + struct lnx_cq *lnx_cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(cq, struct lnx_cq, util_cq); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; + + /* Kick the core provider endpoints to progress */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + fi_cq_read(ep->lpe_cq.lpc_core_cq, NULL, 0); + } +} + +static int lnx_cq_open_core_prov(struct lnx_cq *cq, struct fi_cq_attr *attr) +{ + int rc; + struct local_prov_ep *ep; + struct local_prov *entry; + struct fi_cq_attr peer_attr = {0}; + struct dlist_entry *prov_table = + &cq->lnx_domain->ld_fabric->local_prov_table; + + /* tell the core providers to import my CQ */ + peer_attr.flags |= FI_PEER; + + /* create all the core provider completion queues */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + 
dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + struct fid_cq *core_cq; + struct fi_peer_cq_context cq_ctxt; + + ep->lpe_cq.lpc_shared_cq = cq; + ep->lpe_cq.lpc_cq.owner_ops = &lnx_cq_write; + + cq_ctxt.size = sizeof(cq_ctxt); + cq_ctxt.cq = &ep->lpe_cq.lpc_cq; + + /* pass my CQ into the open and get back the core's cq */ + rc = fi_cq_open(ep->lpe_domain, &peer_attr, &core_cq, &cq_ctxt); + if (rc) + return rc; + + /* before the fi_cq_open() returns the core provider should + * have called fi_export_fid() and got a pointer to the peer + * CQ which we have allocated for this core provider + */ + + ep->lpe_cq.lpc_core_cq = core_cq; + } + } + + return 0; +} + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context) +{ + struct lnx_cq *lnx_cq; + struct lnx_domain *lnx_dom; + int rc; + + lnx_cq = calloc(1, sizeof(*lnx_cq)); + if (!lnx_cq) + return -FI_ENOMEM; + + /* this is going to be a standard CQ from the read side. From the + * write side, it'll use the peer_cq callbacks to write + */ + rc = ofi_cq_init(&lnx_prov, domain, attr, &lnx_cq->util_cq, + &lnx_cq_progress, context); + if (rc) + goto free; + + lnx_dom = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + + lnx_cq->lnx_domain = lnx_dom; + lnx_cq->util_cq.cq_fid.fid.ops = &lnx_cq_fi_ops; + (*cq_fid) = &lnx_cq->util_cq.cq_fid; + + /* open core CQs and tell them to import my CQ */ + rc = lnx_cq_open_core_prov(lnx_cq, attr); + + return rc; + +free: + free(lnx_cq); + return rc; +} diff --git a/prov/lnx/src/lnx_domain.c b/prov/lnx/src/lnx_domain.c new file mode 100644 index 00000000000..f1b055f4a88 --- /dev/null +++ b/prov/lnx/src/lnx_domain.c @@ -0,0 +1,587 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +static struct fi_ops_domain lnx_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = lnx_av_open, + .cq_open = lnx_cq_open, + .endpoint = lnx_endpoint, + .scalable_ep = lnx_scalable_ep, + .cntr_open = fi_no_cntr_open, + .poll_open = fi_no_poll_open, + .stx_ctx = fi_no_stx_context, + .srx_ctx = fi_no_srx_context, + .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, +}; + +static int lnx_cleanup_domains(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (!ep->lpe_domain) + continue; + + rc = fi_close(&ep->lpe_srx_ep->fid); + if (rc) + frc = rc; + + rc = fi_close(&ep->lpe_domain->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static int lnx_domain_close(fid_t fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *domain; + + domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid); + + /* close all the open core domains */ + dlist_foreach_container(&domain->ld_fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_domains(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to close domain for %s\n", + entry->lpv_prov_name); + } + + ofi_mr_cache_cleanup(&domain->ld_mr_cache); + + rc = ofi_domain_close(&domain->ld_domain); + + free(domain); + + return rc; +} + +static int +lnx_mr_regattrs_all(struct local_prov *prov, const struct fi_mr_attr *attr, + uint64_t flags, struct lnx_mem_desc_prov *desc) +{ + int rc = 0; + struct local_prov_ep *ep; + + desc->prov = prov; + + /* TODO: This is another issue here because MR registration can happen + * quite often + */ + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_regattr(ep->lpe_domain, attr, + flags, &desc->core_mr); + + /* TODO: SHM provider returns FI_ENOKEY if requested_key is the + * same as the previous call. Applications, like OMPI, might not + * specify the requested key in fi_mr_attr, so for now ignore that + * error. + * We need a better way of handling this. + * if (rc == -FI_ENOKEY) + * rc = 0; + * I made a change in SHM to support FI_MR_PROV_KEY if set by the + * application.
Setting that bit tells ofi to generate its own requested_key + * for each fi_mr_regattr call + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_regattr() failed: %d\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + return rc; +} + +static int +lnx_mr_close_all(struct lnx_mem_desc *mem_desc) +{ + int i, rc, frc = 0; + struct fid_mr *mr; + + for (i = 0; i < mem_desc->desc_count; i++) { + mr = mem_desc->desc[i].core_mr; + if (!mr) + continue; + rc = fi_close(&mr->fid); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_close() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +int lnx_mr_close(struct fid *fid) +{ + struct lnx_mr *lnx_mr; + struct ofi_mr *mr; + int rc, frc = 0; + + mr = container_of(fid, struct ofi_mr, mr_fid.fid); + lnx_mr = container_of(mr, struct lnx_mr, mr); + + rc = lnx_mr_close_all(mr->mr_fid.mem_desc); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to complete memory deregistration\n"); + frc = rc; + } + + ofi_atomic_dec32(&mr->domain->ref); + + ofi_buf_free(lnx_mr); + + return frc; +} + +static int lnx_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int i, rc, frc = 0; + struct local_prov_ep *ep; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quite often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + dlist_foreach_container(&desc->prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_bind(cmr, &ep->lpe_ep->fid, flags); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s lnx_mr_bind() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + } + + return frc; +} + +static int lnx_mr_control(struct fid *fid, int command, void *arg) +{ + int i, rc, frc = 0; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + if (command != FI_ENABLE) + return -FI_ENOSYS; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quite often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + rc = fi_mr_enable(cmr); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s lnx_mr_control() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +static struct fi_ops lnx_mr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_mr_close, + .bind = lnx_mr_bind, + .control = lnx_mr_control, + .ops_open = fi_no_ops_open +}; + +static int +lnx_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr_fid) +{ + /* + * If the address is specified then use it to find out which + * domain to register the memory against. LNX can be managing + * multiple underlying core provider endpoints, so I need to register + * the memory against the correct one. + * + * Once the domain is determined, I need to set the mr->mem_desc to + * point to a structure which contains my local endpoint I'll end up + * using (which is the same one that I registered the memory against) + * and the associated fid_mr which the core provider set for me. + * + * I return that to the application.
+ * + * When the application calls back into the data operations API it'll + * pass the mr. I can then pull out a pointer to my local endpoint + * which I'll use in the data operation and pass it the correct mr. + * + * If the address is not provided, then I'll register the memory + * buffer against all my core domains, store those and return them to + * the user + */ + + struct lnx_domain *domain; + struct lnx_fabric *fabric; + struct lnx_mr *lnx_mr = NULL; + struct ofi_mr *mr; + struct lnx_mem_desc *mem_desc; + struct local_prov *entry; + /* i starts at 1; desc[0] is reserved for the shm registration */ + int rc = 0, i = 1; + bool shm = false; + + if (fid->fclass != FI_CLASS_DOMAIN || !attr || attr->iov_count <= 0) + return -FI_EINVAL; + + domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid); + fabric = domain->ld_fabric; + + lnx_mr = ofi_buf_alloc(fabric->mem_reg_bp); + if (!lnx_mr) { + rc = -FI_ENOMEM; + goto fail; + } + + mr = &lnx_mr->mr; + mem_desc = &lnx_mr->desc; + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr->context; + mr->mr_fid.fid.ops = &lnx_mr_fi_ops; + mr->mr_fid.mem_desc = mem_desc; + mr->domain = &domain->ld_domain; + mr->flags = flags; + + /* TODO: What happens if you try to register the same piece + * of memory via multiple providers? + * TODO 2: We need a better way to handle memory registration. + * This is simply not very good. We need to have a peer interface + * to memory registration + */ + /* register against all domains */ + dlist_foreach_container(&fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + if (!strcmp(entry->lpv_prov_name, "shm")) + shm = true; + else + shm = false; + if (i >= LNX_MAX_LOCAL_EPS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Exceeded number of allowed memory registrations %s\n", + entry->lpv_prov_name); + rc = -FI_ENOSPC; + goto fail; + } + rc = lnx_mr_regattrs_all(entry, attr, flags, + (shm) ?
&mem_desc->desc[0] : + &mem_desc->desc[i]); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to complete Memory Registration %s\n", + entry->lpv_prov_name); + goto fail; + } + if (!shm) + i++; + } + + mem_desc->desc_count = i; + if (shm) + mr->mr_fid.key = mem_desc->desc[0].core_mr->key; + else + mr->mr_fid.key = mem_desc->desc[1].core_mr->key; + *mr_fid = &mr->mr_fid; + ofi_atomic_inc32(&domain->ld_domain.ref); + + return 0; + +fail: + if (lnx_mr) + ofi_buf_free(lnx_mr); + return rc; +} + +static struct fi_ops lnx_domain_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_domain_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_mr lnx_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = fi_no_mr_reg, + .regv = fi_no_mr_regv, + .regattr = lnx_mr_regattr, +}; + +static int lnx_setup_core_domain(struct local_prov_ep *ep, struct fi_info *info) +{ + struct fi_info *fi, *itr; + + fi = lnx_get_link_by_dom(info->domain_attr->name); + if (!fi) + return -FI_ENODATA; + + for (itr = fi; itr; itr = itr->next) { + if (!strcmp(itr->fabric_attr->name, ep->lpe_fabric_name)) { + ep->lpe_fi_info = fi_dupinfo(itr); + return FI_SUCCESS; + } + } + + ep->lpe_fi_info = NULL; + + return -FI_ENOENT; +} + +static struct fi_ops_srx_owner lnx_srx_ops = { + .size = sizeof(struct fi_ops_srx_owner), + .get_msg = lnx_get_msg, + .get_tag = lnx_get_tag, + .queue_msg = lnx_queue_msg, + .queue_tag = lnx_queue_tag, + .free_entry = lnx_free_entry, + .foreach_unspec_addr = lnx_foreach_unspec_addr, +}; + +static int lnx_open_core_domains(struct local_prov *prov, + void *context, struct lnx_domain *lnx_domain, + struct fi_info *info) +{ + int rc; + struct local_prov_ep *ep; + struct fi_rx_attr attr = {0}; + struct fi_peer_srx_context peer_srx; + struct dlist_entry *tmp; + int srq_support = 1; + + fi_param_get_bool(&lnx_prov, "use_srq", &srq_support); + + attr.op_flags = FI_PEER; + peer_srx.size = sizeof(peer_srx); + + if (srq_support) + lnx_domain->ld_srx_supported = true; + else + lnx_domain->ld_srx_supported = false; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + /* the fi_info we setup when we created the fabric might not + * necessarily be the correct one. It'll have the same fabric + * information, since the fabric information is common among all + * the domains the provider manages. However at this point we need + * to get the fi_info that the application is requesting */ + rc = lnx_setup_core_domain(ep, info); + if (rc) + return rc; + + if (srq_support) { + /* special case for CXI provider. We need to turn off tag + * matching HW offload if we're going to support shared + * receive queues. + */ + if (strstr(ep->lpe_fabric_name, "cxi")) + setenv("FI_CXI_RX_MATCH_MODE", "software", 1); + } + + rc = fi_domain(ep->lpe_fabric, ep->lpe_fi_info, + &ep->lpe_domain, context); + + if (!rc && srq_support) { + ep->lpe_srx.owner_ops = &lnx_srx_ops; + peer_srx.srx = &ep->lpe_srx; + rc = fi_srx_context(ep->lpe_domain, &attr, + &ep->lpe_srx_ep, &peer_srx); + } + + /* if one of the constituent endpoints doesn't support shared + * receive context, then fail, as we can't continue with this + * inconsistency + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s does not support shared" + " receive queues. 
Failing\n", ep->lpe_fabric_name); + return rc; + } + } + + return 0; +} + +static int lnx_addr_add_region_noop(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + return FI_SUCCESS; +} + +static void lnx_addr_del_region(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + struct ofi_mr *mr = (struct ofi_mr *)entry->data; + + ofi_hmem_dev_unregister(mr->iface, (uint64_t) mr->hmem_data); +} + +/* + * provider: shm+cxi:lnx + * fabric: ofi_lnx_fabric + * domain: shm+cxi3:ofi_lnx_domain + * version: 120.0 + * type: FI_EP_RDM + * protocol: FI_PROTO_LNX + * + * Parse out the provider name. It should be shm+ + * + * Create a fabric for shm and one for the other provider. + * + * When fi_domain() is called, we get the fi_info for the + * second provider, which we should've returned as part of the + * fi_getinfo() call. + */ +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *lnx_domain; + struct util_domain *lnx_domain_info; + struct lnx_fabric *lnx_fab = container_of(fabric, struct lnx_fabric, + util_fabric.fabric_fid); + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = default_monitor, + [FI_HMEM_CUDA] = default_cuda_monitor, + [FI_HMEM_ROCR] = default_rocr_monitor, + [FI_HMEM_ZE] = default_ze_monitor, + }; + + /* create a new entry for shm. + * Create its fabric. + * insert fabric in the global table + */ + rc = lnx_setup_core_fabrics(info->domain_attr->name, lnx_fab, context); + if (rc) + goto fail; + + rc = -FI_ENOMEM; + lnx_domain = calloc(sizeof(*lnx_domain), 1); + if (!lnx_domain) + goto fail; + + lnx_domain_info = &lnx_domain->ld_domain; + lnx_domain->ld_fabric = lnx_fab; + + rc = ofi_domain_init(fabric, info, lnx_domain_info, context, + OFI_LOCK_SPINLOCK); + if (rc) + goto fail; + + dlist_foreach_container(&lnx_domain->ld_fabric->local_prov_table, + struct local_prov, entry, lpv_entry) { + rc = lnx_open_core_domains(entry, context, lnx_domain, info); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close_domain; + } + } + + lnx_domain_info->domain_fid.fid.ops = &lnx_domain_fi_ops; + lnx_domain_info->domain_fid.ops = &lnx_domain_ops; + lnx_domain_info->domain_fid.mr = &lnx_mr_ops; + + lnx_domain->ld_mr_cache.add_region = lnx_addr_add_region_noop; + lnx_domain->ld_mr_cache.delete_region = lnx_addr_del_region; + lnx_domain->ld_mr_cache.entry_data_size = sizeof(struct ofi_mr); + rc = ofi_mr_cache_init(&lnx_domain->ld_domain, memory_monitors, + &lnx_domain->ld_mr_cache); + if (rc) + goto close_domain; + + *domain = &lnx_domain_info->domain_fid; + + return 0; + +close_domain: + lnx_domain_close(&(lnx_domain_info->domain_fid.fid)); +fail: + return rc; +} + diff --git a/prov/lnx/src/lnx_ep.c b/prov/lnx/src/lnx_ep.c new file mode 100644 index 00000000000..6590a6056d9 --- /dev/null +++ b/prov/lnx/src/lnx_ep.c @@ -0,0 +1,1181 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +extern struct fi_ops_cm lnx_cm_ops; +extern struct fi_ops_msg lnx_msg_ops; +extern struct fi_ops_tagged lnx_tagged_ops; +extern struct fi_ops_rma lnx_rma_ops; +extern struct fi_ops_atomic lnx_atomic_ops; + +static void lnx_init_ctx(struct fid_ep *ctx, size_t fclass); + +static int lnx_close_ceps(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + + if (ep->lpe_srx.ep_fid.fid.context) + free(ep->lpe_srx.ep_fid.fid.context); + + rc = fi_close(&ep->lpe_ep->fid); + if (rc) + frc = rc; + ofi_bufpool_destroy(ep->lpe_recv_bp); + } + + return frc; +} + +int lnx_ep_close(struct fid *fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_ep *ep; + struct lnx_fabric *fabric; + + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = ep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + lnx_close_ceps(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close endpoint for %s\n", + entry->lpv_prov_name); + } + + ofi_endpoint_close(&ep->le_ep); + free(ep); + + return rc; +} + +static int lnx_enable_core_eps(struct lnx_ep *lep) +{ + int rc; + struct local_prov *entry; + struct local_prov_ep *ep; + int srq_support = 1; + struct lnx_fabric *fabric = lep->le_domain->ld_fabric; + + fi_param_get_bool(&lnx_prov, "use_srq", &srq_support); + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (srq_support) { + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_srx_ep->fid, 0); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, + "%s doesn't support SRX (%d)\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + rc = fi_enable(ep->lpe_ep); + if (rc) + return rc; + } + } + + return 0; +} + +static int lnx_ep_control(struct fid *fid, int command, void *arg) +{ + struct lnx_ep *ep; + int rc; + + ep = 
container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + + switch (command) { + case FI_ENABLE: + if (ep->le_fclass == FI_CLASS_EP && + ((ofi_needs_rx(ep->le_ep.caps) && !ep->le_ep.rx_cq) || + (ofi_needs_tx(ep->le_ep.caps) && !ep->le_ep.tx_cq))) + return -FI_ENOCQ; + if (!ep->le_peer_tbl) + return -FI_ENOAV; + rc = lnx_enable_core_eps(ep); + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +int lnx_cq_bind_core_prov(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct util_cq *cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + cq = container_of(bfid, struct util_cq, cq_fid.fid); + fabric = lep->le_domain->ld_fabric; + + rc = ofi_ep_bind_cq(&lep->le_ep, cq, flags); + if (rc) + return rc; + + /* bind the core providers to their respective CQs */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_cq.lpc_core_cq->fid, flags); + if (rc) + return rc; + } + } + + return 0; +} + +static int lnx_ep_bind_core_prov(struct lnx_fabric *fabric, uint64_t flags) +{ + struct local_prov *entry; + struct local_prov_ep *ep; + int rc; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, &ep->lpe_av->fid, flags); + if (rc) + return rc; + } + } + + return rc; +} + +static int +lnx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc = 0; + struct lnx_ep *ep; + struct lnx_peer_table *peer_tbl; + + switch (fid->fclass) { + case FI_CLASS_EP: /* Standard EP */ + case FI_CLASS_SEP: /* Scalable EP */ + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + + default: + return -FI_EINVAL; + } + + switch (bfid->fclass) { + case FI_CLASS_EQ: + return -FI_ENOSYS; + + case FI_CLASS_CQ: + rc = lnx_cq_bind_core_prov(fid, bfid, flags); + break; + + case FI_CLASS_CNTR: + return -FI_ENOSYS; + + case FI_CLASS_AV: + peer_tbl = container_of(bfid, struct lnx_peer_table, + lpt_av.av_fid.fid); + if (peer_tbl->lpt_domain != ep->le_domain) + return -FI_EINVAL; + ep->le_peer_tbl = peer_tbl; + /* forward the bind to the core provider endpoints */ + rc = lnx_ep_bind_core_prov(ep->le_domain->ld_fabric, flags); + break; + + case FI_CLASS_STX_CTX: /* shared TX context */ + return -FI_ENOSYS; + + case FI_CLASS_SRX_CTX: /* shared RX context */ + return -FI_ENOSYS; + + default: + return -FI_EINVAL; + } + + return rc; +} + +int lnx_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct local_prov *entry; + size_t size = sizeof(struct lnx_addresses); + /* initial location to put the address */ + char ep_addr[FI_NAME_MAX]; + char *tmp = NULL; + struct lnx_addresses *la; + struct lnx_address_prov *lap; + char hostname[FI_NAME_MAX]; + size_t prov_addrlen; + size_t addrlen_list[LNX_MAX_LOCAL_EPS]; + int rc, j = 0; + struct lnx_ep *lnx_ep; + struct lnx_fabric *fabric; + struct local_prov_ep *ep; + + lnx_ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lnx_ep->le_domain->ld_fabric; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? or do we need a better way of + * determining if the address is local? 
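+ * + * For reference, the address blob assembled below has this layout + * (a sketch derived from the code in this function; sizes vary per + * provider): + * + * struct lnx_addresses (la_hostname, la_prov_count) + * followed by la_prov_count blocks of: + * struct lnx_address_prov (lap_prov, lap_addr_count, lap_addr_size) + * followed by lap_addr_count addresses of lap_addr_size bytes each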
+ */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + addrlen_list[0] = 0; + + /* calculate the size of the address */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + size += sizeof(struct lnx_address_prov); + prov_addrlen = 0; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_getname(&ep->lpe_ep->fid, (void*)ep_addr, &prov_addrlen); + if (rc == -FI_ETOOSMALL) { + size += prov_addrlen * entry->lpv_ep_count; + addrlen_list[j] = prov_addrlen; + j++; + break; + } else { + return -FI_EINVAL; + } + } + } + + if (!addr || *addrlen < size) { + *addrlen = size; + return -FI_ETOOSMALL; + } + + la = addr; + + lap = (struct lnx_address_prov *)((char*)la + sizeof(*la)); + + j = 0; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + memcpy(lap->lap_prov, entry->lpv_prov_name, FI_NAME_MAX - 1); + lap->lap_addr_count = entry->lpv_ep_count; + lap->lap_addr_size = addrlen_list[j]; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + tmp = (char*)lap + sizeof(*lap); + + rc = fi_getname(&ep->lpe_ep->fid, (void*)tmp, &addrlen_list[j]); + if (rc) + return rc; + + if (lap->lap_addr_size != addrlen_list[j]) + return -FI_EINVAL; + + tmp += addrlen_list[j]; + } + + lap = (struct lnx_address_prov *)tmp; + j++; + } + + la->la_prov_count = j; + memcpy(la->la_hostname, hostname, FI_NAME_MAX - 1); + + return 0; +} + +static ssize_t lnx_ep_cancel(fid_t fid, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + switch (fid->fclass) { + case FI_CLASS_EP: + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + case FI_CLASS_RX_CTX: + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + break; + case FI_CLASS_TX_CTX: + return -FI_ENOENT; + default: + return -FI_EINVAL; + } + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_cancel(&ep->lpe_ep->fid, context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. " + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + +static int lnx_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen) +{ + int rc = 0; + struct lnx_ep *lep; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_setopt(&ep->lpe_ep->fid, level, optname, + optval, optlen); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + + +static int lnx_ep_txc(struct fid_ep *fid, int index, struct fi_tx_attr *attr, + struct fid_ep **tx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->tx_ctx_cnt) + continue; + + rc = fi_tx_context(ep->lpe_ep, index, attr, + &ep->lpe_txc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. " + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_TX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_tx_ctx); + /* set the callbacks for the transmit context */ + *tx_ep = &ctx->ctx_ep; + + return rc; +} + +static int lnx_ep_rxc(struct fid_ep *fid, int index, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->rx_ctx_cnt) + continue; + + rc = fi_rx_context(ep->lpe_ep, index, attr, + &ep->lpe_rxc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_RX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_rx_ctx); + /* set the callbacks for the receive context */ + *rx_ep = &ctx->ctx_ep; + + return rc; +} + +struct fi_ops_ep lnx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + /* can't get opt, because there is no way to report multiple + * options for the different links */ + .getopt = fi_no_getopt, + .setopt = lnx_ep_setopt, + .tx_ctx = lnx_ep_txc, + .rx_ctx = lnx_ep_rxc, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +struct fi_ops lnx_ep_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ep_close, + .bind = lnx_ep_bind, + .control = lnx_ep_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_cm lnx_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = lnx_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, +}; + +static int lnx_open_eps(struct local_prov *prov, struct fi_info *info, + void *context, size_t fclass, struct lnx_ep *lep) +{ + int rc = 0; + struct local_prov_ep *ep; + struct dlist_entry *tmp; + struct ofi_bufpool_attr bp_attrs = {}; + struct lnx_srx_context *ctxt; + + ctxt = calloc(1, sizeof(*ctxt)); + if (!ctxt) + return -FI_ENOMEM; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + if (fclass == FI_CLASS_EP) { + rc = fi_endpoint(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } else { + /* update endpoint attributes with whatever is being + * passed from the application + */ + if (ep->lpe_fi_info && info) { + ep->lpe_fi_info->ep_attr->tx_ctx_cnt = + info->ep_attr->tx_ctx_cnt; + ep->lpe_fi_info->ep_attr->rx_ctx_cnt = + info->ep_attr->rx_ctx_cnt; + } + + ep->lpe_txc = calloc(info->ep_attr->tx_ctx_cnt, + sizeof(*ep->lpe_txc)); + ep->lpe_rxc = calloc(info->ep_attr->rx_ctx_cnt, + sizeof(*ep->lpe_rxc)); + if (!ep->lpe_txc || !ep->lpe_rxc) + return -FI_ENOMEM; + + rc = fi_scalable_ep(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } + if (rc) + return rc; + + ctxt->srx_lep = lep; + ctxt->srx_cep = ep; + + ep->lpe_srx.ep_fid.fid.context = ctxt; + ep->lpe_srx.ep_fid.fid.fclass = FI_CLASS_SRX_CTX; + ofi_spin_init(&ep->lpe_bplock); + /* create a buffer pool for the receive requests */ + bp_attrs.size = sizeof(struct lnx_rx_entry); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + rc = ofi_bufpool_create_attr(&bp_attrs, &ep->lpe_recv_bp); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create receive buffer pool"); + return -FI_ENOMEM; + } + } + + return 0; +} + +static void +lnx_ep_nosys_progress(struct util_ep *util_ep) +{ + assert(0); +} + +static inline int +match_tag(uint64_t tag, uint64_t match_tag, uint64_t ignore) +{ + return ((tag | ignore) == (match_tag | ignore)); +} + +static inline bool +lnx_addr_match(fi_addr_t addr1, fi_addr_t addr2) +{ + return (addr1 == addr2); +} + +static inline bool +lnx_search_addr_match(fi_addr_t cep_addr, struct lnx_peer_prov *lpp) +{ + struct lnx_local2peer_map *lpm; + fi_addr_t peer_addr; + int i; + + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, 
+ lpm, entry) { + for (i = 0; i < LNX_MAX_LOCAL_EPS; i++) { + peer_addr = lpm->peer_addrs[i]; + if (peer_addr == FI_ADDR_NOTAVAIL) + break; + if (lnx_addr_match(peer_addr, cep_addr)) + return true; + } + } + + return false; +} + +static int lnx_match_common(uint64_t tag1, uint64_t tag2, uint64_t ignore, + fi_addr_t cep_addr, fi_addr_t lnx_addr, struct lnx_peer *peer, + struct local_prov_ep *cep) +{ + struct lnx_peer_prov *lpp; + struct local_prov *lp; + bool tmatch; + + /* if a request has no address specified it'll match against any + * rx_entry with a matching tag + * or + * if an rx_entry has no address specified, it'll match against any + * request with a matching tag + * + * for non-tagged messages tags will be set to TAG_ANY so they will + * always match and the decision will be made on address only. + */ + tmatch = match_tag(tag1, tag2, ignore); + if (!tmatch) + return tmatch; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "tag1=%lx tag2=%lx ignore=%lx cep_addr=%lx lnx_addr=%lx tmatch=%d\n", + tag1, tag2, ignore, cep_addr, lnx_addr, tmatch); + + /* if we're requested to receive from any peer, then tag matching is + * enough. Non-tagged messages will match regardless. + */ + if (lnx_addr == FI_ADDR_UNSPEC) + return tmatch; + + /* if the address is specified, then we should have a peer, a + * receiving core endpoint and a provider parent + */ + assert(peer && cep && cep->lpe_parent); + + lp = cep->lpe_parent; + + /* if this is a shm core provider, then only go through the lnx + * shm provider + */ + if (cep->lpe_local) + return lnx_search_addr_match(cep_addr, peer->lp_shm_prov); + + /* check if we already have a peer provider. + * A peer can receive messages from multiple providers; we need to + * find the provider which maps to the provider we're currently + * checking. The map looked up can have multiple addresses which + * we can receive from, so we need to check which one of those is + * the correct match. + * + * Note: we're trying to make this loop as efficient as possible, + * because it's executed on the message matching path, which is + * heavily hit. + * + * The theory is that in most use cases: + * - There will be only two providers to check + * - Each provider will have 1 endpoint, and therefore only one map + * - Each peer will only have 1 address. + */ + dlist_foreach_container(&peer->lp_provs, + struct lnx_peer_prov, lpp, entry) { + if (lpp->lpp_prov == lp) + return lnx_search_addr_match(cep_addr, lpp); + } + + return false; +} + +static int lnx_match_unexq(struct dlist_entry *item, const void *args) +{ + /* this entry is placed on the SUQ via the lnx_get_tag() path + * and examined in the lnx_process_tag() path */ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + struct lnx_peer *peer = match_attr->lm_peer; + + /* entry refers to the unexpected message received. + * entry->rx_entry.tag will be the tag of the message or TAG_UNSPEC + * otherwise + * + * entry->rx_entry.addr will be the address of the peer which sent the + * message or ADDR_UNSPEC if the core provider didn't do a reverse + * lookup. + * + * entry->rx_cep will be set to the core endpoint which received the + * message. + * + * match_attr is filled in by lnx_process_tag() and contains + * information passed to us by the application + * + * match_attr->lm_peer is the peer looked up via the addr passed by + * the application to LNX. It is NULL if the addr is ADDR_UNSPEC.
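+ * + * As an illustration of the tag check used above (values made up): + * match_tag(0x1234, 0x12ff, 0x00ff) matches: the low byte is ignored + * match_tag(0x1234, 0x1334, 0x00ff) does not match: bit 8 differs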
+ * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed by the application to LNX via the receive API. + * + * match_attr->lm_addr is the only significant if it's set to + * FI_ADDR_UNSPEC, otherwise it's not used in matching because it's + * the LNX level address and we need to compare the core level address. + */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + match_attr->lm_ignore, entry->rx_entry.addr, + match_attr->lm_addr, peer, entry->rx_cep); +} + +static int lnx_match_recvq(struct dlist_entry *item, const void *args) +{ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + /* this entry is placed on the recvq via the lnx_process_tag() path + * and examined in the lnx_get_tag() path */ + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + + /* entry refers to the receive request waiting for a message + * entry->rx_entry.tag is the tag passed in by the application. + * + * entry->rx_entry.addr is the address passed in by the application. + * This is the LNX level address. It's only significant if it's set + * to ADDR_UNSPEC. Otherwise, it has already been used to look up the + * peer. + * + * entry->rx_cep is always NULL in this case, as this will only be + * known when the message is received. + * + * entry->rx_peer is the LNX peer looked up if a valid address is + * given by the application, otherwise it's NULL. + * + * match_attr information is filled by the lnx_get_tag() callback and + * contains information passed to us by the core endpoint receiving + * the message. + * + * match_attr->rx_peer is not significant because at the lnx_get_tag() + * call there isn't enough information to find what the peer is. + * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed up by the core endpoint receiving the message. + * + * match_attr->lm_addr is the address of the peer which sent the + * message. Set if the core endpoint has done a reverse lookup, + * otherwise set to ADDR_UNSPEC. + * + * match_attr->lm_cep is the core endpoint which received the message. 
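+ * + * In short, the two matchers call lnx_match_common() with mirrored + * arguments: lnx_match_unexq() treats the queued entry as the message + * and match_attr as the receive request, while lnx_match_recvq() + * does the reverse.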
+ */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + entry->rx_ignore, match_attr->lm_addr, + entry->rx_entry.addr, entry->rx_peer, match_attr->lm_cep); +} + +static inline int +lnx_init_queue(struct lnx_queue *q, dlist_func_t *match_func) +{ + int rc; + + rc = ofi_spin_init(&q->lq_qlock); + if (rc) + return rc; + + dlist_init(&q->lq_queue); + + q->lq_match_func = match_func; + + return 0; +} + +static inline int +lnx_init_qpair(struct lnx_qpair *qpair, dlist_func_t *recvq_match_func, + dlist_func_t *unexq_match_func) +{ + int rc = 0; + + rc = lnx_init_queue(&qpair->lqp_recvq, recvq_match_func); + if (rc) + goto out; + rc = lnx_init_queue(&qpair->lqp_unexq, unexq_match_func); + if (rc) + goto out; + +out: + return rc; +} + +static inline int +lnx_init_srq(struct lnx_peer_srq *srq) +{ + int rc; + + rc = lnx_init_qpair(&srq->lps_trecv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + rc = lnx_init_qpair(&srq->lps_recv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + + return rc; +} + +static int lnx_get_ctx(struct local_prov_ep *ep, size_t fclass, + struct fid_ep ***ep_ctx, size_t *size) +{ + switch (fclass) { + case FI_CLASS_RX_CTX: + *ep_ctx = ep->lpe_rxc; + *size = ep->lpe_fi_info->ep_attr->rx_ctx_cnt; + break; + case FI_CLASS_TX_CTX: + *ep_ctx = ep->lpe_txc; + *size = ep->lpe_fi_info->ep_attr->tx_ctx_cnt; + break; + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static void lnx_close_ep_ctx(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return; + + for (i = 0; i < size; i++) { + rc = fi_close(&ep_ctx[i]->fid); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close ep context %lu with %d\n", + fclass, rc); + } +} + +static int lnx_ctx_close(struct fid *fid) +{ + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + lnx_close_ep_ctx(ep, fid->fclass); + } + + return FI_SUCCESS; +} + +static int lnx_ctx_bind_cq(struct local_prov_ep *ep, size_t fclass, + struct fid *bfid, uint64_t flags) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = fi_ep_bind(ep_ctx[i], bfid, flags); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (bfid->fclass == FI_CLASS_CQ) + /* bind the context to the shared cq */ + rc 
= lnx_ctx_bind_cq(ep, fid->fclass, + &ep->lpe_cq.lpc_core_cq->fid, + flags); + else + return -FI_ENOSYS; + + if (rc) + return rc; + } + } + + return FI_SUCCESS; +} + +static int +lnx_enable_ctx_eps(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = fi_enable(ep_ctx[i]); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_control(struct fid *fid, int command, void *arg) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + switch (command) { + case FI_ENABLE: + if (!lep->le_peer_tbl) + return -FI_ENOAV; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = lnx_enable_ctx_eps(ep, fid->fclass); + if (rc) + return rc; + } + } + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +static struct fi_ops lnx_ctx_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ctx_close, + .bind = lnx_ctx_bind, + .control = lnx_ctx_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_ep lnx_ctx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +static void +lnx_init_ctx(struct fid_ep *ctx, size_t fclass) +{ + ctx->fid.fclass = fclass; + ctx->fid.ops = &lnx_ctx_ops; + ctx->ops = &lnx_ctx_ep_ops; + ctx->msg = &lnx_msg_ops; + ctx->tagged = &lnx_tagged_ops; + ctx->rma = &lnx_rma_ops; + ctx->atomic = &lnx_atomic_ops; +} + +static int +lnx_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, + struct lnx_ep **out_ep, void *context, size_t fclass) +{ + int rc; + struct lnx_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + uint64_t mr_mode; + + ep = calloc(1, sizeof(*ep)); + if (!ep) + return -FI_ENOMEM; + + ep->le_fclass = fclass; + ep->le_ep.ep_fid.fid.fclass = fclass; + + ep->le_ep.ep_fid.fid.ops = &lnx_ep_fi_ops; + ep->le_ep.ep_fid.ops = &lnx_ep_ops; + ep->le_ep.ep_fid.cm = &lnx_cm_ops; + ep->le_ep.ep_fid.msg = &lnx_msg_ops; + ep->le_ep.ep_fid.tagged = &lnx_tagged_ops; + ep->le_ep.ep_fid.rma = &lnx_rma_ops; + ep->le_ep.ep_fid.atomic = &lnx_atomic_ops; + ep->le_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + lnx_init_srq(&ep->le_srq); + + dlist_init(&ep->le_rx_ctx); + dlist_init(&ep->le_tx_ctx); + + fabric = ep->le_domain->ld_fabric; + + /* create all the core provider endpoints */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_eps(entry, info, context, fclass, ep); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to create ep for %s\n", + entry->lpv_prov_name); + goto fail; + } + } + + mr_mode = lnx_util_prov.info->domain_attr->mr_mode; + lnx_util_prov.info->domain_attr->mr_mode = 0; + rc = ofi_endpoint_init(domain, (const struct util_prov *)&lnx_util_prov, + (struct fi_info *)lnx_util_prov.info, &ep->le_ep, + context, 
lnx_ep_nosys_progress); + if (rc) + goto fail; + + lnx_util_prov.info->domain_attr->mr_mode = mr_mode; + *out_ep = ep; + + return 0; + +fail: + free(ep); + return rc; +} + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_SEP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_EP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + + diff --git a/prov/lnx/src/lnx_init.c b/prov/lnx/src/lnx_init.c new file mode 100644 index 00000000000..d1377a0dd9d --- /dev/null +++ b/prov/lnx/src/lnx_init.c @@ -0,0 +1,885 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +#define LNX_PASSTHRU_TX_OP_FLAGS (FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | \ + FI_DELIVERY_COMPLETE) +#define LNX_PASSTHRU_RX_OP_FLAGS (0ULL) +#define LNX_TX_OP_FLAGS (FI_INJECT_COMPLETE | FI_COMPLETION | \ + FI_DELIVERY_COMPLETE | FI_TRANSMIT_COMPLETE) +#define LNX_RX_OP_FLAGS (FI_COMPLETION) + +ofi_spin_t global_bplock; +struct ofi_bufpool *global_recv_bp = NULL; + +struct util_fabric lnx_fabric_info; + +struct fi_tx_attr lnx_tx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_TX_OP_FLAGS | LNX_TX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .inject_size = SIZE_MAX, + .size = SIZE_MAX, + .iov_limit = LNX_IOV_LIMIT, + .rma_iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_rx_attr lnx_rx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_RX_OP_FLAGS | LNX_RX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .total_buffered_recv = 0, + .size = 1024, + .iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_ep_attr lnx_ep_attr = { + .type = FI_EP_UNSPEC, + .protocol = FI_PROTO_LNX, + .protocol_version = 1, + .max_msg_size = SIZE_MAX, + .msg_prefix_size = SIZE_MAX, + .max_order_raw_size = SIZE_MAX, + .max_order_war_size = SIZE_MAX, + .max_order_waw_size = SIZE_MAX, + .mem_tag_format = FI_TAG_GENERIC, + .tx_ctx_cnt = SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .auth_key = NULL, + .auth_key_size = 0, +}; + +struct fi_domain_attr lnx_domain_attr = { + .name = "ofi_lnx_domain", + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_AUTO, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_TABLE, + .mr_mode = FI_MR_RAW, + .mr_key_size = SIZE_MAX, + .cq_data_size = SIZE_MAX, + .cq_cnt = SIZE_MAX, + .ep_cnt = SIZE_MAX, + .tx_ctx_cnt = SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .max_ep_tx_ctx = SIZE_MAX, + .max_ep_rx_ctx = SIZE_MAX, + .max_ep_stx_ctx = SIZE_MAX, + .max_ep_srx_ctx = SIZE_MAX, + .cntr_cnt = SIZE_MAX, + .mr_iov_limit = SIZE_MAX, + .caps = ~0x0ULL, + .auth_key_size = SIZE_MAX, + .max_err_data = SIZE_MAX, + .mr_cnt = SIZE_MAX, +}; + +struct fi_fabric_attr lnx_fabric_attr = { + .prov_version = OFI_VERSION_DEF_PROV, + .name = "ofi_lnx_fabric", +}; + +struct fi_info lnx_info = { + .caps = ~0x0ULL, + .tx_attr = &lnx_tx_attr, + .rx_attr = &lnx_rx_attr, + .ep_attr = &lnx_ep_attr, + .domain_attr = &lnx_domain_attr, + .fabric_attr = &lnx_fabric_attr +}; + +static struct fi_ops lnx_fabric_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_fabric_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_fabric lnx_fabric_ops = { + .size = sizeof(struct fi_ops_fabric), + .domain = lnx_domain_open, + .passive_ep = fi_no_passive_ep, + .eq_open = fi_no_eq_open, + .wait_open = fi_no_wait_open, + .trywait = fi_no_trywait +}; + +struct fi_provider lnx_prov = { + .name = OFI_LNX, + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, + .getinfo = lnx_getinfo, + .fabric = lnx_fabric, + .cleanup = lnx_fini +}; + +struct util_prov lnx_util_prov = { + .prov = &lnx_prov, + .info = &lnx_info, + .flags = 0 +}; + +/* + * For the fi_getinfo() -> fi_fabric() -> fi_domain() path, we need to + * keep track of the fi_info in case we need them later on when linking in + * the fi_fabric() function. 
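+ * + * The expected application-side sequence (a hedged sketch; these are + * the standard libfabric calls, and the API version value is + * illustrative): + * + * fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info); + * fi_fabric(info->fabric_attr, &fabric, NULL); + * fi_domain(fabric, info, &domain, NULL);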
+ * + * This cache gets cleared after we use the ones we need, or when the + * library exists, if LNX is never used. + */ +struct dlist_entry lnx_fi_info_cache; +/* this is a list of all possible links */ +struct dlist_entry lnx_links; +struct dlist_entry lnx_links_meta; + +struct lnx_fi_cache_entry { + struct dlist_entry entry; + struct fi_info *fi; +}; + +struct lnx_fi_info_meta { + struct dlist_entry entry; + struct fi_info *lnx_rep; + struct fi_info *lnx_link; +}; + +static int lnx_get_cache_meta(struct dlist_entry *head, int *size) +{ + int num_prov = 0; + struct dlist_entry *e; + + dlist_foreach(head, e) + num_prov++; + + *size = num_prov; + + return FI_SUCCESS; +} + +static void lnx_free_meta(void) +{ + struct lnx_fi_info_meta *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry, tmp) { + dlist_remove(&e->entry); + free(e); + } +} + +static void lnx_free_info_cache(struct dlist_entry *head, bool meta) +{ + struct lnx_fi_cache_entry *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(head, struct lnx_fi_cache_entry, e, + entry, tmp) { + fi_freeinfo(e->fi); + dlist_remove(&e->entry); + free(e); + } + + if (meta) + lnx_free_meta(); +} + +static int lnx_cache_info(struct dlist_entry *head, + struct fi_info *info) +{ + struct lnx_fi_cache_entry *e = calloc(1, sizeof(*e)); + + if (!e) + return -FI_ENOMEM; + dlist_init(&e->entry); + e->fi = info; + + dlist_insert_tail(&e->entry, head); + + return 0; +} + +struct fi_info * +lnx_get_link_by_dom(char *domain_name) +{ + struct fi_info *info; + struct lnx_fi_info_meta *e; + + dlist_foreach_container(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry) { + info = e->lnx_rep; + if (info && info->domain_attr) { + if (!strcmp(domain_name, + info->domain_attr->name)) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Found %s\n", + info->fabric_attr->prov_name); + return e->lnx_link; + } + } + } + + return NULL; +} + +static void lnx_insert_tail(struct fi_info *head, struct fi_info *item) +{ + struct fi_info *itr = head; + + while (itr->next) + itr = itr->next; + itr->next = item; +} + +static void lnx_remove_tail(struct fi_info **head) +{ + struct fi_info *itr = *head, *prev = NULL; + + while (itr->next) { + prev = itr; + itr = itr->next; + } + + if (prev) + prev->next = NULL; + else + *head = NULL; + free(itr); +} + +static struct fi_info *lnx_dupinfo_list(struct fi_info *l) +{ + struct fi_info *itr, *new, *prev = NULL, *head = NULL; + + for (itr = l; itr; itr = itr->next) { + new = fi_dupinfo(itr); + if (!new) { + if (head) + fi_freeinfo(head); + return NULL; + } + + if (!head) + head = new; + + if (prev) { + prev->next = new; + prev = new; + } else { + prev = new; + } + } + + return head; +} + +static int gen_links_rec(struct dlist_entry *current, struct dlist_entry *head, + struct dlist_entry *result, struct fi_info *l, + int depth, int target_depth) +{ + int rc; + struct fi_info *itr; + struct fi_info *fi_copy, *dup; + struct lnx_fi_cache_entry *e, *new; + + while(current->next != head) { + e = container_of(current->next, struct lnx_fi_cache_entry, entry); + for (itr = e->fi; itr; itr = itr->next) { + fi_copy = fi_dupinfo(itr); + if (l) { + lnx_insert_tail(l, fi_copy); + } else { + l = fi_copy; + } + if (current->next->next == head && + depth == target_depth) { + dup = lnx_dupinfo_list(l); + if (!dup) + return -FI_ENOMEM; + new = calloc(1, sizeof(*new)); + if (!new) + return -FI_ENOMEM; + new->fi = dup; + dlist_init(&new->entry); + dlist_insert_tail(&new->entry, result); + } + rc = 
gen_links_rec(current->next, head, result, l, + depth+1, target_depth); + lnx_remove_tail(&l); + if (rc) + return rc; + } + current = current->next; + } + + return FI_SUCCESS; +} + +static int gen_links(struct dlist_entry *head, struct dlist_entry *result, + int target_depth) +{ + return gen_links_rec(head, head, result, NULL, 1, target_depth); +} + +static int lnx_form_info(struct fi_info *fi, struct fi_info **out) +{ + int size_prov = 0, size_dom = 0, rc = FI_SUCCESS; + struct lnx_fi_info_meta *meta = NULL; + char *lnx_prov, *lnx_dom, *s; + struct fi_info *itr, *r = NULL; + bool copy = false; + uint64_t min_inject_size = SIZE_MAX; + + for (itr = fi; itr; itr = itr->next) { + size_prov += strlen(itr->fabric_attr->prov_name)+1; + size_dom += strlen(itr->domain_attr->name)+1; + if (itr->tx_attr && itr->tx_attr->inject_size < min_inject_size) + min_inject_size = itr->tx_attr->inject_size; + } + + lnx_dom = calloc(size_dom, sizeof(char)); + lnx_prov = calloc(size_prov, sizeof(char)); + if (!lnx_prov || !lnx_dom) + return -FI_ENOMEM; + + for (itr = fi; itr; itr = itr->next) { + strcat(lnx_prov, itr->fabric_attr->prov_name); + strcat(lnx_dom, itr->domain_attr->name); + if (itr->next) { + strcat(lnx_dom, "+"); + strcat(lnx_prov, "+"); + } + if (!strncmp(itr->fabric_attr->prov_name, "shm", 3)) + continue; + + if (!copy) { + meta = calloc(1, sizeof(*meta)); + r = fi_dupinfo(itr); + if (!r || !meta) { + rc = -FI_ENOMEM; + goto fail; + } + r->domain_attr->av_type = FI_AV_TABLE; + meta->lnx_rep = r; + meta->lnx_link = fi; + if (r->tx_attr) + r->tx_attr->inject_size = min_inject_size; + dlist_init(&meta->entry); + dlist_insert_tail(&meta->entry, &lnx_links_meta); + copy = true; + } + } + + if (!r) { + rc = -FI_ENODATA; + goto fail; + } + + free(r->fabric_attr->prov_name); + free(r->fabric_attr->name); + free(r->domain_attr->name); + + r->fabric_attr->name = NULL; + r->domain_attr->name = NULL; + r->fabric_attr->prov_name = lnx_prov; + + if (asprintf(&s, "%s", lnx_info.fabric_attr->name) < 0) + goto fail; + r->fabric_attr->name = s; + + if (asprintf(&s, "%s:%s", lnx_dom, lnx_info.domain_attr->name) < 0) + goto fail; + r->domain_attr->name = s; + free(lnx_dom); + + *out = r; + return FI_SUCCESS; + +fail: + if (meta) + free(meta); + if (r) + fi_freeinfo(r); + free(lnx_dom); + return rc; +} + +static int lnx_generate_info(struct fi_info **info) +{ + struct fi_info *fi = NULL, *head = NULL, *prev = NULL; + struct lnx_fi_cache_entry *e; + int rc, size; + + /* we need at least 2 providers to link */ + rc = lnx_get_cache_meta(&lnx_fi_info_cache, &size); + if (rc || size < 2) + return -FI_ENODATA; + + rc = gen_links(&lnx_fi_info_cache, &lnx_links, size); + if (rc) + return rc; + + /* + * 1. Iterate over the links and create a linked list of fi_infos + * each fi_info in the list represents one of the links + * 2. Have metadata associated with each fi_info to refer back to + * an entry in the lnx_links cache. + * 3. When the application selects one of these fi_infos, we can + * then find the appropriate link in the cache and be able to + * create the underlying core providers correctly. 
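+ * + * Example (consistent with the comment above lnx_domain_open() in + * lnx_domain.c): linking shm and cxi can surface an fi_info with + * fabric_attr->prov_name = "shm+cxi" and domain_attr->name = + * "shm+cxi3:ofi_lnx_domain".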
+ */ + dlist_foreach_container(&lnx_links, struct lnx_fi_cache_entry, e, + entry) { + rc = lnx_form_info(e->fi, &fi); + if (rc) + goto err; + + if (prev) { + prev->next = fi; + prev = fi; + } else { + prev = fi; + head = fi; + } + } + + *info = head; + + return FI_SUCCESS; + +err: + if (fi) + fi_freeinfo(fi); + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + return -FI_ENODATA; +} + +int lnx_getinfo_helper(uint32_t version, char *prov, struct fi_info *lnx_hints) +{ + int rc; + char *orig_prov_name = NULL; + struct fi_info *core_info; + uint64_t caps, mr_mode; + bool shm = false; + + caps = lnx_hints->caps; + mr_mode = lnx_hints->domain_attr->mr_mode; + + if (lnx_hints->fabric_attr->prov_name) { + orig_prov_name = lnx_hints->fabric_attr->prov_name; + lnx_hints->fabric_attr->prov_name = NULL; + } + + lnx_hints->fabric_attr->prov_name = prov; + if (!strncmp(prov, "shm", 3)) { + shm = true; + /* make sure we get the correct shm provider */ + lnx_hints->caps &= ~(FI_REMOTE_COMM | FI_LOCAL_COMM); + lnx_hints->caps |= FI_HMEM; + lnx_hints->domain_attr->mr_mode |= (FI_MR_VIRT_ADDR | FI_MR_HMEM + | FI_MR_PROV_KEY); + } + rc = fi_getinfo(version, NULL, NULL, OFI_GETINFO_HIDDEN, + lnx_hints, &core_info); + + lnx_hints->fabric_attr->prov_name = orig_prov_name; + if (rc) + return rc; + + if (shm) { + lnx_hints->caps = caps; + lnx_hints->domain_attr->mr_mode = mr_mode; + } + + rc = lnx_cache_info(&lnx_fi_info_cache, core_info); + + return rc; +} + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info) +{ + int rc; + struct fi_info *lnx_hints; + char *linked_provs, *linked_provs_cp, *token, *exclude = NULL; + + rc = fi_param_get_str(&lnx_prov, "prov_links", + &linked_provs); + if (rc) + return rc; + + if (strstr(linked_provs, "lnx")) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Can't specify the lnx provider as part of the link: %s\n", + linked_provs); + return -FI_EINVAL; + } + + linked_provs_cp = strdup(linked_provs); + if (!linked_provs_cp) + return -FI_ENOMEM; + + /* The assumption is that the entire series of + * lnx_getinfo()->lnx_fabric()->lnx_domain()->lnx_endpoint() are + * going to be called before another lnx_getinfo() is called again. + * Based on this assumption, we will free the cache whenever + * lnx_getinfo() is called + */ + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + /* If the hints are not provided then we endup with a new block */ + lnx_hints = fi_dupinfo(hints); + if (!lnx_hints) + return -FI_ENOMEM; + + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, lnx_prov.name); + if (rc) + return rc; + + /* get the providers which support peer functionality. These are + * the only ones we can link*/ + lnx_hints->caps |= FI_PEER; + + token = strtok(linked_provs_cp, "+"); + while (token) { + lnx_getinfo_helper(version, token, lnx_hints); + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, token); + if (rc) + goto free_hints; + token = strtok(NULL, "+"); + } + free(linked_provs_cp); + + /* Generate the lnx info which represents all possible combination + * of domains which are to be linked. 
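+ * + * For example, with FI_LNX_PROV_LINKS=shm+cxi (the environment form + * of the prov_links parameter, assuming the usual libfabric naming + * convention) the strtok() loop above calls lnx_getinfo_helper() + * once for "shm" and once for "cxi"; lnx_generate_info() below then + * combines the cached fi_infos into link candidates.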
+ */ + rc = lnx_generate_info(info); + +free_hints: + free(exclude); + fi_freeinfo(lnx_hints); + return rc; +} + +static struct local_prov * +lnx_get_local_prov(struct dlist_entry *prov_table, char *prov_name) +{ + struct local_prov *entry; + + /* close all the open core fabrics */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + if (!strncasecmp(entry->lpv_prov_name, prov_name, FI_NAME_MAX)) + return entry; + } + + return NULL; +} + +static int +lnx_add_ep_to_prov(struct local_prov *prov, struct local_prov_ep *ep) +{ + dlist_insert_tail(&ep->entry, &prov->lpv_prov_eps); + ep->lpe_parent = prov; + prov->lpv_ep_count++; + + return FI_SUCCESS; +} + +static int +lnx_setup_core_prov(struct fi_info *info, struct dlist_entry *prov_table, + struct local_prov **shm_prov, void *context) +{ + int rc = -FI_EINVAL; + struct local_prov_ep *ep = NULL; + struct local_prov *lprov, *new_lprov = NULL; + + ep = calloc(sizeof(*ep), 1); + if (!ep) + return -FI_ENOMEM; + + new_lprov = calloc(sizeof(*new_lprov), 1); + if (!new_lprov) + goto free_entry; + + dlist_init(&new_lprov->lpv_prov_eps); + + rc = fi_fabric(info->fabric_attr, &ep->lpe_fabric, context); + if (rc) + return rc; + + ep->lpe_fi_info = info; + strncpy(ep->lpe_fabric_name, info->fabric_attr->name, + FI_NAME_MAX - 1); + + lprov = lnx_get_local_prov(prov_table, info->fabric_attr->prov_name); + if (!lprov) { + lprov = new_lprov; + new_lprov = NULL; + strncpy(lprov->lpv_prov_name, info->fabric_attr->prov_name, + FI_NAME_MAX - 1); + } else { + free(new_lprov); + } + + /* indicate that this fabric can be used for on-node communication */ + if (!strncasecmp(lprov->lpv_prov_name, "shm", 3)) { + *shm_prov = lprov; + ep->lpe_local = true; + } + + dlist_init(&ep->entry); + rc = lnx_add_ep_to_prov(lprov, ep); + if (rc) + goto free_all; + + dlist_insert_after(&lprov->lpv_entry, prov_table); + + return 0; + +free_all: + if (new_lprov) + free(new_lprov); +free_entry: + if (ep) + free(ep); + + return rc; +} + +int +lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab, + void *context) +{ + int rc; + struct fi_info *link, *itr; + + link = lnx_get_link_by_dom(name); + if (!link) + return -FI_ENODATA; + + for (itr = link; itr; itr = itr->next) { + rc = lnx_setup_core_prov(itr, &lnx_fab->local_prov_table, + &lnx_fab->shm_prov, context); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context) +{ + struct ofi_bufpool_attr bp_attrs = {}; + struct lnx_fabric *lnx_fab; + int rc; + + lnx_fab = calloc(sizeof(*lnx_fab), 1); + if (!lnx_fab) + return -FI_ENOMEM; + + bp_attrs.size = sizeof(struct lnx_mr); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT32_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + rc = ofi_bufpool_create_attr(&bp_attrs, &lnx_fab->mem_reg_bp); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create memory registration buffer pool"); + free(lnx_fab); + return -FI_ENOMEM; + } + + /* initialize the provider table */ + dlist_init(&lnx_fab->local_prov_table); + + rc = ofi_fabric_init(&lnx_prov, lnx_info.fabric_attr, + lnx_info.fabric_attr, + &lnx_fab->util_fabric, context); + if (rc) + goto fail; + + lnx_fab->util_fabric.fabric_fid.fid.ops = &lnx_fabric_fi_ops; + lnx_fab->util_fabric.fabric_fid.ops = &lnx_fabric_ops; + *fabric = &lnx_fab->util_fabric.fabric_fid; + + return 0; + +fail: + return rc; +} + +void lnx_fini(void) +{ + lnx_free_info_cache(&lnx_fi_info_cache, false); + 
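/* freeing the links cache with meta == true also tears down the + * lnx_links_meta entries (see lnx_free_info_cache()) */ +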
+
+void lnx_fini(void)
+{
+	lnx_free_info_cache(&lnx_fi_info_cache, false);
+	lnx_free_info_cache(&lnx_links, true);
+	ofi_bufpool_destroy(global_recv_bp);
+}
+
+static int lnx_free_ep(struct local_prov *prov, struct local_prov_ep *ep)
+{
+	int rc;
+
+	if (!prov || !ep)
+		return FI_SUCCESS;
+
+	rc = fi_close(&ep->lpe_fabric->fid);
+	fi_freeinfo(ep->lpe_fi_info);
+	free(ep);
+	prov->lpv_ep_count--;
+
+	if (prov->lpv_ep_count == 0)
+		dlist_remove(&prov->lpv_entry);
+
+	return rc;
+}
+
+static int lnx_free_eps(struct local_prov *prov)
+{
+	int rc, frc = 0;
+	struct dlist_entry *tmp;
+	struct local_prov_ep *ep;
+
+	dlist_foreach_container_safe(&prov->lpv_prov_eps,
+				     struct local_prov_ep, ep, entry, tmp) {
+		dlist_remove(&ep->entry);
+		rc = lnx_free_ep(prov, ep);
+		if (rc)
+			frc = rc;
+	}
+
+	return frc;
+}
+
+int lnx_fabric_close(struct fid *fid)
+{
+	int rc = 0;
+	struct util_fabric *fabric;
+	struct lnx_fabric *lnx_fab;
+	struct local_prov *entry;
+	struct dlist_entry *tmp;
+
+	fabric = container_of(fid, struct util_fabric, fabric_fid.fid);
+	lnx_fab = container_of(fabric, struct lnx_fabric, util_fabric);
+
+	/* close all the open core fabrics. Remove and re-initialize the
+	 * entry so the removal lnx_free_ep() may do is a harmless no-op
+	 */
+	dlist_foreach_container_safe(&lnx_fab->local_prov_table,
+				     struct local_prov, entry, lpv_entry, tmp) {
+		dlist_remove_init(&entry->lpv_entry);
+		rc = lnx_free_eps(entry);
+		if (rc)
+			FI_WARN(&lnx_prov, FI_LOG_CORE,
+				"Failed to close provider %s\n",
+				entry->lpv_prov_name);
+
+		free(entry);
+	}
+
+	/* free mr registration pool */
+	ofi_bufpool_destroy(lnx_fab->mem_reg_bp);
+
+	rc = ofi_fabric_close(fabric);
+
+	return rc;
+}
+
+void ofi_link_fini(void)
+{
+	lnx_prov.cleanup();
+}
+
+LNX_INI
+{
+	struct ofi_bufpool_attr bp_attrs = {};
+	int ret;
+
+	fi_param_define(&lnx_prov, "prov_links", FI_PARAM_STRING,
+			"Specify which providers LNX will link together. Format: "
+			"<prov1>+<prov2>+...+<provN>. EX: shm+cxi");
+
+	fi_param_define(&lnx_prov, "disable_shm", FI_PARAM_BOOL,
+			"Turn off SHM support. Defaults to 0");
+
+	fi_param_define(&lnx_prov, "use_srq", FI_PARAM_BOOL,
+			"Turns shared receive queue support on and off. Defaults "
+			"to on. When SRQ is turned on, some hardware offload "
+			"capabilities, e.g. hardware tag matching, will not work");
+
+	dlist_init(&lnx_fi_info_cache);
+	dlist_init(&lnx_links);
+	dlist_init(&lnx_links_meta);
+
+	if (!global_recv_bp) {
+		bp_attrs.size = sizeof(struct lnx_rx_entry);
+		bp_attrs.alignment = 8;
+		bp_attrs.max_cnt = UINT16_MAX;
+		bp_attrs.chunk_cnt = 64;
+		bp_attrs.flags = OFI_BUFPOOL_NO_TRACK;
+		ret = ofi_bufpool_create_attr(&bp_attrs, &global_recv_bp);
+		if (ret) {
+			FI_WARN(&lnx_prov, FI_LOG_FABRIC,
+				"Failed to create receive buffer pool\n");
+			return NULL;
+		}
+		ofi_spin_init(&global_bplock);
+	}
+
+	return &lnx_prov;
+}
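Before the new lnx_ops.c below, a compact sketch of the bufpool-plus-spinlock pattern LNX_INI sets up and lnx_ops.c relies on. This uses the internal OFI utility API already referenced above; the pool sizing and the helper names here are placeholders, not part of the patch:

/* Sketch: serialized allocation from an ofi_bufpool, as lnx does for
 * rx entries (the pool itself is not thread safe, hence the lock). */
#include <stdint.h>
#include "ofi_mem.h"
#include "ofi_lock.h"

static struct ofi_bufpool *example_pool;
static ofi_spin_t example_lock;

static int example_pool_init(size_t entry_size)
{
	struct ofi_bufpool_attr attr = {
		.size		= entry_size,
		.alignment	= 8,
		.max_cnt	= UINT16_MAX,
		.chunk_cnt	= 64,
		.flags		= OFI_BUFPOOL_NO_TRACK,
	};

	ofi_spin_init(&example_lock);
	return ofi_bufpool_create_attr(&attr, &example_pool);
}

static void *example_pool_get(void)
{
	void *buf;

	ofi_spin_lock(&example_lock);
	buf = ofi_buf_alloc(example_pool);
	ofi_spin_unlock(&example_lock);
	return buf;
}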
diff --git a/prov/lnx/src/lnx_ops.c b/prov/lnx/src/lnx_ops.c
new file mode 100644
index 00000000000..2c6b725c0ac
--- /dev/null
+++ b/prov/lnx/src/lnx_ops.c
@@ -0,0 +1,1060 @@
+/*
+ * Copyright (c) 2022 ORNL. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+
+#include <rdma/fi_errno.h>
+#include "ofi_util.h"
+#include "ofi.h"
+#include "ofi_str.h"
+#include "ofi_prov.h"
+#include "ofi_perf.h"
+#include "ofi_hmem.h"
+#include "ofi_lock.h"
+#include "rdma/fi_ext.h"
+#include "ofi_iov.h"
+#include "lnx.h"
+
+int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match,
+		struct fi_peer_rx_entry **entry)
+{
+	return -FI_ENOSYS;
+}
+
+int lnx_queue_msg(struct fi_peer_rx_entry *entry)
+{
+	return -FI_ENOSYS;
+}
+
+void lnx_free_entry(struct fi_peer_rx_entry *entry)
+{
+	struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *) entry;
+	ofi_spin_t *bplock;
+
+	if (rx_entry->rx_global)
+		bplock = &global_bplock;
+	else
+		bplock = &rx_entry->rx_cep->lpe_bplock;
+
+	ofi_spin_lock(bplock);
+	ofi_buf_free(rx_entry);
+	ofi_spin_unlock(bplock);
+}
+
+static struct lnx_ep *lnx_get_lep(struct fid_ep *ep, struct lnx_ctx **ctx)
+{
+	struct lnx_ctx *lctx;
+	struct lnx_ep *lep;
+
+	if (ctx)
+		*ctx = NULL;
+
+	switch (ep->fid.fclass) {
+	case FI_CLASS_RX_CTX:
+	case FI_CLASS_TX_CTX:
+		lctx = container_of(ep, struct lnx_ctx, ctx_ep.fid);
+		if (ctx)
+			*ctx = lctx;
+		lep = lctx->ctx_parent;
+		break;
+	case FI_CLASS_EP:
+	case FI_CLASS_SEP:
+		lep = container_of(ep, struct lnx_ep, le_ep.ep_fid.fid);
+		break;
+	default:
+		lep = NULL;
+	}
+
+	return lep;
+}
+
+static struct fid_ep *lnx_get_core_ep(struct local_prov_ep *cep, int idx,
+				      size_t fclass)
+{
+	switch (fclass) {
+	case FI_CLASS_RX_CTX:
+		return cep->lpe_rxc[idx];
+	case FI_CLASS_TX_CTX:
+		return cep->lpe_txc[idx];
+	case FI_CLASS_EP:
+	case FI_CLASS_SEP:
+		return cep->lpe_ep;
+	default:
+		return NULL;
+	}
+}
+
+static void
+lnx_init_rx_entry(struct lnx_rx_entry *entry, struct iovec *iov, void **desc,
+		  size_t count, fi_addr_t addr, uint64_t tag,
+		  uint64_t ignore, void *context, uint64_t flags)
+{
+	if (iov)
+		memcpy(&entry->rx_iov, iov, sizeof(*iov) * count);
+	if (desc)
+		memcpy(entry->rx_desc, desc, sizeof(*desc) * count);
+
+	entry->rx_entry.iov = entry->rx_iov;
+	entry->rx_entry.desc = entry->rx_desc;
+	entry->rx_entry.count = count;
+	entry->rx_entry.addr = addr;
+	entry->rx_entry.context = context;
+	entry->rx_entry.tag = tag;
+	entry->rx_entry.flags = flags;
+	entry->rx_ignore = ignore;
+}
+
+static struct lnx_rx_entry *
+get_rx_entry(struct local_prov_ep *cep, struct iovec *iov, void **desc,
+	     size_t count, fi_addr_t addr, uint64_t tag,
+	     uint64_t ignore, void *context, uint64_t flags)
+{
+	struct lnx_rx_entry *rx_entry = NULL;
+	ofi_spin_t *bplock;
+	struct ofi_bufpool *bp;
+
+	/* if cep is NULL, then we don't know where the message is going to
+	 * come from, so allocate the rx_entry from a global pool
+	 */
+	if (!cep) {
+		bp = global_recv_bp;
+		bplock = &global_bplock;
+	} else {
+		bp = cep->lpe_recv_bp;
+		bplock = &cep->lpe_bplock;
+	}
+
+	ofi_spin_lock(bplock);
+	rx_entry = (struct lnx_rx_entry *)ofi_buf_alloc(bp);
+	ofi_spin_unlock(bplock);
+	if (rx_entry) {
+		memset(rx_entry, 0, sizeof(*rx_entry));
+		if (!cep)
+			rx_entry->rx_global = true;
+		rx_entry->rx_cep = cep;
+		lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag,
+				  ignore, context, flags);
+	}
+
+	return rx_entry;
+}
+
+static inline struct lnx_rx_entry *
+lnx_remove_first_match(struct lnx_queue *q, struct lnx_match_attr *match)
+{
+	struct lnx_rx_entry *rx_entry;
+
+	ofi_spin_lock(&q->lq_qlock);
+	rx_entry = (struct lnx_rx_entry *) dlist_remove_first_match(
+			&q->lq_queue, q->lq_match_func, match);
+	ofi_spin_unlock(&q->lq_qlock);
+
+	return rx_entry;
+}
+
+static inline void
+lnx_insert_rx_entry(struct lnx_queue *q, struct lnx_rx_entry *entry)
+{
+	ofi_spin_lock(&q->lq_qlock);
+	dlist_insert_tail((struct dlist_entry *)(&entry->rx_entry),
+			  &q->lq_queue);
+	ofi_spin_unlock(&q->lq_qlock);
+}
+
+int lnx_queue_tag(struct fi_peer_rx_entry *entry)
+{
+	struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *)entry;
+	struct lnx_peer_srq *lnx_srq = (struct lnx_peer_srq *)entry->owner_context;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "addr = %lx tag = %lx ignore = 0 queued\n",
+	       entry->addr, entry->tag);
+
+	lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_unexq, rx_entry);
+
+	return 0;
+}
+
+int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match,
+		struct fi_peer_rx_entry **entry)
+{
+	struct lnx_match_attr match_attr;
+	struct lnx_peer_srq *lnx_srq;
+	struct local_prov_ep *cep;
+	struct lnx_ep *lep;
+	struct lnx_rx_entry *rx_entry;
+	fi_addr_t addr = match->addr;
+	struct lnx_srx_context *srx_ctxt;
+	uint64_t tag = match->tag;
+	int rc = 0;
+
+	/* get the endpoint */
+	cep = container_of(srx, struct local_prov_ep, lpe_srx);
+	srx_ctxt = cep->lpe_srx.ep_fid.fid.context;
+	cep = srx_ctxt->srx_cep;
+	lep = srx_ctxt->srx_lep;
+	lnx_srq = &lep->le_srq;
+
+	/* The fi_addr_t is a generic address returned by the core provider.
+	 * It's usually just an index or id in its AV table, so we can see
+	 * duplicates here if multiple providers use the same scheme to
+	 * insert in their AV tables. We need to identify the core provider
+	 * in this function to correctly match this message to a possible
+	 * rx entry on our receive queue. That's why the core endpoint is
+	 * made part of the matching key.
+	 */
+	memset(&match_attr, 0, sizeof(match_attr));
+
+	match_attr.lm_addr = addr;
+	match_attr.lm_ignore = 0;
+	match_attr.lm_tag = tag;
+	match_attr.lm_cep = cep;
+
+	/* 1. Find a matching request to the message received.
+	 * 2. Return the receive request.
+	 * 3. If there are no matching requests, then create a new one
+	 *    and return it to the core provider. The core provider will turn
+	 *    around and tell us to queue it. Return -FI_ENOENT.
+	 */
+	rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_recvq,
+					  &match_attr);
+	if (rx_entry) {
+		FI_DBG(&lnx_prov, FI_LOG_CORE,
+		       "addr = %lx tag = %lx ignore = 0 found\n",
+		       addr, tag);
+
+		goto assign;
+	}
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "addr = %lx tag = %lx ignore = 0 not found\n",
+	       addr, tag);
+
+	rx_entry = get_rx_entry(cep, NULL, NULL, 0, addr, tag, 0, NULL,
+				lnx_ep_rx_flags(lep));
+	if (!rx_entry) {
+		rc = -FI_ENOMEM;
+		goto out;
+	}
+
+	rx_entry->rx_match_info = *match;
+	rx_entry->rx_entry.owner_context = lnx_srq;
+	rx_entry->rx_entry.msg_size = match->msg_size;
+
+	rc = -FI_ENOENT;
+
+assign:
+	rx_entry->rx_entry.msg_size = MIN(rx_entry->rx_entry.msg_size,
+					  match->msg_size);
+	*entry = &rx_entry->rx_entry;
+
+out:
+	return rc;
+}
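For orientation, a hypothetical sketch of how a core provider drives the owner-side callbacks implemented above; the function name core_rx_header() and its error handling are assumptions — only the owner_ops flow (get_tag, then queue_tag on -FI_ENOENT) follows the peer-SRX API the patch is built on:

/* Sketch: core provider side of the peer-SRX handshake */
#include <rdma/fi_errno.h>
#include <rdma/fi_ext.h>

static int core_rx_header(struct fid_peer_srx *srx,
			  struct fi_peer_match_attr *match)
{
	struct fi_peer_rx_entry *rx_entry;
	int rc;

	rc = srx->owner_ops->get_tag(srx, match, &rx_entry);
	if (rc == -FI_ENOENT) {
		/* no posted receive matched: hand the entry back so the
		 * owner (LNX) parks it on its unexpected queue */
		return srx->owner_ops->queue_tag(rx_entry);
	}
	if (rc)
		return rc;

	/* a posted receive matched: the core provider now delivers the
	 * payload into rx_entry->iov and completes the operation */
	return 0;
}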
+
+/*
+ * if lp is NULL, then we're attempting to receive from any peer so
+ * matching the tag is the only thing that matters.
+ *
+ * if lp != NULL, then we're attempting to receive from a particular
+ * peer. This peer can have multiple endpoints serviced by different core
+ * providers.
+ *
+ * Therefore when we check the unexpected queue, we need to check
+ * if we received any messages from any of the peer's addresses. If we
+ * find one, then we kick the core provider associated with that
+ * address to receive the message.
+ *
+ * If nothing is found on the unexpected queue, then add a receive
+ * request on the SRQ; this happens in lnx_process_recv() below.
+ */
+static int lnx_process_recv(struct lnx_ep *lep, struct iovec *iov, void **desc,
+		fi_addr_t addr, size_t count, struct lnx_peer *lp, uint64_t tag,
+		uint64_t ignore, void *context, uint64_t flags,
+		bool tagged)
+{
+	struct lnx_peer_srq *lnx_srq = &lep->le_srq;
+	struct local_prov_ep *cep;
+	struct lnx_rx_entry *rx_entry;
+	struct lnx_match_attr match_attr;
+	int rc = 0;
+
+	match_attr.lm_addr = addr;
+	match_attr.lm_ignore = ignore;
+	match_attr.lm_tag = tag;
+	match_attr.lm_cep = NULL;
+	match_attr.lm_peer = lp;
+
+	/* if support is turned off, don't go down the SRQ path */
+	if (!lep->le_domain->ld_srx_supported)
+		return -FI_ENOSYS;
+
+	rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_unexq,
+					  &match_attr);
+	if (!rx_entry) {
+		FI_DBG(&lnx_prov, FI_LOG_CORE,
+		       "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx not found\n",
+		       addr, tag, ignore, iov->iov_base, iov->iov_len);
+
+		goto nomatch;
+	}
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx found\n",
+	       addr, tag, ignore, iov->iov_base, iov->iov_len);
+
+	cep = rx_entry->rx_cep;
+
+	/* match is found in the unexpected queue. call into the core
+	 * provider to complete this message
+	 */
+	lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag, ignore,
+			  context, lnx_ep_rx_flags(lep));
+	rx_entry->rx_entry.msg_size = MIN(ofi_total_iov_len(iov, count),
+					  rx_entry->rx_entry.msg_size);
+	if (tagged)
+		rc = cep->lpe_srx.peer_ops->start_tag(&rx_entry->rx_entry);
+	else
+		rc = cep->lpe_srx.peer_ops->start_msg(&rx_entry->rx_entry);
+
+	if (rc == -FI_EINPROGRESS) {
+		/* the core provider is telling us that more messages can
+		 * match the same rx_entry, so keep it on the queue
+		 */
+		FI_DBG(&lnx_prov, FI_LOG_CORE,
+		       "addr = %lx tag = %lx ignore = %lx start_tag() in progress\n",
+		       addr, tag, ignore);
+
+		goto insert_recvq;
+	} else if (rc) {
+		FI_WARN(&lnx_prov, FI_LOG_CORE,
+			"start_tag/start_msg failed with %d\n", rc);
+	}
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "addr = %lx tag = %lx ignore = %lx start_tag() success\n",
+	       addr, tag, ignore);
+
+	return 0;
+
+nomatch:
+	/* nothing on the unexpected queue, so allocate a new entry and put
+	 * it on the receive queue
+	 */
+	rx_entry = get_rx_entry(NULL, iov, desc, count, addr, tag, ignore,
+				context, lnx_ep_rx_flags(lep));
+	if (!rx_entry) {
+		rc = -FI_ENOMEM;
+		goto out;
+	}
+	rx_entry->rx_entry.msg_size = ofi_total_iov_len(iov, count);
+	rx_entry->rx_peer = lp;
+
+insert_recvq:
+	lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_recvq, rx_entry);
+
+out:
+	return rc;
+}
+
+ssize_t lnx_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc,
+		fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct local_prov_ep *cep = NULL;
+	fi_addr_t core_addr = FI_ADDR_UNSPEC;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct iovec iov = {.iov_base = buf, .iov_len = len};
+	struct lnx_peer *lp;
+	struct ofi_mr_entry *mre = NULL;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lnx_get_core_desc(desc, &mem_desc);
+
+	/* addr is an index into the peer table.
+	 * This gets us to a peer. Each peer can be reachable on
+	 * multiple endpoints. Each endpoint has its own fi_addr_t which is
+	 * core provider specific.
+	 */
+	lp = lnx_av_lookup_addr(peer_tbl, src_addr);
+	if (lp) {
+		rc = lnx_select_recv_pathway(lp, lep->le_domain, desc, &cep,
+					     &core_addr, &iov, 1, &mre, &mem_desc);
+		if (rc)
+			goto out;
+	}
+
+	rc = lnx_process_recv(lep, &iov, &mem_desc, src_addr, 1, lp, tag, ignore,
+			      context, 0, true);
+	if (rc == -FI_ENOSYS)
+		goto do_recv;
+	else if (rc)
+		FI_WARN(&lnx_prov, FI_LOG_CORE, "lnx_process_recv failed with %d\n", rc);
+
+	goto out;
+
+do_recv:
+	if (lp)
+		rc = fi_trecv(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, ignore, context);
+
+out:
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+
+	return rc;
+}
+
+ssize_t lnx_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+		size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore,
+		void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct local_prov_ep *cep = NULL;
+	fi_addr_t core_addr = FI_ADDR_UNSPEC;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct lnx_peer *lp;
+	struct ofi_mr_entry *mre = NULL;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+	lnx_get_core_desc(*desc, &mem_desc);
+
+	lp = lnx_av_lookup_addr(peer_tbl, src_addr);
+	if (lp) {
+		rc = lnx_select_recv_pathway(lp, lep->le_domain, *desc, &cep,
+					     &core_addr, iov, count, &mre, &mem_desc);
+		if (rc)
+			goto out;
+	}
+
+	rc = lnx_process_recv(lep, (struct iovec *)iov, &mem_desc, src_addr,
+			      count, lp, tag, ignore, context, 0, true);
+	if (rc == -FI_ENOSYS)
+		goto do_recv;
+
+	goto out;
+
+do_recv:
+	if (lp)
+		rc = fi_trecvv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, ignore, context);
+
+out:
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+
+	return rc;
+}
+
+ssize_t lnx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg,
+		     uint64_t flags)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct local_prov_ep *cep = NULL;
+	fi_addr_t core_addr = FI_ADDR_UNSPEC;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct lnx_peer *lp;
+	struct fi_msg_tagged core_msg;
+	struct ofi_mr_entry *mre = NULL;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lnx_get_core_desc(msg->desc ? *msg->desc : NULL, &mem_desc);
+
+	lp = lnx_av_lookup_addr(peer_tbl, msg->addr);
+	if (lp) {
+		rc = lnx_select_recv_pathway(lp, lep->le_domain,
+					     msg->desc ? *msg->desc : NULL,
+					     &cep, &core_addr, msg->msg_iov,
+					     msg->iov_count, &mre, &mem_desc);
+		if (rc)
+			goto out;
+	}
+
+	rc = lnx_process_recv(lep, (struct iovec *)msg->msg_iov, &mem_desc,
+			      msg->addr, msg->iov_count, lp, msg->tag, msg->ignore,
+			      msg->context, flags, true);
+	if (rc == -FI_ENOSYS)
+		goto do_recv;
+
+	goto out;
+
+do_recv:
+	if (lp) {
+		memcpy(&core_msg, msg, sizeof(*msg));
+
+		core_msg.desc = mem_desc;
+		core_msg.addr = core_addr;
+
+		rc = fi_trecvmsg(cep->lpe_ep, &core_msg, flags);
+	}
+
+out:
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+
+	return rc;
+}
+
+ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc,
+		fi_addr_t dest_addr, uint64_t tag, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct lnx_peer *lp;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct ofi_mr_entry *mre = NULL;
+	struct iovec iov = {.iov_base = (void *) buf, .iov_len = len};
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep,
+				     &core_addr, &iov, 1, &mre, &mem_desc, NULL);
+	if (rc)
+		return rc;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "sending to %lx tag %lx buf %p len %ld\n",
+	       core_addr, tag, buf, len);
+
+	rc = fi_tsend(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, context);
+
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+
+	return rc;
+}
+
+ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+		size_t count, fi_addr_t dest_addr, uint64_t tag, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct lnx_peer *lp;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	struct ofi_mr_entry *mre = NULL;
+	void *mem_desc;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, (desc) ? *desc : NULL, &cep,
+				     &core_addr, iov, count, &mre, &mem_desc, NULL);
+	if (rc)
+		return rc;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "sending to %lx tag %lx\n", core_addr, tag);
+
+	rc = fi_tsendv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, context);
+
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+
+	return rc;
+}
+
+ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg,
+		     uint64_t flags)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct lnx_peer *lp;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	struct fi_msg_tagged core_msg;
+	struct ofi_mr_entry *mre = NULL;
+
+	lep = lnx_get_lep(ep, NULL);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, msg->addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain,
+				     (msg->desc) ?
*msg->desc : NULL, &cep, + &core_addr, msg->msg_iov, + msg->iov_count, &mre, &mem_desc, NULL); + if (rc) + return rc; + + memcpy(&core_msg, msg, sizeof(*msg)); + + core_msg.desc = mem_desc; + core_msg.addr = core_addr; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx\n", core_msg.addr, core_msg.tag); + + rc = fi_tsendmsg(cep->lpe_ep, &core_msg, flags); + + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct lnx_peer *lp; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, &mre, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinject(cep->lpe_ep, buf, len, core_addr, tag); + + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, + uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct lnx_peer *lp; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tsenddata(cep->lpe_ep, buf, len, mem_desc, + data, core_addr, tag, context); + + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct lnx_peer *lp; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, &mre, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinjectdata(cep->lpe_ep, buf, len, data, core_addr, tag); + + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +static inline ssize_t +lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) +{ + int rc; + struct lnx_ep *lep; + struct lnx_peer *lp; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov 
= {.iov_base = (void *)buf, .iov_len = len};
+
+	lep = lnx_get_lep(ep, &ctx);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, src_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep,
+				     &core_addr, &iov, 1, &mre, &mem_desc, &rkey);
+	if (rc)
+		goto out;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "rma read from %lx key %lx buf %p len %ld\n",
+	       core_addr, key, buf, len);
+
+	core_ep = lnx_get_core_ep(cep, ctx ? ctx->ctx_idx : 0, ep->fid.fclass);
+
+	rc = fi_read(core_ep, buf, len, mem_desc,
+		     core_addr, addr, key, context);
+
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+out:
+	return rc;
+}
+
+static inline ssize_t
+lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc,
+	      fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct lnx_peer *lp;
+	struct fid_ep *core_ep;
+	struct lnx_ctx *ctx;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	uint64_t rkey;
+	struct ofi_mr_entry *mre = NULL;
+	struct iovec iov = {.iov_base = (void *)buf, .iov_len = len};
+
+	lep = lnx_get_lep(ep, &ctx);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep,
+				     &core_addr, &iov, 1, &mre, &mem_desc, &rkey);
+	if (rc)
+		goto out;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "rma write to %lx key %lx buf %p len %ld\n",
+	       core_addr, key, buf, len);
+
+	core_ep = lnx_get_core_ep(cep, ctx ? ctx->ctx_idx : 0, ep->fid.fclass);
+
+	rc = fi_write(core_ep, buf, len, mem_desc,
+		      core_addr, addr, key, context);
+
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+out:
+	return rc;
+}
+
+static inline ssize_t
+lnx_atomic_write(struct fid_ep *ep,
+		 const void *buf, size_t count, void *desc,
+		 fi_addr_t dest_addr,
+		 uint64_t addr, uint64_t key,
+		 enum fi_datatype datatype, enum fi_op op, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct lnx_peer *lp;
+	struct fid_ep *core_ep;
+	struct lnx_ctx *ctx;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	uint64_t rkey;
+	struct ofi_mr_entry *mre = NULL;
+	struct iovec iov = {.iov_base = (void *)buf, .iov_len = count};
+
+	lep = lnx_get_lep(ep, &ctx);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep,
+				     &core_addr, &iov, 1, &mre, &mem_desc, &rkey);
+	if (rc)
+		goto out;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "sending to %lx\n", core_addr);
+
+	core_ep = lnx_get_core_ep(cep, ctx ? ctx->ctx_idx : 0, ep->fid.fclass);
+
+	rc = fi_atomic(core_ep, buf, count, mem_desc,
+		       core_addr, addr, key, datatype, op, context);
+
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+out:
+	return rc;
+}
+
+static inline ssize_t
+lnx_atomic_readwrite(struct fid_ep *ep,
+		     const void *buf, size_t count, void *desc,
+		     void *result, void *result_desc,
+		     fi_addr_t dest_addr,
+		     uint64_t addr, uint64_t key,
+		     enum fi_datatype datatype, enum fi_op op, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct lnx_peer *lp;
+	struct fid_ep *core_ep;
+	struct lnx_ctx *ctx;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	uint64_t rkey;
+	struct ofi_mr_entry *mre = NULL;
+	struct iovec iov = {.iov_base = (void *)buf, .iov_len = count};
+
+	lep = lnx_get_lep(ep, &ctx);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, result_desc,
+				     &cep, &core_addr, &iov, 1,
+				     &mre, &mem_desc, &rkey);
+	if (rc)
+		goto out;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "sending to %lx\n", core_addr);
+
+	core_ep = lnx_get_core_ep(cep, ctx ? ctx->ctx_idx : 0, ep->fid.fclass);
+
+	rc = fi_fetch_atomic(core_ep, buf, count, desc,
+			     result, mem_desc, core_addr, addr, key,
+			     datatype, op, context);
+
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+out:
+	return rc;
+}
+
+static inline ssize_t
+lnx_atomic_compwrite(struct fid_ep *ep,
+		     const void *buf, size_t count, void *desc,
+		     const void *compare, void *compare_desc,
+		     void *result, void *result_desc,
+		     fi_addr_t dest_addr,
+		     uint64_t addr, uint64_t key,
+		     enum fi_datatype datatype, enum fi_op op, void *context)
+{
+	int rc;
+	struct lnx_ep *lep;
+	struct lnx_peer *lp;
+	struct fid_ep *core_ep;
+	struct lnx_ctx *ctx;
+	struct local_prov_ep *cep;
+	fi_addr_t core_addr;
+	struct lnx_peer_table *peer_tbl;
+	void *mem_desc;
+	uint64_t rkey;
+	struct ofi_mr_entry *mre = NULL;
+	struct iovec iov = {.iov_base = (void *)buf, .iov_len = count};
+
+	lep = lnx_get_lep(ep, &ctx);
+	if (!lep)
+		return -FI_ENOSYS;
+
+	peer_tbl = lep->le_peer_tbl;
+
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, result_desc, &cep,
+				     &core_addr, &iov, 1,
+				     &mre, &mem_desc, &rkey);
+	if (rc)
+		goto out;
+
+	FI_DBG(&lnx_prov, FI_LOG_CORE,
+	       "sending to %lx\n", core_addr);
+
+	core_ep = lnx_get_core_ep(cep, ctx ? ctx->ctx_idx : 0, ep->fid.fclass);
+
+	rc = fi_compare_atomic(core_ep, buf, count, desc,
+			       compare, compare_desc, result, mem_desc,
+			       core_addr, addr, key, datatype, op, context);
+
+	if (mre)
+		ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre);
+
+out:
+	return rc;
+}
+
+struct fi_ops_tagged lnx_tagged_ops = {
+	.size = sizeof(struct fi_ops_tagged),
+	.recv = lnx_trecv,
+	.recvv = lnx_trecvv,
+	.recvmsg = lnx_trecvmsg,
+	.send = lnx_tsend,
+	.sendv = lnx_tsendv,
+	.sendmsg = lnx_tsendmsg,
+	.inject = lnx_tinject,
+	.senddata = lnx_tsenddata,
+	.injectdata = lnx_tinjectdata,
+};
+
+struct fi_ops_msg lnx_msg_ops = {
+	.size = sizeof(struct fi_ops_msg),
+	.recv = fi_no_msg_recv,
+	.recvv = fi_no_msg_recvv,
+	.recvmsg = fi_no_msg_recvmsg,
+	.send = fi_no_msg_send,
+	.sendv = fi_no_msg_sendv,
+	.sendmsg = fi_no_msg_sendmsg,
+	.inject = fi_no_msg_inject,
+	.senddata = fi_no_msg_senddata,
+	.injectdata = fi_no_msg_injectdata,
+};
+
+struct fi_ops_rma lnx_rma_ops = {
+	.size = sizeof(struct fi_ops_rma),
+	.read = lnx_rma_read,
+	.readv = fi_no_rma_readv,
+	.readmsg = fi_no_rma_readmsg,
+	.write = lnx_rma_write,
+	.writev = fi_no_rma_writev,
+	.writemsg = fi_no_rma_writemsg,
+	.inject = fi_no_rma_inject,
+	.writedata = fi_no_rma_writedata,
+	.injectdata = fi_no_rma_injectdata,
+};
+
+struct fi_ops_atomic lnx_atomic_ops = {
+	.size = sizeof(struct fi_ops_atomic),
+	.write = lnx_atomic_write,
+	.writev = fi_no_atomic_writev,
+	.writemsg = fi_no_atomic_writemsg,
+	.inject = fi_no_atomic_inject,
+	.readwrite = lnx_atomic_readwrite,
+	.readwritev = fi_no_atomic_readwritev,
+	.readwritemsg = fi_no_atomic_readwritemsg,
+	.compwrite = lnx_atomic_compwrite,
+	.compwritev = fi_no_atomic_compwritev,
+	.compwritemsg = fi_no_atomic_compwritemsg,
+	.writevalid = fi_no_atomic_writevalid,
+	.readwritevalid = fi_no_atomic_readwritevalid,
+	.compwritevalid =
fi_no_atomic_compwritevalid, +}; + + diff --git a/prov/lpp/configure.m4 b/prov/lpp/configure.m4 index 7c447a16cfa..54f864454f7 100644 --- a/prov/lpp/configure.m4 +++ b/prov/lpp/configure.m4 @@ -11,8 +11,11 @@ AC_DEFUN([FI_LPP_CONFIGURE],[ lpp_happy=0 have_lpp_thread_safe=1 + AC_CHECK_DECL([HAVE_ATOMICS], [atomics_happy=1], [atomics_happy=0]) + AS_IF([test "x$macos" = "x1"],[lpp_happy=0], [test x$host_cpu != xx86_64],[lpp_happy=0], + [test x$atomics_happy == "x0"],[lpp_happy=0], [test x"$enable_lpp" != x"no"],[ lpp_happy=1 AC_SUBST(lpp_INCLUDES) diff --git a/prov/opx/configure.m4 b/prov/opx/configure.m4 index 952c7553420..a678e602e72 100644 --- a/prov/opx/configure.m4 +++ b/prov/opx/configure.m4 @@ -90,7 +90,7 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ AS_CASE([x$OPX_RELIABILITY], [xnone], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_NONE], [xoffload], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_OFFLOAD], - dnl [xruntime], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_RUNTIME], + dnl [xruntime], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_ONLOAD], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_ONLOAD]) AC_SUBST(opx_reliability, [$OPX_RELIABILITY]) @@ -141,7 +141,7 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ opx_happy=0 ]) ]) - AS_IF([test $opx_happy -eq 1 && test $have_cuda -eq 1],[ + AS_IF([test $opx_happy -eq 1 && (test $have_cuda -eq 1 || test $have_rocr -eq 1)], [ save_CPPFLAGS=$CPPFLAGS CPPFLAGS="-I/usr/include/uapi" AC_COMPILE_IFELSE([AC_LANG_PROGRAM( @@ -154,6 +154,10 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ AC_MSG_NOTICE([hfi1_user.h struct sdma_req_meminfo defined... no]) opx_happy=0 ]) + OPX_PRODUCTION_BUILD_OVERRIDE=${OPX_PRODUCTION_BUILD_OVERRIDE:-""} + AS_IF([test "x$OPX_PRODUCTION_BUILD_OVERRIDE" != "x"], [ + AC_MSG_NOTICE([OPX_PRODUCTION_BUILD_OVERRIDE is set to $OPX_PRODUCTION_BUILD_OVERRIDE]) + ]) CPPFLAGS=$save_CPPFLAGS opx_hfi_version=$(/sbin/modinfo hfi1 -F version) opx_hfi_version_sorted=$(echo -e "10.14.0.0\n$opx_hfi_version" | sort -V | tail -n 1) @@ -164,15 +168,15 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ test $opx_hfi_version != $opx_hfi_version_sorted],[ opx_hfi_dev_override=$(echo $CPPFLAGS | grep -w "DOPX_DEV_OVERRIDE") - AS_IF([test "x$opx_hfi_dev_override" != "x"],[ - AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... no, overridden]) + AS_IF([test "x$opx_hfi_dev_override" != "x" -o "x$OPX_PRODUCTION_BUILD_OVERRIDE" != "x"],[ + AC_MSG_NOTICE([hfi1 driver version is GPU-compatible... no, overridden]) ],[ - AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... no]) + AC_MSG_NOTICE([hfi1 driver version is GPU-compatible... no]) opx_happy=0 ]) ], - [AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... yes]) + [AC_MSG_NOTICE([hfi1 driver version is GPU-compatible... yes]) ]) AS_IF([test $opx_happy -eq 1],[ AC_MSG_NOTICE([Appending OPX_HMEM to opx_CPPFLAGS]) diff --git a/prov/opx/include/opa_byteorder.h b/prov/opx/include/opa_byteorder.h index 43bf33cf73b..f943ccc96d2 100644 --- a/prov/opx/include/opa_byteorder.h +++ b/prov/opx/include/opa_byteorder.h @@ -6,7 +6,7 @@ GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021 Cornelis Networks. + Copyright(c) 2021,2024 Cornelis Networks. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -20,7 +20,7 @@ BSD LICENSE Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021 Cornelis Networks. + Copyright(c) 2021,2024 Cornelis Networks. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -51,7 +51,7 @@ */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ -/* Copyright (C) 2021-2024 by Cornelis Networks. */ +/* Copyright (C) 2021,2024 by Cornelis Networks. */ #ifndef OPA_BYTEORDER_H #define OPA_BYTEORDER_H @@ -70,6 +70,8 @@ extern "C" { typedef __u16 __le16; typedef __u16 __be16; +typedef __u32 __le24; /* for readability. Only use 24 of 32 bits */ +typedef __u32 __be24; /* for readability. Only use 24 of 32 bits */ typedef __u32 __le32; typedef __u32 __be32; typedef __u64 __le64; @@ -77,6 +79,8 @@ typedef __u64 __be64; static __inline__ __u16 __hfi_fswab16(__u16) __attribute__ ((always_inline)); +static __inline__ __u32 __hfi_fswab24(__u32) + __attribute__ ((always_inline)); static __inline__ __u32 __hfi_fswab32(__u32) __attribute__ ((always_inline)); static __inline__ __u64 __hfi_fswab64(__u64) @@ -85,7 +89,15 @@ static __inline__ __u64 __hfi_fswab64(__u64) static __inline__ __u16 __hfi_fswab16(__u16 x) { return ((x & (__u16) 0x00ffU) << 8) | ((x & (__u16) 0xff00U) >> 8); -} static __inline__ __u32 __hfi_fswab32(__u32 x) { +} + +static __inline__ __u32 __hfi_fswab24(__u32 x) { + return ((x & (__u32) 0x000000ffUL) << 16) | + ((x & (__u32) 0x0000ff00UL) << 0) | + ((x & (__u32) 0x00ff0000UL) >> 16); +} + +static __inline__ __u32 __hfi_fswab32(__u32 x) { return ((x & (__u32) 0x000000ffUL) << 24) | ((x & (__u32) 0x0000ff00UL) << 8) | ((x & (__u32) 0x00ff0000UL) >> 8) @@ -105,6 +117,8 @@ static __inline__ __u64 __hfi_fswab64(__u64 x) { static __inline__ __u16 __cpu_to_le16(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_le24(__le24) + __attribute__ ((always_inline)); static __inline__ __u32 __cpu_to_le32(__le32) __attribute__ ((always_inline)); static __inline__ __u64 __cpu_to_le64(__le64) @@ -112,6 +126,8 @@ static __inline__ __u64 __cpu_to_le64(__le64) static __inline__ __u16 __le16_to_cpu(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __le24_to_cpu(__le24) + __attribute__ ((always_inline)); static __inline__ __u32 __le32_to_cpu(__le32) __attribute__ ((always_inline)); static __inline__ __u64 __le64_to_cpu(__le64) @@ -119,6 +135,8 @@ static __inline__ __u64 __le64_to_cpu(__le64) static __inline__ __u16 __cpu_to_be16(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_be24(__be24) + __attribute__ ((always_inline)); static __inline__ __u32 __cpu_to_be32(__be32) __attribute__ ((always_inline)); static __inline__ __u64 __cpu_to_be64(__be64) @@ -126,6 +144,8 @@ static __inline__ __u64 __cpu_to_be64(__be64) static __inline__ __u16 __be16_to_cpu(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __be24_to_cpu(__be24) + __attribute__ ((always_inline)); static __inline__ __u32 __be32_to_cpu(__be32) __attribute__ ((always_inline)); static __inline__ __u64 __be64_to_cpu(__be64) @@ -140,6 +160,10 @@ static __inline__ __le16 __cpu_to_le16(__u16 x) { return x; } +static __inline__ __le24 __cpu_to_le24(__u32 x) { + return x; +} + static __inline__ __le32 __cpu_to_le32(__u32 x) { return x; } @@ -155,6 +179,10 @@ static __inline__ __u16 __le16_to_cpu(__le16 x) { return x; } +static __inline__ __u32 __le24_to_cpu(__le24 x) { + return x; +} + static __inline__ __u32 __le32_to_cpu(__le32 x) { return x; } @@ -170,6 +198,10 @@ static __inline__ __be16 __cpu_to_be16(__u16 x) { return __hfi_fswab16(x); } +static __inline__ __be24 __cpu_to_be24(__u32 x) { + return 
__hfi_fswab24(x);
+}
+
 static __inline__ __be32 __cpu_to_be32(__u32 x) {
 	return __hfi_fswab32(x);
 }
@@ -185,6 +217,10 @@ static __inline__ __u16 __be16_to_cpu(__be16 x) {
 	return __hfi_fswab16(x);
 }
 
+static __inline__ __u32 __be24_to_cpu(__be24 x) {
+	return __hfi_fswab24(x);
+}
+
 static __inline__ __u32 __be32_to_cpu(__be32 x) {
 	return __hfi_fswab32(x);
 }
@@ -202,6 +238,10 @@ static __inline__ __le16 __cpu_to_le16(__u16 x) {
 	return __hfi_fswab16(x);
 }
 
+static __inline__ __le24 __cpu_to_le24(__u32 x) {
+	return __hfi_fswab24(x);
+}
+
 static __inline__ __le32 __cpu_to_le32(__u32 x) {
 	return __hfi_fswab32(x);
 }
@@ -217,6 +257,10 @@ static __inline__ __u16 __le16_to_cpu(__le16 x) {
 	return __hfi_fswab16(x);
 }
 
+static __inline__ __u32 __le24_to_cpu(__le24 x) {
+	return __hfi_fswab24(x);
+}
+
 static __inline__ __u32 __le32_to_cpu(__le32 x) {
 	return __hfi_fswab32(x);
 }
@@ -232,6 +276,10 @@ static __inline__ __be16 __cpu_to_be16(__u16 x) {
 	return x;
 }
 
+static __inline__ __be24 __cpu_to_be24(__u32 x) {
+	return x;
+}
+
 static __inline__ __be32 __cpu_to_be32(__u32 x) {
 	return x;
 }
@@ -247,6 +295,10 @@ static __inline__ __u16 __be16_to_cpu(__be16 x) {
 	return x;
 }
 
+static __inline__ __u32 __be24_to_cpu(__be24 x) {
+	return x;
+}
+
 static __inline__ __u32 __be32_to_cpu(__be32 x) {
 	return x;
 }
@@ -262,4 +314,9 @@ static __inline__ __u64 __be64_to_cpu(__be64 x) {
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
+
+static __inline__ __be24 __le24_to_be24(__le24 x) { return __hfi_fswab24((__u32)x); }
+static __inline__ __le24 __be24_to_le24(__be24 x) { return __hfi_fswab24((__u32)x); }
+
+
 #endif /* OPA_BYTEORDER_H */
diff --git a/prov/opx/include/opa_user_gen1.h b/prov/opx/include/opa_user_gen1.h
index d09f015f866..ed1ff675eb4 100644
--- a/prov/opx/include/opa_user_gen1.h
+++ b/prov/opx/include/opa_user_gen1.h
@@ -83,6 +83,7 @@
 #include "opa_udebug.h"
 #include "opa_service.h"
 #include "opa_user.h"
+#include "ofi_mem.h"
 
 #define HFI_RHF_USE_EGRBFR_MASK 0x1
 #define HFI_RHF_USE_EGRBFR_SHIFT 15
@@ -329,7 +330,10 @@ int opx_hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits);
 int opx_hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type);
 
 /* reset halted send context, error if context is not halted. */
-int opx_hfi_reset_context(struct _hfi_ctrl *ctrl);
+int opx_hfi_reset_context(int fd);
+
+/* ack hfi events */
+int opx_hfi_ack_events(int fd, uint64_t ackbits);
 
 /*
  * Safe version of opx_hfi_[d/q]wordcpy that is guaranteed to only copy each byte once.
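A quick sanity sketch of the new 24-bit swab added above (not part of the patch; it assumes the opa_byteorder.h header is on the include path). The swap keeps the value in the low three bytes of a __u32, and applying it twice is the identity:

/* Sketch: __hfi_fswab24 round-trip check */
#include <assert.h>
#include "opa_byteorder.h"

static void check_fswab24(void)
{
	/* 0x00123456 -> 0x00563412: low and high bytes exchange,
	 * the middle byte stays in place */
	assert(__hfi_fswab24(0x00123456UL) == 0x00563412UL);

	/* swapping twice restores the original value */
	assert(__hfi_fswab24(__hfi_fswab24(0x00ABCDEFUL)) == 0x00ABCDEFUL);
}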
@@ -567,7 +571,7 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, #endif FI_DBG(&fi_opx_provider, FI_LOG_MR, "OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %lu)\n", - (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / PAGE_SIZE); + (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE]); cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; @@ -586,9 +590,9 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, FI_WARN(&fi_opx_provider, FI_LOG_MR, "PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %lu), OUTPUTS vaddr [%p - %p] length %u (pages %lu), tidcnt %u\n", errno, strerror(errno), (void*)vaddr, - (void*)(vaddr + *length), *length, (*length) / PAGE_SIZE, + (void*)(vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE], (void*)rettidinfo->vaddr,(void*)(rettidinfo->vaddr + rettidinfo->length), - rettidinfo->length, rettidinfo->length / PAGE_SIZE, + rettidinfo->length, rettidinfo->length / page_sizes[OFI_PAGE_SIZE], rettidinfo->tidcnt); } /* Always update outputs, even on soft errors */ @@ -598,7 +602,7 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, FI_DBG(&fi_opx_provider, FI_LOG_MR, "TID UPDATE IOCTL returned %d errno %d \"%s\" vaddr [%p - %p] length %u (pages %lu), tidcnt %u\n", err, errno, strerror(errno), (void*)vaddr, - (void*)(vaddr + *length), *length, (*length) / PAGE_SIZE, *tidcnt); + (void*)(vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE], *tidcnt); return 0; } @@ -606,13 +610,13 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, if (errno == ENOSPC) { FI_DBG(&fi_opx_provider, FI_LOG_MR, "IOCTL FAILED : No TIDs available, requested range=%p-%p (%u bytes, %lu pages)\n", - (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / PAGE_SIZE); + (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE]); err = -FI_ENOSPC; } else { FI_WARN(&fi_opx_provider, FI_LOG_MR, "IOCTL FAILED ERR %d errno %d \"%s\" requested range=%p-%p (%u bytes, %lu pages)\n", err, errno, strerror(errno), - (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / PAGE_SIZE); + (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE]); } /* Hard error, we can't trust these */ diff --git a/prov/opx/include/rdma/fi_direct_atomic.h b/prov/opx/include/rdma/fi_direct_atomic.h index 65487254fd1..61ca69d7bd7 100644 --- a/prov/opx/include/rdma/fi_direct_atomic.h +++ b/prov/opx/include/rdma/fi_direct_atomic.h @@ -48,7 +48,7 @@ extern "C" { #define fi_inject_atomic(ep, buf, count, dest_addr, addr, key, \ datatype, op) \ - (fi_opx_inject_atomic_FABRIC_DIRECT(ep, buf, count, dest_addr,\ + (fi_opx_inject_atomic_FABRIC_DIRECT(ep, buf, count, dest_addr, \ addr, key, datatype, op)) #define fi_fetch_atomic(ep, buf, count, desc, result, result_desc, \ diff --git a/prov/opx/include/rdma/opx/fi_opx.h b/prov/opx/include/rdma/opx/fi_opx.h index 1b344d23ad9..306e1f87ba4 100644 --- a/prov/opx/include/rdma/opx/fi_opx.h +++ b/prov/opx/include/rdma/opx/fi_opx.h @@ -69,7 +69,8 @@ #define FI_OPX_DOMAIN_NAME "ib0" #define FI_OPX_DOMAIN_NAME_PREFIX "ib" -#define FI_OPX_CACHE_LINE_SIZE (64) +#define FI_OPX_CACHE_LINE_SIZE (64) +#define FI_OPX_CACHE_LINE_QWS (FI_OPX_CACHE_LINE_SIZE/sizeof(uint64_t)) #define FI_OPX_MAX_STRLEN (64) @@ -101,12 +102,40 @@ struct fi_opx_daos_hfi_rank { UT_hash_handle hh; /* makes this structure hashable */ }; +/* hfi1 type for bit logic */ enum opx_hfi1_type { 
OPX_HFI1_UNDEF = 0, // undefined - OPX_HFI1_WFR = 4, // Omni-path (all generations) - OPX_HFI1_JKR = 5 // CN5000 (initial generation) + OPX_HFI1_JKR_9B = 1, // CN5000 built for mixed network. Internal use + OPX_HFI1_WFR = 2, // Omni-path (all generations) + OPX_HFI1_JKR = 4 // CN5000 (initial generation) }; +/* Will remove after 16B SDMA support is finished */ +#define OPX_NO_9B_SUPPORT(_hfi1_type) \ +do { \ + if(!(_hfi1_type & OPX_HFI1_JKR)) { \ + fprintf(stderr, "%s NO JKR 9B SUPPORT for %u %s\n", __func__,\ + _hfi1_type, \ + _hfi1_type & OPX_HFI1_WFR ? "OPX_HFI1_WFR" : \ + _hfi1_type & OPX_HFI1_JKR_9B ? "OPX_HFI1_JKR_9B" : \ + "UNKNOWN" ); \ + if(getenv("OPX_9B_ABORT")) abort(); \ + } \ + assert(_hfi1_type != OPX_HFI1_UNDEF); \ +} while(0) + + +#define OPX_NO_16B_SUPPORT(_hfi1_type) \ +do { \ + if(!(_hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B))) { \ + fprintf(stderr, "%s NO 16B SUPPORT for %u %s\n", __func__, \ + _hfi1_type, _hfi1_type & OPX_HFI1_JKR ? "OPX_HFI1_JKR" : \ + "UNKNOWN" ); \ + if(getenv("OPX_16B_ABORT")) abort(); \ + } \ + assert(_hfi1_type != OPX_HFI1_UNDEF); \ +} while(0) + struct fi_opx_hfi_local_info { struct fi_opx_hfi_local_lookup *hfi_local_lookup_hashmap; enum opx_hfi1_type type; @@ -125,19 +154,13 @@ struct fi_opx_hfi_local_info { #undef OPX_SIM_ENABLED #endif -/* Build constant for JKR/WFR path optimization */ -#if (defined(OPX_WFR) && defined(OPX_JKR)) -/* Both JKR and WFR runtime support (not constant) */ #define OPX_HFI1_TYPE fi_opx_global.hfi_local_info.type -#elif defined(OPX_WFR) -#define OPX_HFI1_TYPE OPX_HFI1_WFR -#elif defined(OPX_JKR) -#define OPX_HFI1_TYPE OPX_HFI1_JKR -#else -/* Currently default to WFR (only) */ -#define OPX_WFR -#define OPX_HFI1_TYPE OPX_HFI1_WFR -#endif + + +/* Default is both JKR and WFR runtime support (no constant), + use a local or global variable */ + +#define OPX_PRE_CN5000 1 struct fi_opx_hfi_local_lookup_key { uint16_t lid; @@ -228,7 +251,7 @@ static const uint64_t FI_OPX_HDRQ_MASK_8192 = 0X000000000003FFE0UL; (FI_OPX_BASE_CAPS | FI_OPX_RXONLY_CAPS) #define FI_OPX_DEFAULT_MODE \ - (FI_CONTEXT2 | FI_ASYNC_IOV) + (FI_ASYNC_IOV) diff --git a/prov/opx/include/rdma/opx/fi_opx_addr.h b/prov/opx/include/rdma/opx/fi_opx_addr.h index cdc4f75a450..5527154490c 100644 --- a/prov/opx/include/rdma/opx/fi_opx_addr.h +++ b/prov/opx/include/rdma/opx/fi_opx_addr.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -47,8 +47,12 @@ union fi_opx_uid { fi_opx_uid_t fi; struct { - uint16_t endpoint_id;/* node-scoped endpoint identifier */ - uint16_t lid; /* fabric-scoped node identifier (big-endian) */ + uint8_t endpoint_id;/* node-scoped endpoint identifier */ + uint8_t lid_3B; /* fabric-scoped node identifier (3rd byte) */ + uint16_t lid; /* fabric-scoped node identifier (big-endian) */ + } __attribute__((__packed__)); + struct { + uint32_t lid_32; /* fabric-scoped node identifier (big-endian) */ } __attribute__((__packed__)); } __attribute__((__packed__)); diff --git a/prov/opx/include/rdma/opx/fi_opx_atomic.h b/prov/opx/include/rdma/opx/fi_opx_atomic.h index f66a83a0d84..1fb209507f7 100644 --- a/prov/opx/include/rdma/opx/fi_opx_atomic.h +++ b/prov/opx/include/rdma/opx/fi_opx_atomic.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. 
- * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -42,35 +42,35 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) \ - static inline ssize_t fi_opx_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + static inline ssize_t fi_opx_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, void *desc, fi_addr_t dst_addr, \ uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, \ void *context) \ { \ return fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_inject_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_inject_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dst_addr, \ uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op) \ { \ return fi_opx_inject_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, \ - op, LOCK, AV, CAPS, RELIABILITY); \ + op, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_fetch_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_fetch_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, \ void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, \ enum fi_datatype datatype, enum fi_op op, void *context) \ { \ return fi_opx_fetch_atomic_generic(ep, buf, count, desc, result, result_desc, \ dest_addr, addr, key, datatype, op, context, \ - LOCK, AV, CAPS, RELIABILITY); \ + LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_compare_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_compare_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, void *desc, const void *compare, \ void *compare_desc, void *result, void *result_desc, fi_addr_t dest_addr, \ uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, \ @@ -79,14 +79,14 @@ return fi_opx_compare_atomic_generic(ep, buf, count, desc, compare, compare_desc, \ result, result_desc, dest_addr, addr, key, \ datatype, op, context, LOCK, AV, CAPS, \ - RELIABILITY); \ + RELIABILITY, HFI1_TYPE); \ } -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - 
fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE #ifdef __cplusplus extern "C" { @@ -121,7 +121,8 @@ ssize_t fi_opx_fetch_atomic_generic(struct fid_ep *ep, const void *buf, size_t c uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, @@ -129,13 +130,15 @@ ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); #ifdef __cplusplus } diff --git a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h index ca55f96fb02..66388eb8d8a 100644 --- a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h +++ b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -46,7 +46,10 @@ /* Number of types in enum fi_cq_format */ #define FI_CQ_FORMAT_COUNT 5 -typedef struct fi_ops_cq op_matrix_t[FI_CQ_FORMAT_COUNT][OFI_RELIABILITY_KIND_COUNT][FI_OPX_COMMS_COUNT]; +/* Number of types in enum opx_hfi1_type */ +#define OPX_HFI1_TYPE_COUNT 3 + +typedef struct fi_ops_cq op_matrix_t[FI_CQ_FORMAT_COUNT][1 /* OFI_RELIABILITY_KIND_ONLOAD */][FI_OPX_COMMS_COUNT][OPX_HFI1_TYPE_COUNT]; static ssize_t fi_opx_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, uint64_t flags) @@ -57,26 +60,23 @@ fi_opx_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, uint64_t flags if (IS_PROGRESS_MANUAL(opx_cq->domain)) { - struct fi_opx_context_ext * ext = - (struct fi_opx_context_ext *) opx_cq->err.head; + struct opx_context *context = + (struct opx_context *) opx_cq->err.head; - if ((ext == NULL) || (ext->opx_context.byte_counter != 0)) { + if ((context == NULL) || (context->byte_counter != 0)) { /* perhaps an in-progress truncated rendezvous receive? 
*/ errno = FI_EAGAIN; return -errno; } - assert(ext->opx_context.flags & FI_OPX_CQ_CONTEXT_EXT); /* DEBUG */ - const enum fi_threading threading = opx_cq->domain->threading; const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); fi_opx_lock_if_required(&opx_cq->lock, lock_required); ofi_cq_err_memcpy(opx_cq->domain->fabric->fabric_fid.api_version, - buf, &ext->err_entry); + buf, &context->err_entry); slist_remove_head((struct slist *)&opx_cq->err); - OPX_BUF_FREE(ext); - ext = NULL; + OPX_BUF_FREE(context); fi_opx_unlock_if_required(&opx_cq->lock, lock_required); } else { @@ -118,14 +118,14 @@ fi_opx_cq_strerror(struct fid_cq *cq, int prov_errno, const void *err_data, return NULL; } -#define FI_OPX_CQ_OPS_STRUCT_NAME(FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ - fi_opx_ops_cq_ ## FORMAT ## _ ## LOCK ## _ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ +#define FI_OPX_CQ_OPS_STRUCT_NAME(FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + fi_opx_ops_cq_ ## FORMAT ## _ ## LOCK ## _ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ -#define FI_OPX_CQ_OPS_STRUCT_INIT(FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ +#define FI_OPX_CQ_OPS_STRUCT_INIT(FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ { \ .size = sizeof(struct fi_ops_cq), \ - .read = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, FORMAT, LOCK, RELIABILITY, MASK, CAPS), \ - .readfrom = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, FORMAT, LOCK, RELIABILITY, MASK, CAPS), \ + .read = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE), \ + .readfrom = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE), \ .readerr = fi_opx_cq_readerr, \ .sread = fi_opx_cq_sread, \ .sreadfrom = fi_opx_cq_sreadfrom, \ @@ -133,9 +133,4 @@ fi_opx_cq_strerror(struct fid_cq *cq, int prov_errno, const void *err_data, .strerror = fi_opx_cq_strerror, \ } -#define FI_OPX_CQ_OPS_STRUCT(FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ -static struct fi_ops_cq \ - FI_OPX_CQ_OPS_STRUCT_NAME(FORMAT, LOCK, RELIABILITY, MASK, CAPS) = \ - FI_OPX_CQ_OPS_STRUCT_INIT(FORMAT, LOCK, RELIABILITY, MASK, CAPS) - #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index df436159f77..bab44eb0631 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -66,17 +66,12 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); #define OPX_FLAGS_OVERRIDE_TRUE (1) #define OPX_FLAGS_OVERRIDE_FALSE (0) -#define OPX_CONTEXT_EXTENDED_TRUE (1) -#define OPX_CONTEXT_EXTENDED_FALSE (0) - #define OPX_MULTI_RECV_TRUE (1) #define OPX_MULTI_RECV_FALSE (0) #define OPX_HMEM_TRUE (1) #define OPX_HMEM_FALSE (0) -#define OPX_CANCEL_CONTEXT_TRUE (1) -#define OPX_CANCEL_CONTEXT_FALSE (0) // #define FI_OPX_TRACE 1 // #define FI_OPX_REMOTE_COMPLETION @@ -99,12 +94,12 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_MSG_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY) \ - FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) +#define FI_OPX_MSG_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) -#define FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) \ +#define FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ static inline ssize_t \ - 
fi_opx_send_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_send_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, fi_addr_t dest_addr, void *context) \ { \ @@ -116,19 +111,20 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); 0, /* override_flags */ \ 0, /* flags */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_recv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_recv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, void *buf, size_t len, \ void *desc, fi_addr_t src_addr, void *context) \ { \ return fi_opx_recv_generic(ep, buf, len, desc, \ src_addr, 0, (uint64_t)-1, context, \ - LOCK, AV, FI_MSG, RELIABILITY); \ + LOCK, AV, FI_MSG, RELIABILITY, HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_inject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_inject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ fi_addr_t dest_addr) \ { \ @@ -136,19 +132,21 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); dest_addr, 0, 0, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + 0, /* flags */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_recvmsg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_recvmsg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE\ (struct fid_ep *ep, const struct fi_msg *msg, \ uint64_t flags) \ { \ return fi_opx_recvmsg_generic(ep, msg, flags, \ - LOCK, AV, RELIABILITY); \ + LOCK, AV, RELIABILITY, HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_senddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_senddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, uint64_t data, fi_addr_t dest_addr, \ void *context) \ @@ -159,12 +157,13 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); AV, /* av_type */ \ 1, /* is_contiguous */ \ 0, /* override_flags */ \ - 0, /* flags */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_injectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_injectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ uint64_t data, fi_addr_t dest_addr) \ { \ @@ -172,15 +171,17 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); dest_addr, 0, data, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } -#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY,HFI1_TYPE) \ + FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY,HFI1_TYPE) -#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY +#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY,HFI1_TYPE) \ + fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV 
## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE @@ -198,6 +199,7 @@ enum opx_work_type { OPX_WORK_TYPE_TID_SETUP, OPX_WORK_TYPE_LAST }; + OPX_COMPILE_TIME_ASSERT(OPX_WORK_TYPE_SDMA == 0, "OPX_WORK_TYPE_SDMA needs to be 0/first value in the enum!"); @@ -225,15 +227,18 @@ struct fi_opx_stx { struct fi_opx_reliability_service reliability_service; /* ONLOAD only */ uint8_t reliability_rx; /* ONLOAD only */ - /* == CACHE LINE 4,5,6 == */ + /* == CACHE LINE 4-9 == */ struct { - struct fi_opx_hfi1_txe_scb inject; - struct fi_opx_hfi1_txe_scb send; - struct fi_opx_hfi1_txe_scb rzv; + struct fi_opx_hfi1_txe_scb_9B inject; + struct fi_opx_hfi1_txe_scb_9B send; + struct fi_opx_hfi1_txe_scb_9B rzv; + struct fi_opx_hfi1_txe_scb_16B inject_16B; + struct fi_opx_hfi1_txe_scb_16B send_16B; + struct fi_opx_hfi1_txe_scb_16B rzv_16B; } tx; - /* == CACHE LINE 7 == */ + /* == CACHE LINE 10 == */ struct fi_opx_hfi1_rxe_state rxe_state; /* ignored for ofi tx */ int64_t ref_cnt; @@ -255,7 +260,7 @@ struct fi_opx_stx { */ struct fi_opx_ep_tx { - /* == CACHE LINE 0,1 == */ + /* == CACHE LINE 0 == */ volatile union fi_opx_hfi1_pio_state *pio_state; /* 1 qw = 8 bytes */ volatile uint64_t * pio_scb_sop_first; @@ -263,38 +268,49 @@ struct fi_opx_ep_tx { uint16_t pio_max_eager_tx_bytes; uint16_t pio_flow_eager_tx_bytes; - struct fi_opx_hfi1_txe_scb inject; /* qws 5,6, and 7 specified at runtime */ - volatile uint64_t * pio_credits_addr; /* const; only used to infrequently "refresh" credit information */ volatile uint64_t * pio_scb_first; /* const; only eager and rendezvous */ uint64_t cq_bind_flags; - struct fi_opx_context_slist * cq_completed_ptr; + struct slist * cq_completed_ptr; uint32_t do_cq_completion; uint16_t unused_cacheline1; uint8_t force_credit_return; uint8_t use_sdma; - /* == CACHE LINE 2,3 == */ + /* == CACHE LINE 1,2 == */ + struct fi_opx_hfi1_txe_scb_9B inject_9B; /* qws 5,6, and 7 specified at runtime */ - struct fi_opx_hfi1_txe_scb send; - struct fi_opx_hfi1_txe_scb rzv; + /* == CACHE LINE 3,4 == */ + struct fi_opx_hfi1_txe_scb_9B send_9B; - /* == CACHE LINE 4 == */ + /* == CACHE LINE 5,6 == */ + struct fi_opx_hfi1_txe_scb_9B rzv_9B; + + /* == CACHE LINE 7,8 == */ + struct fi_opx_hfi1_txe_scb_16B inject_16B; + + /* == CACHE LINE 9,10 == */ + struct fi_opx_hfi1_txe_scb_16B send_16B; + + /* == CACHE LINE 11,12 == */ + struct fi_opx_hfi1_txe_scb_16B rzv_16B; + + /* == CACHE LINE 13 == */ union fi_opx_addr * av_addr; /* only FI_ADDR_TABLE */ uint64_t av_count; /* only FI_ADDR_TABLE */ uint64_t op_flags; uint64_t caps; uint64_t mode; - struct fi_opx_context_slist * cq_err_ptr; + struct slist * cq_err_ptr; struct fi_opx_cq * cq; - struct fi_opx_context_slist * cq_pending_ptr; /* only rendezvous (typically) */ + struct slist * cq_pending_ptr; /* only rendezvous (typically) */ - /* == CACHE LINE 5 == */ + /* == CACHE LINE 14 == */ struct slist work_pending[OPX_WORK_TYPE_LAST]; - /* == CACHE LINE 6 == */ + /* == CACHE LINE 15 == */ struct slist work_pending_completion; struct ofi_bufpool *work_pending_pool; @@ -302,38 +318,47 @@ struct fi_opx_ep_tx { struct ofi_bufpool *rma_request_pool; struct ofi_bufpool *sdma_work_pool; uint32_t sdma_min_payload_bytes; + uint32_t tid_min_payload_bytes; uint32_t rzv_min_payload_bytes; uint16_t mp_eager_max_payload_bytes; - uint8_t unused_cacheline6[6]; + uint16_t unused_cacheline6; - /* == CACHE LINE 7 == */ + /* == CACHE LINE 16 == */ struct opx_sdma_queue sdma_request_queue; struct slist sdma_pending_queue; struct ofi_bufpool *sdma_request_pool; uint64_t 
unused_cacheline7[2]; - /* == CACHE LINE 8, ... == */ + /* == CACHE LINE 17, ... == */ int64_t ref_cnt; struct fi_opx_stx *stx; - // struct opx_shm_tx is very large and should go last! struct opx_shm_tx shm; void *mem; } __attribute__((__aligned__(L2_CACHE_LINE_SIZE))) __attribute__((__packed__)); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, send) == (FI_OPX_CACHE_LINE_SIZE * 2), - "Offset of fi_opx_ep_tx->send should start at cacheline 2!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, rzv) == (FI_OPX_CACHE_LINE_SIZE * 3), - "Offset of fi_opx_ep_tx->rzv should start at cacheline 3!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, av_addr) == (FI_OPX_CACHE_LINE_SIZE * 4), - "Offset of fi_opx_ep_tx->av_addr should start at cacheline 4!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending) == (FI_OPX_CACHE_LINE_SIZE * 5), - "Offset of fi_opx_ep_tx->work_pending should start at cacheline 5!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending_completion) == (FI_OPX_CACHE_LINE_SIZE * 6), - "Offset of fi_opx_ep_tx->work_pending_completion should start at cacheline 6!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, sdma_request_queue) == (FI_OPX_CACHE_LINE_SIZE * 7), - "Offset of fi_opx_ep_tx->sdma_request_queue should start at cacheline 7!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, ref_cnt) == (FI_OPX_CACHE_LINE_SIZE * 8), - "Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 8!"); + +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, inject_9B) == (FI_OPX_CACHE_LINE_SIZE * 1), + "Offset of fi_opx_ep_tx->inject_9B should start at cacheline 1!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, send_9B) == (FI_OPX_CACHE_LINE_SIZE * 3), + "Offset of fi_opx_ep_tx->send_9B should start at cacheline 3!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, rzv_9B) == (FI_OPX_CACHE_LINE_SIZE * 5), + "Offset of fi_opx_ep_tx->rzv_9B should start at cacheline 5!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, inject_16B) == (FI_OPX_CACHE_LINE_SIZE * 7), + "Offset of fi_opx_ep_tx->inject_16B should start at cacheline 7!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, send_16B) == (FI_OPX_CACHE_LINE_SIZE * 9), + "Offset of fi_opx_ep_tx->send_16B should start at cacheline 9!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, rzv_16B) == (FI_OPX_CACHE_LINE_SIZE * 11), + "Offset of fi_opx_ep_tx->rzv_16B should start at cacheline 11!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, av_addr) == (FI_OPX_CACHE_LINE_SIZE * 13), + "Offset of fi_opx_ep_tx->av_addr should start at cacheline 13!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending) == (FI_OPX_CACHE_LINE_SIZE * 14), + "Offset of fi_opx_ep_tx->work_pending should start at cacheline 14!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending_completion) == (FI_OPX_CACHE_LINE_SIZE * 15), + "Offset of fi_opx_ep_tx->work_pending_completion should start at cacheline 15!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, sdma_request_queue) == (FI_OPX_CACHE_LINE_SIZE * 16), + "Offset of fi_opx_ep_tx->sdma_request_queue should start at cacheline 16!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, ref_cnt) == (FI_OPX_CACHE_LINE_SIZE * 17), + "Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 17!"); struct fi_opx_ep_rx { @@ -361,21 +386,21 @@ struct fi_opx_ep_rx { struct { struct fi_opx_hfi1_ue_packet_slist ue; /* 3 qws */ - struct fi_opx_context_slist mq; /* 2 
qws */ + struct slist mq; /* 2 qws */ } queue[2]; /* 0 = FI_TAGGED, 1 = FI_MSG */ struct { struct fi_opx_hfi1_ue_packet_slist ue; /* 3 qws */ - struct fi_opx_context_slist mq; /* 2 qws */ + struct slist mq; /* 2 qws */ } mp_egr_queue; struct fi_opx_match_ue_hash * match_ue_tag_hash; /* == CACHE LINE 3 == */ - struct fi_opx_context_slist * cq_pending_ptr; - struct fi_opx_context_slist * cq_completed_ptr; + struct slist * cq_pending_ptr; + struct slist * cq_completed_ptr; struct ofi_bufpool * ue_packet_pool; - struct ofi_bufpool * ctx_ext_pool; + struct ofi_bufpool * ctx_pool; uint64_t unused_cacheline_3[4]; /* == CACHE LINE 4 == */ @@ -399,7 +424,7 @@ struct fi_opx_ep_rx { volatile uint64_t * head_register; } egrq __attribute__((__packed__)); - /* == CACHE LINE 5,6 == */ + /* == CACHE LINE 5,6,7,8,9,10,11,12 == */ /* * NOTE: These cachelines are shared between the application-facing @@ -409,11 +434,12 @@ struct fi_opx_ep_rx { * This 'tx' information is used when sending acks, etc. */ struct { - struct fi_opx_hfi1_txe_scb dput; - struct fi_opx_hfi1_txe_scb cts; + struct fi_opx_hfi1_txe_scb_9B dput_9B; + struct fi_opx_hfi1_txe_scb_9B cts_9B; + struct fi_opx_hfi1_txe_scb_16B dput_16B; + struct fi_opx_hfi1_txe_scb_16B cts_16B; } tx; - /* -- non-critical -- */ uint64_t min_multi_recv; struct fi_opx_domain *domain; @@ -422,8 +448,8 @@ struct fi_opx_ep_rx { uint64_t mode; union fi_opx_addr self; - struct fi_opx_context_slist *cq_err_ptr; - struct fi_opx_cq * cq; + struct slist *cq_err_ptr; + struct fi_opx_cq *cq; struct opx_shm_rx shm; void *mem; @@ -568,6 +594,7 @@ OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep, init_send_cntr) == (FI_OPX_CA OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep, lock) == ((FI_OPX_CACHE_LINE_SIZE * 5)+52), "Offset of fi_opx_ep->lock should start before cacheline 6!"); + /* * A 'scalable endpoint' may not be directly specified in a data movement * functions, such as fi_tsend(), as it is only a container for multiple @@ -582,22 +609,19 @@ struct fi_opx_sep { struct fi_opx_av *av; struct fi_info *info; void *memptr; - struct fi_opx_ep *ep[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_hfi1_context *hfi1[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_ep_reliability *reliability[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_ep_tx *tx[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_ep_rx *rx[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep *ep[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_hfi1_context *hfi1[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep_reliability *reliability[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep_tx *tx[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep_rx *rx[FI_OPX_ADDR_SEP_RX_MAX]; - int64_t ref_cnt; + int64_t ref_cnt; } __attribute((aligned(L2_CACHE_LINE_SIZE))); struct fi_opx_rzv_completion { - union { - union fi_opx_context *context; - struct fi_opx_context_ext *extended_context; - }; + struct opx_context *context; uint64_t tid_length; uint64_t tid_vaddr; uint64_t tid_byte_counter; @@ -618,59 +642,67 @@ struct fi_opx_rma_request { __attribute__((noinline)) void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const uint64_t static_flags, - union fi_opx_context * context, - const uint64_t rx_op_flags, const uint64_t is_context_ext, + struct opx_context *context, + const uint64_t rx_op_flags, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hf1_type); void fi_opx_ep_rx_process_header_tag (struct fid_ep *ep, - const 
union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hf1_type, + uint32_t slid); void fi_opx_ep_rx_process_header_msg (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hf1_type, + uint32_t slid); void fi_opx_ep_rx_reliability_process_packet (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_rs); void fi_opx_ep_rx_append_ue_msg (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters); + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid); void fi_opx_ep_rx_append_ue_tag (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters); + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid); void fi_opx_ep_rx_append_ue_egr (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, - const size_t payload_bytes); + const size_t payload_bytes, + const uint64_t slid); int fi_opx_ep_tx_check (struct fi_opx_ep_tx * tx, enum fi_av_type av_type); @@ -689,6 +721,7 @@ void fi_opx_ep_clear_credit_return(struct fi_opx_ep *opx_ep) { #define FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep) fi_opx_ep_clear_credit_return(opx_ep) + #include "rdma/opx/fi_opx_fabric_transport.h" #ifdef OPX_DAOS_DEBUG @@ -811,10 +844,12 @@ uint64_t fi_opx_ep_is_matching_packet(const uint64_t origin_tag, } + __OPX_FORCE_INLINE__ struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *opx_ep, - union fi_opx_context * context, - const uint64_t kind) + struct opx_context *context, + const uint64_t kind, + const enum opx_hfi1_type hfi1_type) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.match.default_searches); struct fi_opx_hfi1_ue_packet *uepkt = opx_ep->rx->queue[kind].ue.head; @@ -838,7 +873,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *o opx_ep, uepkt->daos_info.rank, uepkt->daos_info.rank_inst, - fi_opx_hfi_is_intranode(uepkt->hdr.stl.lrh.slid))) { + opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type))) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.match.default_misses); uepkt = uepkt->next; } @@ -851,21 +886,23 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *o 
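
The HFI1_TYPE suffix threaded through the specialized-function macros above depends on the double-expansion idiom called out in the header comment ("C requires another indirection for expanding macros since operands of the token pasting operator are not expanded"). A minimal standalone sketch of that idiom follows; CONCAT/CONCAT_, LOCK, and fn_0 are illustrative names, not part of the provider:

    #define CONCAT_(a, b) a ## b         /* pastes tokens without expanding a or b */
    #define CONCAT(a, b)  CONCAT_(a, b)  /* expands a and b first, then pastes     */

    #define LOCK 0

    static int fn_0(void) { return 42; }

    int main(void)
    {
        /* CONCAT_(fn_, LOCK) would paste directly to the undeclared name fn_LOCK;
         * CONCAT(fn_, LOCK) expands LOCK to 0 first and so names fn_0, the same
         * way FI_OPX_MSG_SPECIALIZED_FUNC_NAME builds fi_opx_send_*_*_*_* names. */
        return (CONCAT(fn_, LOCK)() == 42) ? 0 : 1;
    }
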
__OPX_FORCE_INLINE__ uint64_t is_match (struct fi_opx_ep * opx_ep, - const union fi_opx_hfi1_packet_hdr * const hdr, - union fi_opx_context * context, + const union opx_hfi1_packet_hdr * const hdr, + struct opx_context *context, uint32_t rank, uint32_t rank_inst, - unsigned is_intranode) + unsigned is_intranode, + const uint64_t slid) { const union fi_opx_addr src_addr = { .fi = context->src_addr }; - const fi_opx_uid_t origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr); + const fi_opx_uid_t origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr, slid); const uint64_t ignore = context->ignore; const uint64_t target_tag = context->tag; const uint64_t origin_tag = hdr->match.ofi_tag; const uint64_t target_tag_and_not_ignore = target_tag & ~ignore; const uint64_t origin_tag_and_not_ignore = origin_tag & ~ignore; + const uint64_t answer = ( (origin_tag_and_not_ignore == target_tag_and_not_ignore) && @@ -879,11 +916,19 @@ uint64_t is_match (struct fi_opx_ep * opx_ep, ) ) ); + #ifdef IS_MATCH_DEBUG fprintf(stderr, "%s:%s():%d context = %p, context->src_addr = 0x%016lx, context->ignore = 0x%016lx, context->tag = 0x%016lx, src_addr.uid.fi = 0x%08x\n", __FILE__, __func__, __LINE__, context, context->src_addr, context->ignore, context->tag, src_addr.uid.fi); - fprintf(stderr, "%s:%s():%d hdr->match.slid = 0x%04x (%u), hdr->match.origin_tx = 0x%02x (%u), origin_uid_fi = 0x%08x\n", __FILE__, __func__, __LINE__, - hdr->match.slid, hdr->match.slid, hdr->match.origin_tx, hdr->match.origin_tx, origin_uid_fi); + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fprintf(stderr, "%s:%s():%d hdr->match.slid = 0x%04x (%u), hdr->match.origin_tx = 0x%02x (%u), origin_uid_fi = 0x%08x\n", __FILE__, __func__, __LINE__, + hdr->lrh_9B.slid, hdr->lrh_9B.slid, hdr->match.origin_tx, hdr->match.origin_tx, origin_uid_fi); + } else { + fprintf(stderr, "%s:%s():%d hdr->match.slid = 0x%04x/0x%04lx (%u), hdr->match.origin_tx = 0x%02x (%u), origin_uid_fi = 0x%08x\n", __FILE__, __func__, __LINE__, + htonl((uint64_t)((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))),((uint64_t)((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))), + htonl((uint64_t)((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))), + hdr->match.origin_tx, hdr->match.origin_tx, origin_uid_fi); + } fprintf(stderr, "%s:%s():%d hdr->match.ofi_tag = 0x%016lx, target_tag_and_not_ignore = 0x%016lx, origin_tag_and_not_ignore = 0x%016lx, FI_ADDR_UNSPEC = 0x%08lx\n", __FILE__, __func__, __LINE__, hdr->match.ofi_tag, target_tag_and_not_ignore, origin_tag_and_not_ignore, FI_ADDR_UNSPEC); if (opx_ep->daos_info.hfi_rank_enabled && is_intranode) { @@ -917,27 +962,362 @@ uint32_t fi_opx_ep_get_u32_extended_rx (struct fi_opx_ep * opx_ep, } __OPX_FORCE_INLINE__ -void fi_opx_enqueue_completed(struct fi_opx_context_slist *queue, - void *context, - const uint64_t is_context_ext, - const int lock_required) +void fi_opx_enqueue_completed(struct slist *queue, struct opx_context *context, const int lock_required) { assert(!lock_required); + assert(context); + context->flags &= ~FI_OPX_CQ_CONTEXT_HMEM; + slist_insert_tail((struct slist_entry *) context, queue); +} + +__OPX_FORCE_INLINE__ +void opx_ep_copy_immediate_data(struct fi_opx_ep * opx_ep, + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info, + struct opx_payload_rzv_contig *contiguous, + const uint64_t immediate_byte_count, + const uint64_t immediate_qw_count, + const uint64_t immediate_block, + const uint64_t immediate_tail, + const uint64_t immediate_total, + const size_t xfer_len, + const uint64_t is_hmem, + const 
enum fi_hmem_iface rbuf_iface, + const uint64_t rbuf_device, + const uint64_t hmem_handle, + uint8_t *rbuf_in) +{ + uint8_t *rbuf = is_hmem ? opx_ep->hmem_copy_buf : rbuf_in; - union fi_opx_context *real_context; + for (int i = 0; i < immediate_byte_count; ++i) { + rbuf[i] = contiguous->immediate_byte[i]; + } + rbuf += immediate_byte_count; - if (is_context_ext) { - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) context; - real_context = (union fi_opx_context *) ext->msg.op_context; - *real_context = ext->opx_context; - real_context->flags &= ~(FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM); - real_context->next = NULL; - OPX_BUF_FREE(ext); - } else { - real_context = (union fi_opx_context *) context; + uint64_t * rbuf_qw = (uint64_t *)rbuf; + for (int i = 0; i < immediate_qw_count; ++i) { + rbuf_qw[i] = contiguous->immediate_qw[i]; + } + rbuf += immediate_qw_count * sizeof(uint64_t); + + if (immediate_block) { + const uint64_t immediate_fragment = (immediate_byte_count || immediate_qw_count) ? 1 : 0; + memcpy(rbuf, (void *) (&contiguous->cache_line_1 + immediate_fragment), FI_OPX_CACHE_LINE_SIZE); + } + + if (is_hmem && immediate_total) { + opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, + rbuf_in, opx_ep->hmem_copy_buf, immediate_total, + OPX_HMEM_DEV_REG_RECV_THRESHOLD); + } + + if (immediate_tail) { + uint8_t *rbuf_start = rbuf_in + xfer_len - OPX_IMMEDIATE_TAIL_BYTE_COUNT; + + if (!is_hmem) { + for (int i = 0; i < OPX_IMMEDIATE_TAIL_BYTE_COUNT; ++i) { + rbuf_start[i] = immediate_info.tail_bytes[i]; + } + } else { + opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, rbuf_start, + immediate_info.tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, + OPX_HMEM_DEV_REG_RECV_THRESHOLD); + } + } +} + +__OPX_FORCE_INLINE__ +void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, + const union fi_opx_hfi1_packet_payload * const payload, + struct fi_opx_ep * opx_ep, + const uint64_t origin_tag, + const uint8_t opcode, + struct opx_context *context, + const uint64_t is_multi_receive, + const unsigned is_intranode, + const uint64_t is_hmem, + const int lock_required, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) +{ + assert(FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV -- RENDEZVOUS RTS (%X) (begin) context %p is_multi_recv (%lu)\n", + opcode, context, is_multi_receive); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS"); + + const uint64_t ofi_data = hdr->match.ofi_data; + const uint64_t niov = hdr->rendezvous.niov; + const uint64_t xfer_len = hdr->rendezvous.message_length; + const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; + void *recv_buf = context->buf; + struct fi_opx_ep_rx * const rx = opx_ep->rx; + const uint64_t recv_len = context->len; + + if (is_multi_receive) { /* compile-time constant expression */ + assert(FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode) == FI_MSG); + const uint8_t u8_rx = hdr->rendezvous.origin_rx; + const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); + struct opx_context * original_multi_recv_context = context; + context = (struct opx_context *)((uintptr_t)recv_buf - sizeof(struct opx_context)); + + assert((((uintptr_t)context) & 0x07) == 0); + context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; + context->buf = recv_buf; + context->len = xfer_len; + 
context->data = ofi_data; + context->tag = 0; /* tag is not valid for multi-receives */ + context->multi_recv_context = original_multi_recv_context; + context->byte_counter = xfer_len; + context->next = NULL; + uint8_t * rbuf = (uint8_t *)recv_buf; + + if (OFI_LIKELY(is_noncontig)) { + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_noncontig); + FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, + hdr, + payload, + u8_rx, niov, + payload->rendezvous.noncontiguous.origin_byte_counter_vaddr, + context, + (uintptr_t)(rbuf), /* receive buffer virtual address */ + FI_HMEM_SYSTEM, /* receive buffer iface */ + 0UL, /* receive buffer device */ + 0UL, /* immediate_data */ + 0UL, /* immediate_end_block_count */ + &payload->rendezvous.noncontiguous.iov[0], + FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, + hfi1_type); + } else { + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_contig); + assert(niov == 1); + struct opx_payload_rzv_contig *contiguous = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + ? (struct opx_payload_rzv_contig *) &payload->rendezvous.contiguous + : (struct opx_payload_rzv_contig *) &payload->rendezvous.contiguous_16B; + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + .qw0 = contiguous->immediate_info + }; + const uint64_t immediate_byte_count = (immediate_info.count & OPX_IMMEDIATE_BYTE_COUNT_MASK) + >> OPX_IMMEDIATE_BYTE_COUNT_SHIFT; + const uint64_t immediate_qw_count = (immediate_info.count & OPX_IMMEDIATE_QW_COUNT_MASK) + >> OPX_IMMEDIATE_QW_COUNT_SHIFT; + const uint64_t immediate_block = (immediate_info.count & OPX_IMMEDIATE_BLOCK_MASK) + >> OPX_IMMEDIATE_BLOCK_SHIFT; + const uint64_t immediate_tail = (immediate_info.count & OPX_IMMEDIATE_TAIL_MASK) + >> OPX_IMMEDIATE_TAIL_SHIFT; + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block * sizeof(union cacheline); + + const struct fi_opx_hmem_iov src_dst_iov[1] = { + { + .buf = contiguous->src_vaddr, + .len = (contiguous->src_blocks << 6), + .device = contiguous->src_device_id, + .iface = (enum fi_hmem_iface) contiguous->src_iface + } + }; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", + immediate_total, immediate_byte_count, immediate_qw_count, immediate_block); + + context->byte_counter -= immediate_total; + + FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, + hdr, + payload, + u8_rx, niov, + contiguous->origin_byte_counter_vaddr, + context, + (uintptr_t)(rbuf + immediate_total), /* receive buffer virtual address */ + FI_HMEM_SYSTEM, /* receive buffer iface */ + 0UL, /* receive buffer device */ + immediate_total, + immediate_tail, + src_dst_iov, + FI_OPX_HFI_DPUT_OPCODE_RZV, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, + hfi1_type); + + opx_ep_copy_immediate_data(opx_ep, immediate_info, contiguous, immediate_byte_count, + immediate_qw_count, immediate_block, immediate_tail, + immediate_total, xfer_len, OPX_HMEM_FALSE, FI_HMEM_SYSTEM, + 0ul, OPX_HMEM_NO_HANDLE, rbuf); + } + + uint64_t bytes_consumed = ((xfer_len + 8) & (~0x07ull)) + sizeof(struct opx_context); + original_multi_recv_context->len -= bytes_consumed; + original_multi_recv_context->byte_counter++; // re-using the byte counter as a "pending flag" + original_multi_recv_context->tag = (uintptr_t)opx_ep; // re-using tag to store the ep + 
original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); + assert(context->next == NULL); + if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } + slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr); + + } else if (OFI_LIKELY(xfer_len <= recv_len)) { + + context->len = xfer_len; + context->data = ofi_data; + context->tag = origin_tag; + context->next = NULL; + context->flags |= FI_RECV | + FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); + + const uint8_t u8_rx = hdr->rendezvous.origin_rx; + const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); + + if (OFI_LIKELY(niov == 1)) { + assert(!is_noncontig); + + uint64_t rbuf_device; + enum fi_hmem_iface rbuf_iface; + uint64_t hmem_handle; + if (is_hmem) { /* Branch should compile out */ + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; + rbuf_device = hmem_info->device; + rbuf_iface = hmem_info->iface; + hmem_handle = hmem_info->hmem_dev_reg_handle; + FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] + .recv.rzv); + FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] + .recv.rzv); + } else { + rbuf_device = 0; + hmem_handle = 0; + rbuf_iface = FI_HMEM_SYSTEM; + } + uint8_t * rbuf = (uint8_t *)recv_buf; + + struct opx_payload_rzv_contig *contiguous = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + ? (struct opx_payload_rzv_contig *) &payload->rendezvous.contiguous + : (struct opx_payload_rzv_contig *) &payload->rendezvous.contiguous_16B; + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + .qw0 = contiguous->immediate_info + }; + const uint64_t immediate_byte_count = (immediate_info.count & OPX_IMMEDIATE_BYTE_COUNT_MASK) + >> OPX_IMMEDIATE_BYTE_COUNT_SHIFT; + const uint64_t immediate_qw_count = (immediate_info.count & OPX_IMMEDIATE_QW_COUNT_MASK) + >> OPX_IMMEDIATE_QW_COUNT_SHIFT; + const uint64_t immediate_block = (immediate_info.count & OPX_IMMEDIATE_BLOCK_MASK) + >> OPX_IMMEDIATE_BLOCK_SHIFT; + const uint64_t immediate_tail = (immediate_info.count & OPX_IMMEDIATE_TAIL_MASK) + >> OPX_IMMEDIATE_TAIL_SHIFT; + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block * sizeof(union cacheline); + + const struct fi_opx_hmem_iov src_dst_iov[1] = { + { + .buf = contiguous->src_vaddr, + .len = (contiguous->src_blocks << 6), + .device = contiguous->src_device_id, + .iface = (enum fi_hmem_iface) contiguous->src_iface + } + }; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", + immediate_total, immediate_byte_count, immediate_qw_count, immediate_block); + context->byte_counter = xfer_len - immediate_total; + + FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, + hdr, + payload, + u8_rx, 1, + contiguous->origin_byte_counter_vaddr, + context, + (uintptr_t) (rbuf + immediate_total), + rbuf_iface, + rbuf_device, + immediate_total, + immediate_tail, + src_dst_iov, + FI_OPX_HFI_DPUT_OPCODE_RZV, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, + hfi1_type); + + 
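
The OPX_IMMEDIATE_*_MASK/SHIFT decode used above packs several small counts into one quadword of the RTS payload, and immediate_total is then bytes + 8 * qws + 64 * blocks. The sketch below uses made-up field widths and mask names purely to illustrate the pack/unpack arithmetic; the real layout lives in the OPX headers:

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical layout: byte count in bits 0-7, qw count in bits 8-15,
     * block flag in bit 16. NOT the real OPX_IMMEDIATE_* layout. */
    #define IMM_BYTE_MASK   0x00FFull
    #define IMM_BYTE_SHIFT  0
    #define IMM_QW_MASK     0xFF00ull
    #define IMM_QW_SHIFT    8
    #define IMM_BLOCK_MASK  0x10000ull
    #define IMM_BLOCK_SHIFT 16

    int main(void)
    {
        const uint64_t count = (3ull << IMM_BYTE_SHIFT) |  /* 3 trailing bytes */
                               (5ull << IMM_QW_SHIFT)   |  /* 5 quadwords      */
                               (1ull << IMM_BLOCK_SHIFT);  /* 1 full cacheline */

        const uint64_t bytes  = (count & IMM_BYTE_MASK)  >> IMM_BYTE_SHIFT;
        const uint64_t qws    = (count & IMM_QW_MASK)    >> IMM_QW_SHIFT;
        const uint64_t blocks = (count & IMM_BLOCK_MASK) >> IMM_BLOCK_SHIFT;

        /* mirrors the receive-side immediate_total computation */
        assert(bytes + qws * sizeof(uint64_t) + blocks * 64 == 3 + 40 + 64);
        return 0;
    }
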
opx_ep_copy_immediate_data(opx_ep, immediate_info, contiguous, immediate_byte_count, + immediate_qw_count, immediate_block, immediate_tail, + immediate_total, xfer_len, is_hmem, rbuf_iface, + rbuf_device, hmem_handle, rbuf); + } else { + /*fi_opx_hfi1_dump_packet_hdr(hdr, __func__, __LINE__); */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "rendezvous non-contiguous source data not implemented; abort\n"); + abort(); + } + + /* post a pending completion event for the individual receive */ + if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } + slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr); + + } else { /* truncation - unlikely */ + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "RENDEZVOUS truncation - xfer_len %lu > recv_len %lu posting error\n", xfer_len, recv_len); + + /* Post a CTS Truncation error (FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC) to unblock the Tx of RTS */ + + context->len = xfer_len; + context->data = ofi_data; + context->tag = origin_tag; + context->next = NULL; + context->byte_counter = 0; + context->flags = FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); + const uint8_t u8_rx = hdr->rendezvous.origin_rx; + const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); + + assert(payload != NULL); + + uintptr_t origin_byte_counter_vaddr = is_noncontig ? + payload->rendezvous.noncontiguous.origin_byte_counter_vaddr : + (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? + payload->rendezvous.contiguous.origin_byte_counter_vaddr : + payload->rendezvous.contiguous_16B.origin_byte_counter_vaddr; + FI_OPX_FABRIC_RX_RZV_RTS_ETRUNC(opx_ep, + (const void * const)hdr, + u8_rx, + origin_byte_counter_vaddr, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, hfi1_type); + + /* Post a E_TRUNC to our local RX error queue because a client called receive + with too small a buffer. 
Tell them about it via the error cq */ + + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = xfer_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; + + context->byte_counter = 0; + context->next = NULL; + + /* post an 'error' completion event */ + if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } + slist_insert_tail((struct slist_entry *) context, rx->cq_err_ptr); } - fi_opx_context_slist_insert_tail(real_context, queue); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV -- RENDEZVOUS RTS (end) context %p\n",context); } /** @@ -949,28 +1329,23 @@ void fi_opx_enqueue_completed(struct fi_opx_context_slist *queue, * \param[in,out] entry Completion entry */ __OPX_FORCE_INLINE__ -void complete_receive_operation_internal (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, +void opx_ep_complete_receive_operation (struct fid_ep *ep, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const uint64_t origin_tag, - union fi_opx_context ** context_ptr, + struct opx_context *context, const uint8_t opcode, - const uint64_t is_context_ext, const uint64_t is_multi_receive, const unsigned is_intranode, const uint64_t is_hmem, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { - - assert((is_hmem && is_context_ext) || !is_hmem); assert((is_multi_receive && !is_hmem) || !is_multi_receive); struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); struct fi_opx_ep_rx * const rx = opx_ep->rx; - union fi_opx_context *context = *context_ptr; - - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); const uint64_t recv_len = context->len; /* @@ -981,7 +1356,9 @@ void complete_receive_operation_internal (struct fid_ep *ep, */ void * recv_buf = context->buf; - if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT || opcode == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) { + OPX_DEBUG_PRINT_HDR(hdr, hfi1_type); + + if (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- INJECT (begin)\n"); @@ -998,8 +1375,8 @@ void complete_receive_operation_internal (struct fid_ep *ep, if (send_len) memcpy(recv_buf, (void*)&hdr->inject.app_data_u8[0], send_len); - union fi_opx_context * original_multi_recv_context = context; - context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); + struct opx_context * original_multi_recv_context = context; + context = (struct opx_context *)((uintptr_t)recv_buf - sizeof(struct opx_context)); assert((((uintptr_t)context) & 0x07) == 0); context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; @@ -1012,28 +1389,27 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; /* the next 'fi_opx_context' must be 8-byte aligned */ - uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(union fi_opx_context); + uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(struct opx_context); 
original_multi_recv_context->len -= bytes_consumed; original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); /* post a completion event for the individual receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_completed_ptr); } else if (OFI_LIKELY(send_len <= recv_len)) { if (is_hmem && send_len) { - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, recv_buf, hdr->inject.app_data_u8, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.inject); FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.inject); } else { #pragma GCC diagnostic push @@ -1076,10 +1452,11 @@ void complete_receive_operation_internal (struct fid_ep *ep, } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "INJECT send_len %lu <= recv_len %lu; enqueue cq (completed)\n", send_len, recv_len); + "INJECT send_len %lu <= recv_len %lu; enqueue cq (completed) ofi_data = %ld tag = %ld\n", + send_len, recv_len, ofi_data, origin_tag); - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) ? 
FI_TAGGED : FI_MSG); + context->flags |= FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); context->len = send_len; context->data = ofi_data; context->tag = origin_tag; @@ -1087,52 +1464,37 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; /* post a completion event for the individual receive */ - fi_opx_enqueue_completed(rx->cq_completed_ptr, context, is_context_ext, lock_required); + fi_opx_enqueue_completed(rx->cq_completed_ptr, context, lock_required); } else { /* truncation - unlikely */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "INJECT truncation - send_len %lu > recv_len %lu posting error\n", send_len, recv_len); - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error.\n"); - abort(); - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } - - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = send_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = send_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; + context->byte_counter = 0; + context->next = NULL; /* post an 'error' completion event for the receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-INJECT"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- INJECT (end)\n"); - } else if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER || opcode == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) { + } else if (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- EAGER (begin)\n"); @@ -1151,9 +1513,9 @@ void complete_receive_operation_internal (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "EAGER is_multi_recv\n"); - union fi_opx_context * original_multi_recv_context = context; - //assert(original_multi_recv_context->next == NULL); - context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); + struct opx_context *original_multi_recv_context = context; + + context = (struct opx_context *)((uintptr_t)recv_buf - sizeof(struct opx_context)); assert((((uintptr_t)context) & 0x07) == 0); context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; context->buf = recv_buf; @@ -1179,14 +1541,14 @@ void 
complete_receive_operation_internal (struct fid_ep *ep, } /* the next 'fi_opx_context' must be 8-byte aligned */ - uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(union fi_opx_context); + uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(struct opx_context); original_multi_recv_context->len -= bytes_consumed; original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); assert(context->next == NULL); /* post a completion event for the individual receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_completed_ptr); } else if (OFI_LIKELY(send_len <= recv_len)) { @@ -1213,29 +1575,27 @@ void complete_receive_operation_internal (struct fid_ep *ep, } if (is_hmem) { - assert(is_context_ext); - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, context->buf, opx_ep->hmem_copy_buf, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.eager); FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.eager); } - /* fi_opx_hfi1_dump_packet_hdr((union fi_opx_hfi1_packet_hdr *)hdr, __func__, __LINE__); */ + /* fi_opx_hfi1_dump_packet_hdr(hdr, __func__, __LINE__); */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "EAGER send_len %lu <= recv_len %lu; enqueue cq (completed)\n", send_len, recv_len); + "EAGER send_len %lu <= recv_len %lu; enqueue cq (completed), tag %#lX/%#lX, ofi_data %#lX \n", send_len, recv_len, context->tag, origin_tag, ofi_data); - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) ? 
FI_TAGGED : FI_MSG); + context->flags |= FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); context->len = send_len; context->data = ofi_data; context->tag = origin_tag; @@ -1243,60 +1603,50 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; /* post a completion event for the individual receive */ - fi_opx_enqueue_completed(rx->cq_completed_ptr, context, is_context_ext, lock_required); + fi_opx_enqueue_completed(rx->cq_completed_ptr, context, lock_required); } else { /* truncation - unlikely */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "EAGER truncation - send_len %lu > recv_len %lu posting error\n", send_len, recv_len); - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error.\n"); - abort(); - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } - - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = send_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = send_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; + context->byte_counter = 0; + context->next = NULL; /* post an 'error' completion event for the receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-EAGER"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- EAGER (end)\n"); - } else if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || - opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { + } else if (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- MULTI PACKET EAGER FIRST (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-MP-EAGER-FIRST"); const uint64_t ofi_data = hdr->match.ofi_data; - const uint64_t payload_qws_total = (((uint64_t) ntohs(hdr->stl.lrh.pktlen)) - 15) >> 1; + + uint64_t payload_qws_total; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + payload_qws_total = (((uint64_t) ntohs(hdr->lrh_9B.pktlen)) - 15) >> 1; + } else{ + payload_qws_total = (uint64_t)(hdr->lrh_16B.pktlen - 9); + } const uint64_t packet_payload_len = hdr->mp_eager_first.xfer_bytes_tail + (payload_qws_total << 3); const uint64_t payload_total_len = hdr->mp_eager_first.payload_bytes_total & FI_OPX_HFI1_KDETH_VERSION_OFF_MASK; @@ -1337,8 
+1687,8 @@ void complete_receive_operation_internal (struct fid_ep *ep, recv_buf_qw[i] = payload_qw[i]; } - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) ? FI_TAGGED : FI_MSG); + context->flags |= FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); context->len = payload_total_len; context->data = ofi_data; context->tag = origin_tag; @@ -1346,16 +1696,15 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; if (is_hmem) { - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, recv_buf, opx_ep->hmem_copy_buf, packet_payload_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); /* MP Eager sends are never intranode */ FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.mp_eager); } } else { /* truncation - unlikely */ @@ -1363,41 +1712,29 @@ void complete_receive_operation_internal (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "EAGER truncation - xfer_len %lu > recv_len %lu posting error\n", payload_total_len, recv_len); - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error."); - abort(); - } - ext->opx_context = *context; - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = payload_total_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = payload_total_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; - - ext->opx_context.byte_counter = payload_total_len - packet_payload_len; - ext->opx_context.next = NULL; - *context_ptr = (union fi_opx_context*)ext; + context->byte_counter = payload_total_len - packet_payload_len; + context->next = NULL; + } +#ifndef NDEBUG + if (context->byte_counter == 0) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- MULTI PACKET EAGER FIRST UNEXPECTED COMPLETE\n"); } +#endif OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-MP-EAGER-FIRST"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- MULTI PACKET EAGER FIRST (end)\n"); + "===================================== RECV -- MULTI PACKET EAGER 
FIRST byte counter %lu (end)\n",context->byte_counter); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH) { @@ -1405,7 +1742,11 @@ void complete_receive_operation_internal (struct fid_ep *ep, "===================================== RECV -- MULTI PACKET EAGER NTH (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-MP-EAGER-NTH"); - const uint64_t payload_qws_total = (((uint64_t) ntohs(hdr->stl.lrh.pktlen)) - 15) >> 1; + uint64_t payload_qws_total; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + payload_qws_total = (((uint64_t) ntohs(hdr->lrh_9B.pktlen)) - 15) >> 1; + else + payload_qws_total = (uint64_t) hdr->lrh_16B.pktlen - 9; const uint64_t send_len = hdr->mp_eager_nth.xfer_bytes_tail + (payload_qws_total << 3); const uint64_t xfer_len = send_len + hdr->mp_eager_nth.payload_offset; @@ -1413,8 +1754,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, /* If we flagged this context w/ an error, just decrement the byte counter that this * nth packet would have filled in */ - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { context->byte_counter -= send_len; return; } @@ -1481,19 +1821,23 @@ void complete_receive_operation_internal (struct fid_ep *ep, if (is_hmem) { recv_buf = (void*)((uint8_t*) context->buf + hdr->mp_eager_nth.payload_offset); - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, recv_buf, opx_ep->hmem_copy_buf, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); } - /* fi_opx_hfi1_dump_packet_hdr((union fi_opx_hfi1_packet_hdr *)hdr, __func__, __LINE__); */ + /* fi_opx_hfi1_dump_packet_hdr(hdr, __func__, __LINE__);*/ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Multi-packet EAGER (nth) send_len %lu <= recv_len %lu; enqueue cq (pending)\n", send_len, recv_len); assert(context->byte_counter >= send_len); context->byte_counter -= send_len; +#ifndef NDEBUG + if (context->byte_counter == 0) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- MULTI PACKET EAGER NTH COMPLETE\n"); + } +#endif } else { /* truncation - unlikely */ /* We verified the context had enough buffer space for the entire multi-packet payload * when we processed the first multi-egr packet. 
So if xver_len > recv_len, then something @@ -1505,426 +1849,15 @@ void complete_receive_operation_internal (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-MP-EAGER-NTH"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- MULTI PACKET EAGER NTH (end)\n"); - - } else { /* rendezvous packet */ - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- RENDEZVOUS RTS (%X) (begin) context %p is_multi_recv (%lu)\n", - opcode, context, is_multi_receive); - OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS"); - - const uint64_t ofi_data = hdr->match.ofi_data; - const uint64_t niov = hdr->rendezvous.niov; - const uint64_t xfer_len = hdr->rendezvous.message_length; - const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; - - if (is_multi_receive) { /* compile-time constant expression */ - assert(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS); - const uint8_t u8_rx = hdr->rendezvous.origin_rx; - const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - union fi_opx_context * original_multi_recv_context = context; - context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); - - assert((((uintptr_t)context) & 0x07) == 0); - context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; - context->buf = recv_buf; - context->len = xfer_len; - context->data = ofi_data; - context->tag = 0; /* tag is not valid for multi-receives */ - context->multi_recv_context = original_multi_recv_context; - context->byte_counter = xfer_len; - context->next = NULL; - uint8_t * rbuf = (uint8_t *)recv_buf; - union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *)payload; - - if (OFI_LIKELY(is_noncontig)) { - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_noncontig); - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, niov, - p->rendezvous.noncontiguous.origin_byte_counter_vaddr, - context, - (uintptr_t)(rbuf), /* receive buffer virtual address */ - FI_HMEM_SYSTEM, /* receive buffer iface */ - 0UL, /* receive buffer device */ - 0UL, /* immediate_data */ - 0UL, /* immediate_end_block_count */ - &p->rendezvous.noncontiguous.iov[0], - FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - } else { - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_contig); - assert(niov == 1); - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { - .qw0 = p->rendezvous.contiguous.immediate_info - }; - const uint64_t immediate_byte_count = immediate_info.byte_count; - const uint64_t immediate_qw_count = immediate_info.qw_count; - const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6); - const uint64_t immediate_block_count = immediate_info.block_count; - const uint64_t immediate_total = immediate_byte_count + - immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); - const uint64_t immediate_end_block_count = immediate_info.end_block_count; - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", - immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); - - context->byte_counter -= immediate_total; - const struct 
- const struct fi_opx_hmem_iov src_iov = { - .buf = p->rendezvous.contiguous.src_vaddr, - .len = (p->rendezvous.contiguous.src_blocks << 6), - .device = p->rendezvous.contiguous.src_device_id, - .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - }; - - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, niov, - p->rendezvous.contiguous.origin_byte_counter_vaddr, - context, - (uintptr_t)(rbuf + immediate_total), /* receive buffer virtual address */ - FI_HMEM_SYSTEM, /* receive buffer iface */ - 0UL, /* receive buffer device */ - immediate_total, - immediate_end_block_count, - &src_iov, - FI_OPX_HFI_DPUT_OPCODE_RZV, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - - /* - * copy the immediate payload data - */ - unsigned i; - - if (immediate_byte_count) { - const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; - for (i=0; i<immediate_byte_count; ++i) { - rbuf[i] = immediate_byte[i]; - } - rbuf += immediate_byte_count; - } - - if (immediate_qw_count) { - const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; - uint64_t * rbuf_qw = (uint64_t *)rbuf; - for (i=0; i<immediate_qw_count; ++i) { - rbuf_qw[i] = immediate_qw[i]; - } - rbuf += immediate_qw_count * sizeof(uint64_t); - } - - if (immediate_block_count) { - const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; - union cacheline * rbuf_block = (union cacheline *)rbuf; - for (i=0; i<immediate_block_count; ++i) { - rbuf_block[i] = immediate_block[i]; - } - } - - /* up to 1 block of immediate end data after the immediate blocks - Copy this to the end of rbuf */ - if (immediate_end_block_count) { - const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; - uint8_t *rbuf_start = (uint8_t *)recv_buf; - rbuf_start += xfer_len - (immediate_end_block_count << 6); - memcpy(rbuf_start, immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6)); - } - } - - uint64_t bytes_consumed = ((xfer_len + 8) & (~0x07ull)) + sizeof(union fi_opx_context); - original_multi_recv_context->len -= bytes_consumed; - original_multi_recv_context->byte_counter++; // re-using the byte counter as a "pending flag" - original_multi_recv_context->tag = (uintptr_t)opx_ep; // re-using tag to store the ep - original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); - assert(context->next == NULL); - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr);
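/* The multi-receive branch above carves each delivered message out of
 * the posted buffer: the payload is padded past the next 8-byte
 * boundary and an inline context header is consumed along with it,
 * mirroring ((xfer_len + 8) & ~0x07ull) + sizeof(union fi_opx_context).
 * A sketch of that arithmetic with a generic context size: */
#include <stdint.h>

static uint64_t multi_recv_bytes_consumed(uint64_t xfer_len,
					  uint64_t context_bytes)
{
	/* same padding expression as the deleted code above */
	const uint64_t padded = (xfer_len + 8) & ~0x07ull;

	return padded + context_bytes;
}

/* e.g. a 100-byte message with a 64-byte inline context consumes
 * 104 + 64 = 168 bytes of the multi-receive buffer. */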
- - } else if (OFI_LIKELY(xfer_len <= recv_len)) { - - context->len = xfer_len; - context->data = ofi_data; - context->tag = origin_tag; - context->next = NULL; - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? FI_TAGGED : FI_MSG); - - - const uint8_t u8_rx = hdr->rendezvous.origin_rx; - const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - - if (OFI_LIKELY(niov == 1)) { - assert(!is_noncontig); - assert(payload != NULL); - - uint64_t rbuf_device; - enum fi_hmem_iface rbuf_iface; - uint64_t hmem_handle; - if (is_hmem) { /* Branch should compile out */ - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; - rbuf_device = hmem_info->device; - rbuf_iface = hmem_info->iface; - hmem_handle = hmem_info->hmem_dev_reg_handle; - FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .recv.rzv); - FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .recv.rzv); - } else { - rbuf_device = 0; - hmem_handle = 0; - rbuf_iface = FI_HMEM_SYSTEM; - } - uint8_t * rbuf = (uint8_t *)recv_buf; - union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *)payload; - - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { - .qw0 = p->rendezvous.contiguous.immediate_info - }; - const uint64_t immediate_byte_count = immediate_info.byte_count; - const uint64_t immediate_qw_count = immediate_info.qw_count; - const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6); - const uint64_t immediate_block_count = immediate_info.block_count; - const uint64_t immediate_total = immediate_byte_count + - immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); - const uint64_t immediate_end_block_count = immediate_info.end_block_count; - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", - immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); - context->byte_counter = xfer_len - immediate_total; - const struct fi_opx_hmem_iov src_iov = { - .buf = p->rendezvous.contiguous.src_vaddr, - .len = (p->rendezvous.contiguous.src_blocks << 6), - .device = p->rendezvous.contiguous.src_device_id, - .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - }; - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, 1, - p->rendezvous.contiguous.origin_byte_counter_vaddr, - context, - (uintptr_t) (rbuf + immediate_total), - rbuf_iface, - rbuf_device, - immediate_total, - immediate_end_block_count, - &src_iov, - FI_OPX_HFI_DPUT_OPCODE_RZV, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - - /* - * copy the immediate payload data - */ - if (is_hmem) { - rbuf = opx_ep->hmem_copy_buf; - } - unsigned i; - - if (immediate_byte_count) { - const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; - for (i=0; i<immediate_byte_count; ++i) { - rbuf[i] = immediate_byte[i]; - } - rbuf += immediate_byte_count; - } - - if (immediate_qw_count) { - const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; - uint64_t * rbuf_qw = (uint64_t *)rbuf; - for (i=0; i<immediate_qw_count; ++i) { - rbuf_qw[i] = immediate_qw[i]; - } - rbuf += immediate_qw_count * sizeof(uint64_t); - } - - if (immediate_block_count) { - const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; - union cacheline * rbuf_block = (union cacheline *)rbuf; - for (i=0; i<immediate_block_count; ++i) { - rbuf_block[i] = immediate_block[i]; - } - } - - if (is_hmem) { - rbuf = ((uint8_t *) opx_ep->hmem_copy_buf) + - (immediate_block_count * sizeof(union cacheline)); - if (immediate_total) { - opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, - recv_buf, opx_ep->hmem_copy_buf, immediate_total, - OPX_HMEM_DEV_REG_RECV_THRESHOLD); - } - } - - /* up to 1 block of immediate end data after the immediate blocks - Copy this to the end of rbuf */ - if (immediate_end_block_count) { - const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; - uint8_t *rbuf_start = (uint8_t *)recv_buf; - rbuf_start += xfer_len - (immediate_end_block_count << 6); - if (!is_hmem) { - memcpy(rbuf_start, - immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6)); - } else { - opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, rbuf_start, - immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6), - OPX_HMEM_DEV_REG_RECV_THRESHOLD); - } - } - - } else { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "rendezvous non-contiguous source data not implemented; abort\n"); - abort(); - }
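/* When is_hmem is set, the immediate copies above land in the host
 * bounce buffer opx_ep->hmem_copy_buf first and are then pushed to
 * device memory with a single opx_copy_to_hmem() call. A toy version
 * of that staging pattern; dev_copy_fn is a hypothetical stand-in for
 * the provider's HMEM copy routine and its iface/device/handle
 * arguments: */
#include <stdint.h>
#include <string.h>

typedef void (*dev_copy_fn)(void *dst, const void *src, size_t len);

/* Gather small host-side segments into the bounce buffer, then issue
 * one device transfer instead of one per segment. */
static void stage_immediate_to_device(void *dev_dst, uint8_t *bounce,
				      dev_copy_fn dev_copy,
				      const uint8_t *bytes, size_t nbytes,
				      const uint64_t *qws, size_t nqws)
{
	size_t off = 0;

	memcpy(bounce, bytes, nbytes);
	off += nbytes;
	memcpy(bounce + off, qws, nqws * sizeof(uint64_t));
	off += nqws * sizeof(uint64_t);

	if (off) {
		dev_copy(dev_dst, bounce, off); /* one PCIe-friendly copy */
	}
}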
- - /* post a pending completion event for the individual receive */ - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); - - - } else { /* truncation - unlikely */ - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "RENDEZVOUS truncation - xfer_len %lu > recv_len %lu posting error\n", xfer_len, recv_len); - - /* Post a CTS Truncation error (FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC) to unblock the Tx of RTS */ - - context->len = xfer_len; - context->data = ofi_data; - context->tag = origin_tag; - context->next = NULL; - context->byte_counter = 0; - context->flags = FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? FI_TAGGED : FI_MSG); - const uint8_t u8_rx = hdr->rendezvous.origin_rx; - const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - - assert(payload != NULL); - uint8_t * rbuf = (uint8_t *)recv_buf; - union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *)payload; - - const struct fi_opx_hmem_iov dst_iov = { - .buf = p->rendezvous.contiguous.src_vaddr, - .len = (p->rendezvous.contiguous.src_blocks << 6), - .device = p->rendezvous.contiguous.src_device_id, - .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - }; - - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, 1, - p->rendezvous.contiguous.origin_byte_counter_vaddr, - context, - (uintptr_t)(rbuf), /* receive buffer virtual address */ - FI_HMEM_SYSTEM, /* receive buffer iface */ - 0UL, /* receive buffer device */ - 0UL, /* immediate_data */ - 0UL, /* immediate_end_block_count */ - &dst_iov, - FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); - - /* Post an E_TRUNC to our local RX error queue because a client called receive - with too small a buffer. Tell them about it via the error cq */
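/* The deleted lines below fill a struct fi_cq_err_entry with err =
 * FI_ETRUNC and olen = xfer_len - recv_len before queueing it on the
 * CQ error list. A sketch of how an application would observe that
 * completion through the standard libfabric CQ API (cq is assumed to
 * be an already-opened struct fid_cq *): */
#include <stdio.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

static void drain_trunc_errors(struct fid_cq *cq)
{
	struct fi_cq_err_entry err = { 0 };

	/* fi_cq_readerr() returns 1 when an error entry was read */
	while (fi_cq_readerr(cq, &err, 0) == 1) {
		if (err.err == FI_ETRUNC) {
			/* err.olen holds the bytes that did not fit */
			fprintf(stderr, "truncated recv: olen=%zu\n",
				err.olen);
		}
	}
}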
- - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error.\n"); - abort(); - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } - - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = xfer_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; - - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; - - /* post an 'error' completion event */ - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); - } - - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS"); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- RENDEZVOUS RTS (end) context %p\n",context); - - } /* rendezvous packet */ - + "===================================== RECV -- MULTI PACKET EAGER NTH byte counter %lu (end)\n",context->byte_counter); + } else { + fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, + context, is_multi_receive, is_intranode, is_hmem, + lock_required, reliability, hfi1_type); + } FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); } -/** - * \brief Complete a receive operation that has matched the packet header with - * the match information - * - * \param[in] rx Receive endpoint - * \param[in] hdr MU packet header that matched - * \param[in,out] entry Completion entry - */ -__OPX_FORCE_INLINE__ -void complete_receive_operation(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, - const union fi_opx_hfi1_packet_payload * const payload, - const uint64_t origin_tag, - union fi_opx_context * context, - const uint8_t opcode, - const uint64_t is_context_ext, - const uint64_t is_multi_receive, - const unsigned is_intranode, - const uint64_t is_hmem, - const int lock_required, - const enum ofi_reliability_kind reliability) -{ - union fi_opx_context * original_context = context; - (void) original_context; - complete_receive_operation_internal(ep, hdr, payload, origin_tag, &context, - opcode, is_context_ext, is_multi_receive, - is_intranode, is_hmem, lock_required, reliability); - assert(context == original_context); -} - __OPX_FORCE_INLINE__ ssize_t fi_opx_shm_dynamic_tx_connect(const unsigned is_intranode, struct fi_opx_ep * opx_ep, @@ -1975,17 +1908,18 @@ ssize_t fi_opx_shm_dynamic_tx_connect(const unsigned is_intranode, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- %s RENDEZVOUS CTS 
(begin)\n", is_intranode ? "SHM":"HFI"); - assert(payload != NULL); + assert(payload != NULL || hdr->cts.target.opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC); const uint8_t u8_rx = hdr->cts.origin_rx; const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->cts.origin_rx); @@ -1998,7 +1932,7 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, const uint32_t niov = hdr->cts.target.vaddr.niov; uint64_t * origin_byte_counter = (uint64_t *)hdr->cts.target.vaddr.origin_byte_counter_vaddr; OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-CTS-HFI:%p", (void *) target_context_vaddr); - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, (const void * const) hdr, (const void * const) payload, 0, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, hdr, (const void * const) payload, 0, u8_rx, origin_rs, niov, dput_iov, (const uint8_t) (FI_NOOP - 1), (const uint8_t) (FI_VOID - 1), @@ -2007,7 +1941,8 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, hdr->cts.target.opcode, NULL, is_intranode, /* compile-time constant expression */ reliability, /* compile-time constant expression */ - u32_ext_rx); + u32_ext_rx, + hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-CTS-HFI:%p", (void *) target_context_vaddr); } break; @@ -2017,7 +1952,7 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, const uintptr_t target_context_vaddr = hdr->cts.target.vaddr.target_context_vaddr; const uint32_t niov = hdr->cts.target.vaddr.niov; uint64_t * origin_byte_counter = (uint64_t *)hdr->cts.target.vaddr.origin_byte_counter_vaddr; - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, (const void * const) hdr, (const void * const) payload, 0, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, hdr, (const void * const) payload, 0, u8_rx, origin_rs, niov, dput_iov, (const uint8_t) (FI_NOOP - 1), (const uint8_t) (FI_VOID - 1), @@ -2027,7 +1962,8 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, NULL, is_intranode, /* compile-time constant expression */ reliability, /* compile-time constant expression */ - u32_ext_rx); + u32_ext_rx, + hfi1_type); } break; case FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC: @@ -2068,7 +2004,7 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, #else const union fi_opx_hfi1_dput_iov * const dput_iov_ptr = payload->cts.iov; #endif - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, (const void * const) hdr, (const void * const) payload, 0, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, hdr, (const void * const) payload, 0, u8_rx, origin_rs, niov, dput_iov_ptr, hdr->cts.target.mr.op, hdr->cts.target.mr.dt, @@ -2079,12 +2015,13 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, NULL, is_intranode, /* compile-time constant expression */ reliability, /* compile-time constant expression */ - u32_ext_rx); + u32_ext_rx, + hfi1_type); } break; case FI_OPX_HFI_DPUT_OPCODE_FENCE: { - opx_hfi1_dput_fence(opx_ep, hdr, u8_rx, u32_ext_rx); + opx_hfi1_dput_fence(opx_ep, hdr, u8_rx, u32_ext_rx, hfi1_type); } break; default: @@ -2100,13 +2037,14 @@ void fi_opx_atomic_completion_action(union fi_opx_hfi1_deferred_work * work_stat __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum 
ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- %s RENDEZVOUS DATA Opcode=%0hhX (begin)\n", is_intranode ? "SHM":"HFI", hdr->dput.target.opcode); @@ -2116,7 +2054,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, { struct fi_opx_rzv_completion * rzv_comp = (struct fi_opx_rzv_completion *)(hdr->dput.target.rzv.completion_vaddr); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-DATA-HFI-DPUT:%p", rzv_comp); - union fi_opx_context *target_context = rzv_comp->context; + struct opx_context *target_context = rzv_comp->context; assert(target_context); uint64_t* rbuf_qws = (uint64_t *) fi_opx_dput_rbuf_in(hdr->dput.target.rzv.rbuf); @@ -2124,18 +2062,26 @@ * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; assert(bytes <= FI_OPX_HFI1_PACKET_MTU); - +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received RZV (non-TID) data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif const uint64_t *sbuf_qws = (uint64_t*)&payload->byte[0]; #ifdef OPX_HMEM if (target_context->flags & FI_OPX_CQ_CONTEXT_HMEM) { - assert(target_context->flags & FI_OPX_CQ_CONTEXT_EXT); - struct fi_opx_context_ext *ext = rzv_comp->extended_context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) target_context->hmem_info_qws; assert(hmem_info->iface > FI_HMEM_SYSTEM); opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, rbuf_qws, sbuf_qws, bytes, @@ -2166,25 +2112,35 @@ OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RX_PROCESS_HEADER_RZV_TID"); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts); struct fi_opx_rzv_completion * rzv_comp = (struct fi_opx_rzv_completion *)(hdr->dput.target.rzv.completion_vaddr); - union fi_opx_context *target_context = rzv_comp->context; + struct opx_context *target_context = rzv_comp->context; assert(target_context); /* TID packets are mixed 4k/8k packets and length adjusted, * so use actual packet size here reported in LRH as the * number of 4-byte (9B) or 8-byte (16B) words in the packet; header + payload - icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = - (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const uint16_t bytes = - (uint16_t)(total_bytes_to_copy - - sizeof(union fi_opx_hfi1_packet_hdr)); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy; + uint16_t bytes; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + total_bytes_to_copy = + (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + bytes = 
(uint16_t)(total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); + } else { + lrh_pktlen_le = hdr->lrh_16B.pktlen; + total_bytes_to_copy = + (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + bytes = (uint16_t)((total_bytes_to_copy - + sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B))); + } assert(bytes <= FI_OPX_HFI1_PACKET_MTU); /* SDMA expected receive w/TID will use CTRL 1, 2 or 3. Replays should indicate we are not using TID (CTRL 0) */ - int tidctrl = KDETH_GET(hdr->stl.kdeth.offset_ver_tid, TIDCTRL); + int tidctrl = KDETH_GET(hdr->kdeth.offset_ver_tid, TIDCTRL); assert((tidctrl == 0) || (tidctrl == 1) || (tidctrl == 2) || (tidctrl == 3)); /* Copy only if there's a replay payload and TID direct rdma was NOT done. @@ -2198,8 +2154,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, "TID REPLAY rbuf_qws %p, sbuf_qws %p, bytes %u/%#x, target_context->byte_counter %p\n", (void*)rbuf_qws, (void*)sbuf_qws, bytes, bytes, &target_context->byte_counter); if (target_context->flags & FI_OPX_CQ_CONTEXT_HMEM) { - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) target_context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) target_context->hmem_info_qws; assert(hmem_info->iface > FI_HMEM_SYSTEM); opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, rbuf_qws, sbuf_qws, bytes, @@ -2216,9 +2171,9 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "NOT REPLAY tidctrl %#x, tid %#X, tid0M %#X, tidoffset %#X rbuf_qws %p, " "sbuf_qws %p, bytes %u/%#x, target_context->byte_counter %p\n", - tidctrl, KDETH_GET(hdr->stl.kdeth.offset_ver_tid, TID), - KDETH_GET(hdr->stl.kdeth.offset_ver_tid, OM), - KDETH_GET(hdr->stl.kdeth.offset_ver_tid, OFFSET), + tidctrl, KDETH_GET(hdr->kdeth.offset_ver_tid, TID), + KDETH_GET(hdr->kdeth.offset_ver_tid, OM), + KDETH_GET(hdr->kdeth.offset_ver_tid, OFFSET), (void*)rbuf_qws, (void*)sbuf_qws, bytes, bytes, &target_context->byte_counter); } @@ -2280,11 +2235,22 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; assert(bytes <= FI_OPX_HFI1_PACKET_MTU); +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received RMA PUT data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif // Optimize Memcpy if(hdr->dput.target.op == FI_NOOP - 1 && hdr->dput.target.dt == FI_VOID - 1) { @@ -2313,13 +2279,24 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. 
The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; assert(cc); assert(bytes <= FI_OPX_HFI1_PACKET_MTU); +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received RMA GET data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif if (hdr->dput.target.dt == (FI_VOID - 1)) { OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, OPX_HMEM_NO_HANDLE, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, @@ -2361,10 +2338,21 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received ATOMIC FETCH data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif assert(bytes > sizeof(*dput_fetch)); uint64_t hmem_device; enum fi_hmem_iface hmem_iface = fi_opx_mr_get_iface(opx_mr, &hmem_device); @@ -2387,7 +2375,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, // Do the FETCH part of this atomic fetch operation union fi_opx_hfi1_deferred_work *work = - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, (const void * const) hdr, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, hdr, (const void * const) payload, bytes, u8_rx, origin_rs, 1, &dput_iov, hdr->dput.target.op, @@ -2399,7 +2387,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, fi_opx_atomic_completion_action, is_intranode, reliability, - u32_ext_rx); + u32_ext_rx, + hfi1_type); if(work == NULL) { // The FETCH completed without being deferred, now do // the actual atomic operation. @@ -2433,10 +2422,21 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received ATOMIC COMPARE FETCH data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? 
"ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif assert(bytes > sizeof(*dput_fetch)); uint64_t hmem_device; enum fi_hmem_iface hmem_iface = fi_opx_mr_get_iface(opx_mr, &hmem_device); @@ -2459,7 +2459,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, // Do the FETCH part of this atomic fetch operation union fi_opx_hfi1_deferred_work *work = - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, (const void * const) hdr, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, hdr, (const void * const) payload, bytes, u8_rx, origin_rs, 1, &dput_iov, hdr->dput.target.op, @@ -2471,7 +2471,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, fi_opx_atomic_completion_action, is_intranode, reliability, - u32_ext_rx); + u32_ext_rx, + hfi1_type); if(work == NULL) { // The FETCH completed without being deferred, now do // the actual atomic operation. @@ -2511,7 +2512,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_non_eager(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2519,22 +2520,23 @@ void fi_opx_ep_rx_process_header_non_eager(struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - if (opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { fi_opx_ep_rx_process_header_rzv_cts(opx_ep, hdr, payload, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, + hfi1_type); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { fi_opx_ep_rx_process_header_rzv_data(opx_ep, hdr, payload, payload_bytes, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, + hfi1_type); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_ACK) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "unimplemented opcode (%u); abort\n", opcode); @@ -2552,48 +2554,65 @@ void fi_opx_ep_rx_process_header_non_eager(struct fid_ep *ep, "reliability exception with opcode %d, dropped\n", opcode); } else { fprintf(stderr, "unimplemented opcode (%#x); abort\n", opcode); + fprintf(stderr, "%s:%u payload %p, payload bytes %zu, is_instranode %u, %#16.16llX %#16.16llX %#16.16llX %#16.16llX %#16.16llX %#16.16llX %#16.16llX \n", + __func__, __LINE__, payload, payload_bytes, is_intranode, + (long long) hdr->qw_9B[0], + (long long) hdr->qw_9B[1], + (long long) hdr->qw_9B[2], + (long long) hdr->qw_9B[3], + (long long) hdr->qw_9B[4], + (long long) hdr->qw_9B[5], + (long long) hdr->qw_9B[6]); abort(); } } __OPX_FORCE_INLINE__ -uint64_t fi_opx_mp_egr_id_from_nth_packet(const union fi_opx_hfi1_packet_hdr *hdr) { - +uint64_t fi_opx_mp_egr_id_from_nth_packet(const union opx_hfi1_packet_hdr *hdr, + const uint64_t slid) +{ return ((uint64_t) hdr->mp_eager_nth.mp_egr_uid) | (((uint64_t)hdr->reliability.origin_tx) << 48) | - (((uint64_t)hdr->stl.lrh.slid) << 32); + (((uint64_t)slid) << 32); } __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, - union fi_opx_context *context, + struct opx_context *context, union fi_opx_mp_egr_id mp_egr_id, const unsigned 
is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - const uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; const uint64_t is_hmem = context->flags & FI_OPX_CQ_CONTEXT_HMEM; struct fi_opx_hfi1_ue_packet *uepkt = opx_ep->rx->mp_egr_queue.ue.head; FI_OPX_DEBUG_COUNTERS_DECLARE_TMP(length); while (uepkt && context->byte_counter) { - if (fi_opx_mp_egr_id_from_nth_packet(&uepkt->hdr) == mp_egr_id.id) { + uint64_t slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = (uint64_t)(uepkt->hdr.lrh_9B.slid); + } else { + slid = htons(((uepkt->hdr.lrh_16B.slid20 << 20) | (uepkt->hdr.lrh_16B.slid))); + } - complete_receive_operation(ep, + if (fi_opx_mp_egr_id_from_nth_packet(&uepkt->hdr, slid) == mp_egr_id.id) { + + opx_ep_complete_receive_operation(ep, &uepkt->hdr, &uepkt->payload, 0, /* OFI Tag, N/A for multi-packet eager nth */ context, FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, - is_context_ext, OPX_MULTI_RECV_FALSE, OPX_INTRANODE_FALSE, is_hmem, lock_required, - reliability); + reliability, + hfi1_type); /* Remove this packet and get the next one */ uepkt = fi_opx_hfi1_ue_packet_slist_remove_item(uepkt, @@ -2609,7 +2628,7 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2617,7 +2636,9 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -2626,10 +2647,10 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "search the match queue\n"); const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; - assert((kind == FI_OPX_KIND_TAG && opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) || - (kind == FI_OPX_KIND_MSG && opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST)); - union fi_opx_context * context = opx_ep->rx->queue[kind].mq.head; - union fi_opx_context * prev = NULL; + assert((kind == FI_OPX_KIND_TAG && FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) || + (kind == FI_OPX_KIND_MSG && !FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode))); + struct opx_context *context = (struct opx_context *) opx_ep->rx->queue[kind].mq.head; + struct opx_context *prev = NULL; while ( context && @@ -2638,7 +2659,8 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, context, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, - is_intranode) + is_intranode, + slid) ) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "context = %p\n", context); prev = context; @@ -2649,58 +2671,59 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "did not find a match .. 
add this packet to the unexpected queue\n"); - if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST)) + if (OFI_LIKELY(FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode))) fi_opx_ep_rx_append_ue_tag(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); else fi_opx_ep_rx_append_ue_msg(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); return; } /* Found a match. Remove from the match queue */ - fi_opx_context_slist_remove_item(context, prev, &opx_ep->rx->queue[kind].mq); + slist_remove(&opx_ep->rx->queue[kind].mq, + (struct slist_entry *) context, + (struct slist_entry *) prev); - uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; uint64_t is_hmem = context->flags & FI_OPX_CQ_CONTEXT_HMEM; + /* Copy this packet's payload to the context's buffer. */ - complete_receive_operation_internal(ep, hdr, payload, - hdr->match.ofi_tag, &context, + opx_ep_complete_receive_operation(ep, hdr, payload, + hdr->match.ofi_tag, context, opcode, - is_context_ext, OPX_MULTI_RECV_FALSE, OPX_INTRANODE_FALSE, /* Should always be false for mp_eager */ is_hmem, lock_required, - reliability); + reliability, + hfi1_type); const union fi_opx_mp_egr_id mp_egr_id = { .uid = hdr->reliability.psn, .origin_tx = hdr->reliability.origin_tx, - .slid = hdr->stl.lrh.slid, + .slid = slid, .unused = 0}; /* Process any other early arrival packets that are part of this multi-packet egr */ - fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, lock_required, reliability); + fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, lock_required, reliability, hfi1_type); /* Only add this to the multi-packet egr queue if we still expect additional packets to come in */ if (context->byte_counter) { context->mp_egr_id = mp_egr_id; - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->mp_egr_queue.mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->mp_egr_queue.mq); } else { context->next = NULL; - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_err_ptr); + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + lock_required); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_completed_eager_first); } @@ -2708,7 +2731,7 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2716,7 +2739,9 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid) { struct 
fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -2724,9 +2749,9 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, /* Search mp-eager queue for the context w/ matching mp-eager ID */ - const uint64_t mp_egr_id = fi_opx_mp_egr_id_from_nth_packet(hdr); - union fi_opx_context *context = opx_ep->rx->mp_egr_queue.mq.head; - union fi_opx_context *prev = NULL; + const uint64_t mp_egr_id = fi_opx_mp_egr_id_from_nth_packet(hdr, slid); + struct opx_context *context = (struct opx_context *) opx_ep->rx->mp_egr_queue.mq.head; + struct opx_context *prev = NULL; FI_OPX_DEBUG_COUNTERS_DECLARE_TMP(length); @@ -2745,7 +2770,7 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, "process_header_mp_eager_nth: did not find a match .. add this packet to the unexpected queue\n"); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_nth_no_match); - fi_opx_ep_rx_append_ue_egr(opx_ep->rx, hdr, payload, payload_bytes); + fi_opx_ep_rx_append_ue_egr(opx_ep->rx, hdr, payload, payload_bytes, slid); return; } @@ -2753,30 +2778,30 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_nth_match); /* We found a match! */ - const uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, hdr, payload, 0, /* OFI Tag, N/A for multi-packet eager nth */ context, opcode, // FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH - is_context_ext, OPX_MULTI_RECV_FALSE, is_intranode, context->flags & FI_OPX_CQ_CONTEXT_HMEM, lock_required, - reliability); + reliability, + hfi1_type); if (!context->byte_counter) { /* Remove from the mp-eager queue */ - fi_opx_context_slist_remove_item(context, prev, &opx_ep->rx->mp_egr_queue.mq); + slist_remove(&opx_ep->rx->mp_egr_queue.mq, + (struct slist_entry *) context, + (struct slist_entry *) prev); - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_err_ptr); + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + lock_required); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_completed_eager_nth); @@ -2785,7 +2810,7 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, static inline void fi_opx_ep_rx_process_header (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2793,7 +2818,9 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -2803,15 +2830,15 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, static_flags, opcode, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, hfi1_type); return; - } else if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || - opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { + } else if 
(FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { fi_opx_ep_rx_process_header_mp_eager_first(ep, hdr, payload, payload_bytes, static_flags, opcode, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, + hfi1_type, slid); return; } else if (opcode == FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH) { @@ -2819,12 +2846,10 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, static_flags, opcode, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); return; } - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - assert(opcode >= FI_OPX_HFI_BTH_OPCODE_MSG_INJECT); /* search the match queue */ @@ -2832,8 +2857,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, assert(static_flags & (FI_TAGGED | FI_MSG)); const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; - union fi_opx_context * context = opx_ep->rx->queue[kind].mq.head; - union fi_opx_context * prev = NULL; + struct opx_context *context = (struct opx_context *) opx_ep->rx->queue[kind].mq.head; + struct opx_context *prev = NULL; while (OFI_LIKELY(context != NULL) && !is_match(opx_ep, @@ -2841,7 +2866,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, context, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, - is_intranode)) { + is_intranode, + slid)) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "context = %p\n", context); prev = context; context = context->next; @@ -2854,12 +2880,12 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, fi_opx_ep_rx_append_ue_tag(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); } else { fi_opx_ep_rx_append_ue_msg(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); } return; @@ -2876,28 +2902,27 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, if (prev) prev->next = context->next; else { - assert(opx_ep->rx->queue[kind].mq.head == context); - opx_ep->rx->queue[kind].mq.head = context->next; + assert(opx_ep->rx->queue[kind].mq.head == (struct slist_entry *) context); + opx_ep->rx->queue[kind].mq.head = (struct slist_entry *) context->next; } if (context->next == NULL){ - assert(opx_ep->rx->queue[kind].mq.tail == context); - opx_ep->rx->queue[kind].mq.tail = prev; + assert(opx_ep->rx->queue[kind].mq.tail == (struct slist_entry *) context); + opx_ep->rx->queue[kind].mq.tail = (struct slist_entry *) prev; } context->next = NULL; - complete_receive_operation(ep, hdr, payload, + opx_ep_complete_receive_operation(ep, hdr, payload, hdr->match.ofi_tag, context, opcode, - rx_op_flags & FI_OPX_CQ_CONTEXT_EXT, OPX_MULTI_RECV_FALSE, is_intranode, rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM, lock_required, - reliability); + reliability, + hfi1_type); return; - } /* @@ -2907,17 +2932,16 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, const uint64_t recv_len = context->len; const uint64_t send_len = fi_opx_hfi1_packet_hdr_message_length(hdr); - assert(!(context->flags & FI_OPX_CQ_CONTEXT_EXT)); assert(!(context->flags & FI_OPX_CQ_CONTEXT_HMEM)); if (OFI_LIKELY(send_len <= recv_len)) { - complete_receive_operation(ep, hdr, payload, + opx_ep_complete_receive_operation(ep, hdr, payload, 0, context, opcode, - 
OPX_CONTEXT_EXTENDED_FALSE, OPX_MULTI_RECV_TRUE, is_intranode, OPX_HMEM_FALSE, lock_required, - reliability); + reliability, + hfi1_type); if (context->len < opx_ep->rx->min_multi_recv) { /* after processing this message there is not @@ -2930,7 +2954,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, if (prev) prev->next = context->next; else - opx_ep->rx->queue[kind].mq.head = context->next; + opx_ep->rx->queue[kind].mq.head = (struct slist_entry *) context->next; if (context->next == NULL) opx_ep->rx->queue[kind].mq.tail = NULL; @@ -2942,7 +2966,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, // to ensure that any pending ops are completed (eg rendezvous multi-receive) if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } if(context->byte_counter == 0) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_completed_ptr); } } } else { @@ -3097,47 +3121,35 @@ void fi_opx_ep_do_pending_work(struct fi_opx_ep *opx_ep) fi_opx_ep_do_pending_sdma_work(opx_ep); } -static inline -void fi_opx_ep_rx_poll (struct fid_ep *ep, - const uint64_t caps, - const enum ofi_reliability_kind reliability, - const uint64_t hdrq_mask) +__OPX_FORCE_INLINE__ +void fi_opx_ep_rx_poll_internal (struct fid_ep *ep, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - const enum ofi_reliability_kind kind = opx_ep->reliability->state.kind; const uint64_t rx_caps = (caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) ? caps : opx_ep->rx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); - if ( OFI_LIKELY((reliability == OPX_RELIABILITY) && (hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME)) ) { /* constant compile-time expression */ + if (OFI_LIKELY(hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME)) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); - } else if ( (reliability == OFI_RELIABILITY_KIND_RUNTIME) && (hdrq_mask == FI_OPX_HDRQ_MASK_2048) ) { /* constant compile-time expression */ - if (kind == OFI_RELIABILITY_KIND_ONLOAD) { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048); - } else { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048); - } - } else if ( (reliability == OFI_RELIABILITY_KIND_RUNTIME) && (hdrq_mask == FI_OPX_HDRQ_MASK_8192) ) { /* constant compile-time expression */ - if (kind == OFI_RELIABILITY_KIND_ONLOAD) { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192); - } else { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192); - } - } else if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, + hfi1_type); + } else if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - reliability, FI_OPX_HDRQ_MASK_2048); - } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { /* constant compile-time expression */ + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, + hfi1_type); + } else if (hdrq_mask == 
FI_OPX_HDRQ_MASK_8192) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - reliability, FI_OPX_HDRQ_MASK_8192); + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, + hfi1_type); } else { FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - reliability, hdrq_mask); + OFI_RELIABILITY_KIND_ONLOAD, hdrq_mask, + hfi1_type); } fi_opx_ep_do_pending_work(opx_ep); @@ -3149,49 +3161,52 @@ void fi_opx_ep_rx_poll (struct fid_ep *ep, } } +static inline +void fi_opx_ep_rx_poll (struct fid_ep *ep, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + fi_opx_ep_rx_poll_internal(ep, caps, reliability, hdrq_mask, OPX_HFI1_WFR); + } else if (hfi1_type & OPX_HFI1_JKR) { + fi_opx_ep_rx_poll_internal(ep, caps, reliability, hdrq_mask, OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + fi_opx_ep_rx_poll_internal(ep, caps, reliability, hdrq_mask, OPX_HFI1_JKR_9B); + } else { + abort(); + } +} + __OPX_FORCE_INLINE__ -int fi_opx_ep_cancel_context(struct fi_opx_ep * opx_ep, +int fi_opx_ep_cancel_context(struct fi_opx_ep *opx_ep, const uint64_t cancel_context, - union fi_opx_context * context, + struct opx_context *context, const uint64_t rx_op_flags, - const uint64_t is_context_ext, const int lock_required) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "unimplemented; abort\n"); abort(); - const uint64_t compare_context = is_context_ext ? - (uint64_t)(((struct fi_opx_context_ext *)context)->msg.op_context) : - (uint64_t)context; + const uint64_t compare_context = (uint64_t) context->err_entry.op_context; if (compare_context == cancel_context) { - struct fi_opx_context_ext * ext; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - return -FI_ENOMEM; - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - } - - ext->opx_context.byte_counter = 0; - ext->err_entry.op_context = (void *)cancel_context; - ext->err_entry.flags = rx_op_flags; - ext->err_entry.len = 0; - ext->err_entry.buf = 0; - ext->err_entry.data = 0; - ext->err_entry.tag = context->tag; - ext->err_entry.olen = 0; - ext->err_entry.err = FI_ECANCELED; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + context->byte_counter = 0; + context->err_entry.flags = rx_op_flags; + context->err_entry.len = 0; + context->err_entry.buf = 0; + context->err_entry.data = 0; + context->err_entry.tag = context->tag; + context->err_entry.olen = 0; + context->err_entry.err = FI_ECANCELED; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; /* post an 'error' completion event for the canceled receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, opx_ep->rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); return FI_ECANCELED; } @@ -3202,11 +3217,11 @@ int fi_opx_ep_cancel_context(struct fi_opx_ep * opx_ep, __OPX_FORCE_INLINE__ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, const uint64_t static_flags, - union fi_opx_context * context, - const uint64_t is_context_ext, + struct opx_context * context, const uint64_t is_hmem, const int lock_required, 
- const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(static_flags & (FI_TAGGED | FI_MSG)); const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; @@ -3219,7 +3234,7 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, "searching unexpected queue\n"); __attribute__((__unused__)) bool from_hash_queue = false; - struct fi_opx_hfi1_ue_packet *uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind); + struct fi_opx_hfi1_ue_packet *uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind, hfi1_type); #ifndef FI_OPX_MATCH_HASH_DISABLE if (!uepkt && kind == FI_OPX_KIND_TAG) { @@ -3231,65 +3246,70 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, #endif if (uepkt) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "found a match, uepkt = %p\n", uepkt); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "UEPKT found a match, uepkt = %p\n", uepkt); - uint8_t is_mp_eager = (uepkt->hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || - uepkt->hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST); + uint8_t is_mp_eager = (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(uepkt->hdr.bth.opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST); - const unsigned is_intranode = fi_opx_hfi_is_intranode(uepkt->hdr.stl.lrh.slid); + const unsigned is_intranode = opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type); if (is_mp_eager) { - complete_receive_operation_internal(ep, - &uepkt->hdr, - &uepkt->payload, - uepkt->hdr.match.ofi_tag, - &context, - uepkt->hdr.stl.bth.opcode, - is_context_ext, - OPX_MULTI_RECV_FALSE, - is_intranode, - is_hmem, - lock_required, - reliability); + opx_ep_complete_receive_operation(ep, + &uepkt->hdr, + &uepkt->payload, + uepkt->hdr.match.ofi_tag, + context, + uepkt->hdr.bth.opcode, + OPX_MULTI_RECV_FALSE, + is_intranode, + is_hmem, + lock_required, + reliability, + hfi1_type); /* Since this is the first multi-packet eager packet, the uid portion of the mp_egr_id will be this packet's PSN */ + uint64_t slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = (uint64_t)uepkt->hdr.lrh_9B.slid; + } else { + slid = htons((uint64_t)((uepkt->hdr.lrh_16B.slid20 << 20) | (uepkt->hdr.lrh_16B.slid))); + } const union fi_opx_mp_egr_id mp_egr_id = { .uid = uepkt->hdr.reliability.psn, .origin_tx = uepkt->hdr.reliability.origin_tx, - .slid = uepkt->hdr.stl.lrh.slid, + .slid = slid, .unused = 0 }; - fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, lock_required, reliability); + fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, + lock_required, reliability, hfi1_type); if (context->byte_counter) { context->mp_egr_id = mp_egr_id; - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->mp_egr_queue.mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->mp_egr_queue.mq); } else { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_completed_process_context); context->next = NULL; - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_err_ptr); + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + lock_required); } } } else { - 
complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, &uepkt->hdr, &uepkt->payload, uepkt->hdr.match.ofi_tag, context, - uepkt->hdr.stl.bth.opcode, - is_context_ext, + uepkt->hdr.bth.opcode, OPX_MULTI_RECV_FALSE, is_intranode, is_hmem, lock_required, - reliability); + reliability, + hfi1_type); } #ifndef FI_OPX_MATCH_HASH_DISABLE @@ -3302,7 +3322,6 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, fi_opx_hfi1_ue_packet_slist_remove_item(uepkt, &opx_ep->rx->queue[kind].ue); #endif - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); return 0; } @@ -3313,19 +3332,16 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, * (context) to the appropriate match queue */ context->next = NULL; - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->queue[kind].mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->queue[kind].mq); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); return 0; } /* rx_op_flags is only checked for FI_PEEK | FI_CLAIM | FI_MULTI_RECV - * rx_op_flags is only used if FI_PEEK | FI_CLAIM | cancel_context - * is_context_ext is only used if FI_PEEK | cancel_context | iovec + * rx_op_flags is only used if FI_PEEK | FI_CLAIM * * The "normal" data movement functions, such as fi_[t]recv(), can safely - * specify '0' for cancel_context, rx_op_flags, and is_context_ext, in - * order to reduce code path. + * specify '0' for rx_op_flags in order to reduce code path. * * TODO - use payload pointer? keep data in hfi eager buffer as long * as possible to avoid memcpy? @@ -3334,35 +3350,24 @@ __OPX_FORCE_INLINE__ int fi_opx_ep_rx_process_context ( struct fi_opx_ep * opx_ep, const uint64_t static_flags, - const uint64_t cancel_context, union fi_opx_context * context, - const uint64_t rx_op_flags, const uint64_t is_context_ext, + struct opx_context *context, + const uint64_t rx_op_flags, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - if (cancel_context) { /* branch should compile out */ - int rc = fi_opx_ep_cancel_context(opx_ep, cancel_context, context, - rx_op_flags, is_context_ext, lock_required); - - if (rc != FI_SUCCESS) return rc; - } - if (OFI_LIKELY((rx_op_flags & (FI_PEEK | FI_CLAIM | FI_MULTI_RECV)) == 0)) { if (is_hmem) { /* branch should compile out */ - assert(is_context_ext); return fi_opx_ep_process_context_match_ue_packets(opx_ep, static_flags, context, - OPX_CONTEXT_EXTENDED_TRUE, - OPX_HMEM_TRUE, - lock_required, reliability); + OPX_HMEM_TRUE, lock_required, + reliability, hfi1_type); } return fi_opx_ep_process_context_match_ue_packets(opx_ep, static_flags, context, - OPX_CONTEXT_EXTENDED_FALSE, - OPX_HMEM_FALSE, - lock_required, reliability); + OPX_HMEM_FALSE, lock_required, + reliability, hfi1_type); } else { /* @@ -3373,10 +3378,10 @@ int fi_opx_ep_rx_process_context ( "process peek, claim, or multi-receive context\n"); fi_opx_ep_rx_process_context_noinline(opx_ep, static_flags, - context, rx_op_flags, is_context_ext, is_hmem, lock_required, av_type, reliability); + context, rx_op_flags, is_hmem, lock_required, av_type, + reliability, hfi1_type); } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); return 0; } @@ -3414,10 +3419,12 @@ fi_addr_t fi_opx_ep_get_src_addr(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_ep_rx_recv_internal 
(struct fi_opx_ep *opx_ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *user_context, const int lock_required, const enum fi_av_type av_type, const uint64_t static_flags, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(((static_flags & (FI_TAGGED | FI_MSG)) == FI_TAGGED) || ((static_flags & (FI_TAGGED | FI_MSG)) == FI_MSG)); @@ -3426,39 +3433,44 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_TAGGED, opx_ep->debug_counters.recv.posted_recv_tag); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== POST RECV: context = %p\n", context); + "===================================== POST RECV: context = %p\n", + user_context); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "POST-RECV"); + + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } const uint64_t rx_op_flags = opx_ep->rx->op_flags; uint64_t rx_caps = opx_ep->rx->caps; - assert(context); - assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ - union fi_opx_context * opx_context = (union fi_opx_context *)context; - opx_context->flags = rx_op_flags; - opx_context->len = len; - opx_context->buf = buf; + context->next = NULL; + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->flags = rx_op_flags; + context->len = len; + context->buf = buf; + context->src_addr = (rx_caps & FI_DIRECTED_RECV) + ? 
fi_opx_ep_get_src_addr(opx_ep, av_type, src_addr) + : FI_ADDR_UNSPEC; + context->tag = tag; + context->ignore = ignore; + context->byte_counter = (uint64_t)-1; - if (rx_caps & FI_DIRECTED_RECV) { - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, src_addr); - } else { - opx_context->src_addr = FI_ADDR_UNSPEC; - } #ifdef FI_OPX_TRACE fprintf(stderr,"fi_opx_recv_generic from source addr:\n"); - FI_OPX_ADDR_DUMP(&opx_context->src_addr); + FI_OPX_ADDR_DUMP(&context->src_addr); #endif - opx_context->tag = tag; - opx_context->ignore = ignore; - opx_context->byte_counter = (uint64_t)-1; - assert(IS_PROGRESS_MANUAL(opx_ep->domain)); if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "process context (check unexpected queue, append match queue)\n"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "process context (check unexpected queue, append match queue)\n"); #ifdef OPX_HMEM uint64_t hmem_device; @@ -3466,48 +3478,39 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, if (hmem_iface != FI_HMEM_SYSTEM) { FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_MSG, opx_ep->debug_counters.hmem.posted_recv_msg); FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_TAGGED, opx_ep->debug_counters.hmem.posted_recv_tag); - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== POST RECV RETURN FI_ENOMEM\n"); - return -FI_ENOMEM; - } - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &ext->hmem_info_qws[0]; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &context->hmem_info_qws[0]; hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; hmem_info->hmem_dev_reg_handle = ((struct fi_opx_mr *)desc)->hmem_dev_reg_handle; - ext->err_entry.err = 0; - ext->opx_context = *opx_context; - ext->opx_context.flags = rx_op_flags | FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM; - ext->msg.op_context = (struct fi_context2 *) context; + context->flags |= FI_OPX_CQ_CONTEXT_HMEM; fi_opx_ep_rx_process_context(opx_ep, static_flags, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, + context, 0, // rx_op_flags - OPX_CONTEXT_EXTENDED_TRUE, OPX_HMEM_TRUE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } else #endif { fi_opx_ep_rx_process_context(opx_ep, static_flags, - OPX_CANCEL_CONTEXT_FALSE, context, 0, // rx_op_flags - OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECV RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECV"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== POST RECV RETURN\n"); return 0; } @@ -3527,18 +3530,26 @@ static inline ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, const struct fi_msg *msg, uint64_t flags, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECVMSG\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "POST-RECVMSG"); FI_OPX_DEBUG_COUNTERS_INC_COND(!(flags & FI_MULTI_RECV), 
opx_ep->debug_counters.recv.posted_recv_msg); FI_OPX_DEBUG_COUNTERS_INC_COND((flags & FI_MULTI_RECV), opx_ep->debug_counters.recv.posted_multi_recv); assert(!lock_required); - assert(msg->context); - assert(((uintptr_t)msg->context & 0x07ull) == 0); /* must be 8 byte aligned */ - if (OFI_LIKELY(flags & FI_MULTI_RECV)) { - union fi_opx_context * opx_context = (union fi_opx_context *) msg->context; + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECVMSG"); + return -FI_ENOMEM; + } + context->next = NULL; + context->err_entry.err = 0; + context->err_entry.op_context = msg->context; + if (OFI_LIKELY(flags & FI_MULTI_RECV)) { uint64_t len = msg->msg_iov[0].iov_len; void * base = msg->msg_iov[0].iov_base; @@ -3550,47 +3561,43 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, base = (void *)new_base; } assert(((uintptr_t)base & 0x07ull) == 0); - assert(len >= (sizeof(union fi_opx_context) + opx_ep->rx->min_multi_recv)); - opx_context->flags = FI_MULTI_RECV; - opx_context->len = len - sizeof(union fi_opx_context); - opx_context->buf = (void *)((uintptr_t)base + sizeof(union fi_opx_context)); - opx_context->next = NULL; - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - opx_context->byte_counter = 0; - opx_context->multi_recv_next = (union fi_opx_context *)base; - opx_context->ignore = (uint64_t)-1; + assert(len >= (sizeof(struct opx_context) + opx_ep->rx->min_multi_recv)); + context->flags = FI_MULTI_RECV; + context->len = len - sizeof(struct opx_context); + context->buf = (void *)((uintptr_t)base + sizeof(struct opx_context)); + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->byte_counter = 0; + context->ignore = (uint64_t)-1; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } else if (msg->iov_count == 0) { - union fi_opx_context * opx_context = (union fi_opx_context *) msg->context; - opx_context->flags = flags; - opx_context->len = 0; - opx_context->buf = NULL; - opx_context->next = NULL; - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - opx_context->tag = 0; - opx_context->ignore = (uint64_t)-1; - opx_context->byte_counter = (uint64_t)-1; + context->flags = flags; + context->len = 0; + context->buf = NULL; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->byte_counter = (uint64_t)-1; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } @@ -3615,97 +3622,74 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, #endif if (hmem_iface != FI_HMEM_SYSTEM) { 
FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.posted_recv_msg); - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== POST RECVMSG (HMEM) RETURN FI_ENOMEM\n"); - return -FI_ENOMEM; - } - - ext->err_entry.err = 0; - ext->opx_context.flags = flags | FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM; - ext->opx_context.len = msg->msg_iov[0].iov_len; - ext->opx_context.buf = msg->msg_iov[0].iov_base; - ext->opx_context.byte_counter = (uint64_t)-1; - ext->opx_context.src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - ext->opx_context.tag = 0; - ext->opx_context.ignore = (uint64_t)-1; - ext->msg.op_context = (struct fi_context2 *)msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; - - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &ext->hmem_info_qws[0]; + context->flags = flags | FI_OPX_CQ_CONTEXT_HMEM; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; + context->byte_counter = (uint64_t)-1; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; + + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &context->hmem_info_qws[0]; hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, ext->opx_context.flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, context->flags, OPX_HMEM_TRUE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG (HMEM) RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } #endif if (msg->iov_count == 1) { - assert(msg->context); - assert(((uintptr_t)msg->context & 0x07ull) == 0); /* must be 8 byte aligned */ - - union fi_opx_context * opx_context = - (union fi_opx_context *) msg->context; - opx_context->flags = flags; - opx_context->len = msg->msg_iov[0].iov_len; - opx_context->buf = msg->msg_iov[0].iov_base; - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - opx_context->tag = 0; - opx_context->ignore = (uint64_t)-1; - opx_context->byte_counter = (uint64_t)-1; + context->flags = flags; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->byte_counter = (uint64_t)-1; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } /* msg->iov_count > 1 */ - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECVMSG 
RETURN FI_ENOMEM\n"); - return -FI_ENOMEM; - } - - ext->opx_context.flags = flags | FI_OPX_CQ_CONTEXT_EXT; - ext->opx_context.byte_counter = (uint64_t)-1; - ext->opx_context.src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - ext->opx_context.tag = 0; - ext->opx_context.ignore = (uint64_t)-1; - ext->msg.op_context = (struct fi_context2 *)msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; + context->flags = flags; + context->byte_counter = (uint64_t)-1; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, - ext->opx_context.flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); - + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } @@ -3759,7 +3743,7 @@ void fi_opx_ep_tx_cq_completion_rzv(struct fid_ep *ep, struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); assert(context); assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ - union fi_opx_context * opx_context = (union fi_opx_context *)context; + struct opx_context *opx_context = (struct opx_context *) context; opx_context->flags = FI_SEND | (caps & (FI_TAGGED | FI_MSG)); opx_context->len = len; opx_context->buf = NULL; /* receive data buffer */ @@ -3767,7 +3751,7 @@ void fi_opx_ep_tx_cq_completion_rzv(struct fid_ep *ep, opx_context->next = NULL; if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(opx_context, opx_ep->tx->cq_pending_ptr); + slist_insert_tail((struct slist_entry *) opx_context, opx_ep->tx->cq_pending_ptr); } __OPX_FORCE_INLINE__ @@ -3780,106 +3764,153 @@ ssize_t fi_opx_hfi1_tx_send_try_mp_egr (struct fid_ep *ep, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; assert (!fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)); - assert (len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE); + assert (len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)); const uint64_t bth_rx = ((uint64_t)addr.hfi1_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); /* Write the first packet */ uint32_t first_packet_psn; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (begin)\n"); uint8_t *buf_bytes_ptr = (uint8_t *) buf; - ssize_t rc = fi_opx_hfi1_tx_send_mp_egr_first (opx_ep, (void **) &buf_bytes_ptr, len, desc, - opx_ep->hmem_copy_buf, pbc_dlid, bth_rx, lrh_dlid, - addr, tag, data, lock_required, - caps, reliability, &first_packet_psn, - hmem_iface, hmem_device); + ssize_t rc; + rc = fi_opx_hfi1_tx_send_mp_egr_first_common (opx_ep, (void **) 
&buf_bytes_ptr, len, desc, + opx_ep->hmem_copy_buf, pbc_dlid, bth_rx, lrh_dlid, + addr, tag, data, lock_required, + tx_op_flags, caps, reliability, &first_packet_psn, + hmem_iface, hmem_device, hfi1_type); if (rc != FI_SUCCESS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_fall_back_to_rzv); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (return %zd)\n", rc); + return rc; } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_first_packets); /* The first packet was successful. We're now committed to finishing this */ - ssize_t payload_remaining = len - FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - uint32_t payload_offset = FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; + ssize_t payload_remaining = len - FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + uint32_t payload_offset = FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER FIRST NTH (payload_remaining %zu)\n", payload_remaining); /* Write all the full nth packets */ - while (payload_remaining >= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE) { - rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, + while (payload_remaining >= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)) { + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, + first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, + lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_nth_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, - lock_required, reliability); + lock_required, reliability, hfi1_type); + } if (rc != FI_SUCCESS) { if (rc == -FI_ENOBUFS) { /* Insufficient credits. Try forcing a credit return and retry. 
*/ - fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps); + fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps, hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_force_cr); } else { - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); } do { - rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, + first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, + lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_nth_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, - lock_required, reliability); + lock_required, reliability, hfi1_type); + } + if (rc == -FI_EAGAIN) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } while (rc != FI_SUCCESS); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_packets); - payload_remaining -= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - payload_offset += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; + payload_remaining -= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + payload_offset += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (payload_remaining %zu)\n", payload_remaining); } + /* Write all the last packet (if necessary) */ if (payload_remaining > 0) { - rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER LAST (payload_remaining %zu)\n", payload_remaining); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + payload_remaining, + first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, + lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_last_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, payload_remaining, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, - lock_required, reliability); + lock_required, reliability, hfi1_type); + } if (rc != FI_SUCCESS) { if (rc == -FI_ENOBUFS) { /* Insufficient credits. Try forcing a credit return and retry. 
*/ - fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps); + fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps,hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_force_cr); } else { - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); } do { - rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + payload_remaining, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, + addr, lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_last_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, payload_remaining, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, - addr, lock_required, reliability); + addr, lock_required, reliability, hfi1_type); + } if (rc == -FI_EAGAIN) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } while (rc != FI_SUCCESS); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_packets); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER LAST (payload_remaining %zu)\n", payload_remaining); + } if (OFI_LIKELY(do_cq_completion)) { fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (end)\n"); + return FI_SUCCESS; } @@ -3902,28 +3933,28 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, - const bool mp_eager_fallback) + const bool mp_eager_fallback, + const enum opx_hfi1_type hfi1_type) { ssize_t rc; - if(is_contiguous) { rc = FI_OPX_FABRIC_TX_SEND_EGR(ep, buf, len, - desc, addr.fi, tag, context, data, - lock_required, - override_flags, tx_op_flags, addr.hfi1_rx, - caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + desc, addr.fi, tag, context, data, + lock_required, + override_flags, tx_op_flags, addr.hfi1_rx, + caps, reliability, do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } else { rc = FI_OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + hmem_iface, hmem_device, hfi1_type); } - if (OFI_LIKELY(rc == FI_SUCCESS)) { return rc; + #ifndef FI_OPX_MP_EGR_DISABLE } else if (rc == -FI_ENOBUFS && mp_eager_fallback) { /* Insufficient credits. If the payload is big enough, @@ -3935,11 +3966,11 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, if (rc == -FI_ENOBUFS) { /* Insufficient credits. Try forcing a credit return and retry. */ - fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps); + fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps,hfi1_type); } else { /* Likely full replay buffers or waiting for reliability handshake init. 
A poll might help */ - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } /* Note that we'll only iterate this loop more than once if we got here @@ -3948,20 +3979,20 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, do { if(is_contiguous) { rc = FI_OPX_FABRIC_TX_SEND_EGR(ep, buf, len, - desc, addr.fi, tag, context, data, - lock_required, - override_flags, tx_op_flags, addr.hfi1_rx, - caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + desc, addr.fi, tag, context, data, + lock_required, + override_flags, tx_op_flags, addr.hfi1_rx, + caps, reliability, do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } else { rc = FI_OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + hmem_iface, hmem_device, hfi1_type); } - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } while (rc == -FI_ENOBUFS && loop++ < FI_OPX_EP_TX_SEND_EAGER_MAX_RETRIES); return rc; @@ -3981,56 +4012,35 @@ ssize_t fi_opx_ep_tx_send_rzv(struct fid_ep *ep, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - union fi_opx_context * opx_context = (union fi_opx_context *)context; - uintptr_t byte_counter_ptr; - uint64_t *byte_counter; - uint64_t fake_cntr; ssize_t rc; - if (OFI_LIKELY(do_cq_completion != 0)) { - assert(context); - assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ - byte_counter_ptr = (uintptr_t) &opx_context->byte_counter; - byte_counter = (uint64_t *) &opx_context->byte_counter; - } else { - // Give a 'fake' counter here to 'value' part of the SEND_RZV. 
This - // does look a bit weird, but it saves from a few if checks in - // SEND_RZV and won't store a pointer to the stack variable - // fake_cntr in the RZV protocol headers - byte_counter_ptr = (uintptr_t) NULL; - byte_counter = (uint64_t *) &fake_cntr; - } - do { if (is_contiguous) { rc = FI_OPX_FABRIC_TX_SEND_RZV( ep, buf, len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device); + caps, reliability, + do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } else { rc = FI_OPX_FABRIC_TX_SENDV_RZV( ep, local_iov, niov, total_len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device); + caps, reliability, + do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } + if (OFI_UNLIKELY(rc == -EAGAIN)) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } while (rc == -EAGAIN); - if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_completion_rzv(ep, context, len, - lock_required, tag, caps); - } - return rc; } @@ -4045,7 +4055,8 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, const unsigned override_flags, uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND (begin)\n"); @@ -4098,7 +4109,7 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, addr.hfi1_rx, addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND"); return -FI_EAGAIN; } @@ -4120,15 +4131,14 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, fi_opx_ep_tx_do_cq_completion(opx_ep, override_flags, tx_op_flags); if (total_len < opx_ep->tx->rzv_min_payload_bytes) { - const bool mp_eager_fallback = (total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE && + const bool mp_eager_fallback = (total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) && total_len <= opx_ep->tx->mp_eager_max_payload_bytes); if (total_len <= opx_ep->tx->pio_max_eager_tx_bytes) { - rc = fi_opx_ep_tx_send_try_eager(ep, buf, len, desc, addr, tag, context, local_iov, niov, total_len, data, lock_required, is_contiguous, override_flags, tx_op_flags, caps, reliability, do_cq_completion, hmem_iface, hmem_device, - mp_eager_fallback); + mp_eager_fallback, hfi1_type); if (OFI_LIKELY(rc == FI_SUCCESS)) { OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); return rc; @@ -4148,7 +4158,7 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, rc = fi_opx_hfi1_tx_send_try_mp_egr(ep, buf, len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, caps, reliability, do_cq_completion, - FI_HMEM_SYSTEM, 0ul); + FI_HMEM_SYSTEM, 0ul, hfi1_type); if (OFI_LIKELY(rc == FI_SUCCESS)) { OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); return rc; @@ -4179,7 +4189,8 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + hmem_iface, hmem_device, + hfi1_type); 
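Condensed for orientation: the fi_opx_ep_tx_send_internal() hunks above amount to a three-way protocol choice keyed off total_len. The sketch below is illustrative only; struct opx_tx_thresholds and opx_select_send_protocol() are invented names, with fields named after the values the diff reads from opx_ep->tx, and the real code additionally gates multi-packet eager on FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) and falls back from eager to MP-eager on -FI_ENOBUFS.

#include <stddef.h>

/* Stand-ins for the thresholds the diff reads from opx_ep->tx. */
struct opx_tx_thresholds {
	size_t pio_max_eager_tx_bytes;     /* largest single-packet eager send */
	size_t mp_eager_max_payload_bytes; /* largest multi-packet eager send */
	size_t rzv_min_payload_bytes;      /* smallest rendezvous send */
};

enum opx_send_protocol { OPX_SEND_EAGER, OPX_SEND_MP_EAGER, OPX_SEND_RZV };

static inline enum opx_send_protocol
opx_select_send_protocol(const struct opx_tx_thresholds *tx, size_t total_len)
{
	if (total_len < tx->rzv_min_payload_bytes) {
		if (total_len <= tx->pio_max_eager_tx_bytes)
			return OPX_SEND_EAGER;     /* one PIO eager packet */
		if (total_len <= tx->mp_eager_max_payload_bytes)
			return OPX_SEND_MP_EAGER;  /* chunked eager: first/nth/last */
	}
	return OPX_SEND_RZV;                       /* rendezvous for large sends */
}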
OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -4199,7 +4210,8 @@ ssize_t fi_opx_ep_tx_send(struct fid_ep *ep, const unsigned override_flags, uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -4208,7 +4220,8 @@ ssize_t fi_opx_ep_tx_send(struct fid_ep *ep, ssize_t rc = fi_opx_ep_tx_send_internal(ep, buf, len, desc, dest_addr, tag, context, data, FI_OPX_LOCK_NOT_REQUIRED, av_type, is_contiguous, override_flags, - tx_op_flags, caps, reliability); + tx_op_flags, caps, reliability, + hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -4225,8 +4238,10 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, const uint32_t data, const int lock_required, const enum fi_av_type av_type, + uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { // Exactly one of FI_MSG or FI_TAGGED should be on assert((caps & (FI_MSG | FI_TAGGED)) && @@ -4248,7 +4263,8 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, FI_SELECTIVE_COMPLETION, // op flags to turn off context caps, - reliability); + reliability, + hfi1_type); } else { assert(len <= FI_OPX_HFI1_PACKET_IMM); } @@ -4269,7 +4285,7 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, const union fi_opx_addr addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,dest_addr); const ssize_t rc = FI_OPX_FABRIC_TX_INJECT(ep, buf, len, addr.fi, tag, data, - lock_required, addr.hfi1_rx, caps, reliability); + lock_required, addr.hfi1_rx, tx_op_flags, caps, reliability, hfi1_type); if (OFI_UNLIKELY(rc == -EAGAIN)) { // In this case we are probably out of replay buffers. To deal @@ -4277,7 +4293,7 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, // process any incoming ACKs, hopefully releasing a buffer for // reuse. 
fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, - FI_OPX_HDRQ_MASK_RUNTIME); + FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "INJECT"); @@ -4296,8 +4312,10 @@ ssize_t fi_opx_ep_tx_inject(struct fid_ep *ep, const uint32_t data, const int lock_required, const enum fi_av_type av_type, + uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -4305,7 +4323,7 @@ ssize_t fi_opx_ep_tx_inject(struct fid_ep *ep, ssize_t rc = fi_opx_ep_tx_inject_internal(ep, buf, len, dest_addr, tag, data, FI_OPX_LOCK_NOT_REQUIRED, av_type, - caps, reliability); + tx_op_flags, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -4318,14 +4336,15 @@ ssize_t fi_opx_recv_generic(struct fid_ep *ep, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t static_flags, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_ep_rx_recv_internal(opx_ep, buf, len, desc, src_addr, tag, ignore, context, FI_OPX_LOCK_NOT_REQUIRED, av_type, - static_flags, reliability); + static_flags, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -4335,12 +4354,14 @@ __OPX_FORCE_INLINE__ ssize_t fi_opx_recvmsg_generic(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type ) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_ep_rx_recvmsg_internal(opx_ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, reliability); + ssize_t rc = fi_opx_ep_rx_recvmsg_internal(opx_ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; diff --git a/prov/opx/include/rdma/opx/fi_opx_eq.h b/prov/opx/include/rdma/opx/fi_opx_eq.h index d77289f1de6..f386f05a2f8 100644 --- a/prov/opx/include/rdma/opx/fi_opx_eq.h +++ b/prov/opx/include/rdma/opx/fi_opx_eq.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2022 Cornelis Networks. + * Copyright (C) 2022-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
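fi_opx_ep_tx_inject(), fi_opx_recv_generic(), and fi_opx_recvmsg_generic() above all follow the same wrapper idiom: take the endpoint lock only when the threading model demands it, then call the _internal flavor with FI_OPX_LOCK_NOT_REQUIRED so the locking branch can compile out of the hot path. A stripped-down sketch of that idiom, using invented toy_* names and a pthread mutex rather than the provider's lock type:

#include <pthread.h>
#include <stddef.h>
#include <sys/types.h>

struct toy_ep {
	pthread_mutex_t lock;
};

/* The _internal variant is always entered with locking already resolved;
 * a constant 0 lets the compiler drop the lock_required branches. */
static ssize_t toy_inject_internal(struct toy_ep *ep, const void *buf,
				   size_t len, const int lock_required)
{
	(void) ep; (void) buf; (void) len; (void) lock_required;
	return 0; /* ...build and send the packet here... */
}

static ssize_t toy_inject(struct toy_ep *ep, const void *buf, size_t len,
			  const int lock_required)
{
	if (lock_required)
		pthread_mutex_lock(&ep->lock);   /* fi_opx_lock_if_required() analogue */
	ssize_t rc = toy_inject_internal(ep, buf, len, 0);
	if (lock_required)
		pthread_mutex_unlock(&ep->lock); /* fi_opx_unlock_if_required() analogue */
	return rc;
}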
You may choose to be licensed under the terms of the GNU @@ -46,53 +46,53 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FORMAT, RELIABILITY, MASK, CAPS) \ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS) +#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) -#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS)\ +#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE)\ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_read_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_read_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count) \ { \ return fi_opx_cq_read_generic_non_locking(cq, buf, count, \ - FORMAT, RELIABILITY, MASK, CAPS); \ + FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_readfrom_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_readfrom_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count, \ fi_addr_t *src_addr) \ { \ return fi_opx_cq_readfrom_generic_non_locking(cq, buf, count, \ src_addr, FORMAT, RELIABILITY, MASK, \ - CAPS); \ + CAPS, HFI1_TYPE); \ } \ -#define FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FORMAT, RELIABILITY, MASK, CAPS) \ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS) +#define FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) -#define FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS)\ +#define FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE)\ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_read_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_read_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count) \ { \ return fi_opx_cq_read_generic_locking(cq, buf, count, \ - FORMAT, RELIABILITY, MASK, CAPS); \ + FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_readfrom_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_readfrom_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count, \ fi_addr_t *src_addr) \ { \ return fi_opx_cq_readfrom_generic_locking(cq, buf, count, \ src_addr, FORMAT, RELIABILITY, MASK, \ - CAPS); \ + CAPS, HFI1_TYPE); \ } \ -#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ - FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS) +#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) -#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ - fi_opx_ ## TYPE ## _ ## FORMAT ## _ ## LOCK ## _ ## RELIABILITY ## _ ## MASK ## _ ## CAPS +#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + fi_opx_ ## TYPE ## _ ## FORMAT ## _ ## LOCK ## _ 
## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE #ifdef __cplusplus @@ -128,9 +128,9 @@ struct fi_opx_cq { /* == CACHE LINE == */ - struct fi_opx_context_slist pending; - struct fi_opx_context_slist completed; - struct fi_opx_context_slist err; /* 'struct fi_opx_context_ext' element linked list */ + struct slist pending; + struct slist completed; + struct slist err; struct { uint64_t ep_count; @@ -139,7 +139,6 @@ struct fi_opx_cq { struct fi_opx_progress_track *progress_track; -// struct fi_opx_context_ext *err_tail; uint64_t pad_1[9]; struct fi_opx_domain *domain; @@ -174,60 +173,59 @@ int fi_opx_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, fprintf(stderr,"%s:%s():%d entry_id = %u\n", __FILE__, __func__, __LINE__, (entry)->recv.entry_id); \ }) -int fi_opx_cq_enqueue_err (struct fi_opx_cq * opx_cq, - struct fi_opx_context_ext * ext, +int fi_opx_cq_enqueue_err (struct fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required); struct fi_ops_cq * fi_opx_cq_select_non_locking_2048_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_non_locking_8192_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_non_locking_runtime_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_locking_2048_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_locking_8192_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_locking_runtime_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type); void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); static inline -int fi_opx_cq_enqueue_pending (struct fi_opx_cq * opx_cq, - union fi_opx_context * context, +int fi_opx_cq_enqueue_pending (struct fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required) { if (lock_required) { FI_WARN(fi_opx_global.prov, FI_LOG_CQ, "unimplemented\n"); abort(); } - union fi_opx_context * tail = opx_cq->pending.tail; - context->next = NULL; - if (tail) { - tail->next = context; - } else { - opx_cq->pending.head = context; - } - opx_cq->pending.tail = context; + slist_insert_tail((struct slist_entry *) context, &opx_cq->pending); return 0; } static inline -int fi_opx_cq_enqueue_completed (struct fi_opx_cq * opx_cq, - union fi_opx_context * context, +int fi_opx_cq_enqueue_completed (struct fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required) { assert(0 == context->byte_counter); @@ -237,19 +235,7 @@ int fi_opx_cq_enqueue_completed (struct fi_opx_cq * opx_cq, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== MANUAL PROGRESS COMPLETION CQ ENQUEUED\n"); - union fi_opx_context * tail = 
opx_cq->completed.tail; - context->next = NULL; - if (tail) { - - assert(NULL != opx_cq->completed.head); - tail->next = context; - opx_cq->completed.tail = context; - - } else { - assert(NULL == opx_cq->completed.head); - opx_cq->completed.head = context; - opx_cq->completed.tail = context; - } + slist_insert_tail((struct slist_entry *) context, &opx_cq->completed); return 0; } @@ -257,11 +243,10 @@ int fi_opx_cq_enqueue_completed (struct fi_opx_cq * opx_cq, static inline size_t fi_opx_cq_fill(uintptr_t output, - union fi_opx_context * context, + struct opx_context *context, const enum fi_cq_format format) { assert(!(context->flags & FI_OPX_CQ_CONTEXT_HMEM)); - assert(!(context->flags & FI_OPX_CQ_CONTEXT_EXT)); const uint64_t is_multi_recv = context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV; size_t return_size; @@ -289,9 +274,9 @@ static inline size_t fi_opx_cq_fill(uintptr_t output, } if (OFI_LIKELY(!is_multi_recv)) { - entry->op_context = (void *)context; + entry->op_context = context->err_entry.op_context; } else { - entry->op_context = (void *)context->multi_recv_context; + entry->op_context = ((struct opx_context *)context->multi_recv_context)->err_entry.op_context; } return return_size; @@ -314,74 +299,85 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, /* examine each context in the pending completion queue and, if the * operation is complete, initialize the cq entry in the application * buffer and remove the context from the queue. */ - union fi_opx_context * pending_head = opx_cq->pending.head; - union fi_opx_context * pending_tail = opx_cq->pending.tail; + struct opx_context *pending_head = (struct opx_context *) opx_cq->pending.head; + struct opx_context *pending_tail = (struct opx_context *) opx_cq->pending.tail; if (NULL != pending_head) { - union fi_opx_context * context = pending_head; - union fi_opx_context * prev = NULL; + struct opx_context *context = pending_head; + struct opx_context *prev = NULL; while ((count - num_entries) > 0 && context != NULL) { const uint64_t byte_counter = context->byte_counter; if (byte_counter == 0) { + bool free_context; if (context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV) { assert(!(context->flags & FI_OPX_CQ_CONTEXT_HMEM)); - assert(!(context->flags & FI_OPX_CQ_CONTEXT_EXT)); - union fi_opx_context *multi_recv_context = context->multi_recv_context; + struct opx_context *multi_recv_context = context->multi_recv_context; assert(multi_recv_context != NULL); multi_recv_context->byte_counter-=1; assert(((int64_t)multi_recv_context->byte_counter) >= 0); // Reusing byte counter as pending flag // re-using tag to store the min multi_receive struct fi_opx_ep * opx_ep = (struct fi_opx_ep *)multi_recv_context->tag; - if(multi_recv_context->len < opx_ep->rx->min_multi_recv && - multi_recv_context->byte_counter == 0) { + if (multi_recv_context->len < opx_ep->rx->min_multi_recv && + multi_recv_context->byte_counter == 0) { /* Signal the user to repost their buffers */ assert(multi_recv_context->next == NULL); - fi_opx_context_slist_insert_tail(multi_recv_context, opx_ep->rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) multi_recv_context, + opx_ep->rx->cq_completed_ptr); } - } else if (context->flags & FI_OPX_CQ_CONTEXT_EXT) { - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) context; - context = (union fi_opx_context *) ext->msg.op_context; - *context = ext->opx_context; - context->flags &= ~(FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM); - OPX_BUF_FREE(ext); + free_context = false; + } else { + 
free_context = true; } + context->flags &= ~FI_OPX_CQ_CONTEXT_HMEM; output += fi_opx_cq_fill(output, context, format); - ++ num_entries; + ++num_entries; - if (prev) + if (prev) { prev->next = context->next; - else + } else { /* remove the head */ pending_head = context->next; + } + + struct opx_context *next = context->next; - if (!(context->next)) + if (!next) { /* remove the tail */ pending_tail = prev; - } - else + } + if (free_context) { + OPX_BUF_FREE(context); + } + context = next; + } else { prev = context; - context = context->next; + context = context->next; + } } /* save the updated pending head and pending tail pointers */ - opx_cq->pending.head = pending_head; - opx_cq->pending.tail = pending_tail; + opx_cq->pending.head = (struct slist_entry *) pending_head; + opx_cq->pending.tail = (struct slist_entry *) pending_tail; } - union fi_opx_context * head = opx_cq->completed.head; + struct opx_context *head = (struct opx_context *) opx_cq->completed.head; if (head) { - union fi_opx_context * context = head; + struct opx_context *context = head; while ((count - num_entries) > 0 && context != NULL) { output += fi_opx_cq_fill(output, context, format); - ++ num_entries; - context = context->next; + ++num_entries; + struct opx_context *next = context->next; + if (!(context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV)) { + OPX_BUF_FREE(context); + } + context = next; } - opx_cq->completed.head = context; + opx_cq->completed.head = (struct slist_entry *) context; if (!context) opx_cq->completed.tail = NULL; } @@ -396,7 +392,8 @@ ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, const int lock_required, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { ssize_t num_entries = 0; @@ -423,35 +420,35 @@ if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ for (i=0; i<opx_cq->progress.ep_count; ++i) { fi_opx_lock(&opx_cq->progress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048, hfi1_type); fi_opx_unlock(&opx_cq->progress.ep[i]->lock); } } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { for (i=0; i<opx_cq->progress.ep_count; ++i) { fi_opx_lock(&opx_cq->progress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192, hfi1_type); fi_opx_unlock(&opx_cq->progress.ep[i]->lock); - } - + } + } else { for (i=0; i<opx_cq->progress.ep_count; ++i) { fi_opx_lock(&opx_cq->progress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); fi_opx_unlock(&opx_cq->progress.ep[i]->lock); } } } else { if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ for (i=0; i<opx_cq->progress.ep_count; ++i) { - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048, hfi1_type); } } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { for (i=0; i<opx_cq->progress.ep_count; ++i) { - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192, hfi1_type); } } else { for (i=0; i<opx_cq->progress.ep_count; ++i) { - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME); +
fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } } @@ -475,13 +472,17 @@ ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, if (0 == (tmp_eh | tmp_ph)) { uintptr_t output = (uintptr_t) buf; - union fi_opx_context * context = (union fi_opx_context *)tmp_ch; + struct opx_context *context = (struct opx_context *) tmp_ch; while ((count - num_entries) > 0 && context != NULL) { output += fi_opx_cq_fill(output, context, format); ++ num_entries; - context = context->next; + struct opx_context *next = context->next; + if (!(context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV)) { + OPX_BUF_FREE(context); + } + context = next; } - opx_cq->completed.head = context; + opx_cq->completed.head = (struct slist_entry *) context; if (!context) opx_cq->completed.tail = NULL; return num_entries; @@ -504,9 +505,10 @@ ssize_t fi_opx_cq_read_generic_non_locking (struct fid_cq *cq, void *buf, size_t const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { - return fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps); + return fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); } __OPX_FORCE_INLINE__ @@ -514,11 +516,12 @@ ssize_t fi_opx_cq_read_generic_locking (struct fid_cq *cq, void *buf, size_t cou const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { int ret; fi_opx_lock(&((struct fi_opx_cq *) cq)->lock); - ret = fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps); + ret = fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); fi_opx_unlock(&((struct fi_opx_cq *) cq)->lock); return ret; @@ -529,10 +532,11 @@ ssize_t fi_opx_cq_readfrom_generic_non_locking (struct fid_cq *cq, void *buf, si const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { int ret; - ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps); + ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); if (ret > 0) { unsigned n; for (n=0; n<ret; ++n) src_addr[n] = FI_ADDR_NOTAVAIL; } return ret; } __OPX_FORCE_INLINE__ ssize_t fi_opx_cq_readfrom_generic_locking (struct fid_cq *cq, void *buf, size_t count, fi_addr_t *src_addr, const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { int ret; fi_opx_lock(&((struct fi_opx_cq *) cq)->lock); - ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps); + ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); fi_opx_unlock(&((struct fi_opx_cq *) cq)->lock); if (ret > 0) { unsigned n; for (n=0; n<ret; ++n) src_addr[n] = FI_ADDR_NOTAVAIL; } return ret; } diff --git a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h index 7222d220b09..fd7c339a36a 100644 --- a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses.
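The CQ hunks above share one theme: completion contexts now come from an ofi_bufpool rather than living inside the caller's fi_context, so once fi_opx_cq_fill() has copied an entry into the user's buffer the context must be returned to the pool, except for multi-receive contexts, which stay queued until their buffer is released. A self-contained toy version of that drain loop (toy_* names are invented; free() stands in for OPX_BUF_FREE()):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

struct toy_context {
	struct toy_context *next;
	uint64_t flags;           /* carries the multi-receive bit */
	void *user_op_context;    /* what the CQ entry reports back */
};

#define TOY_CTX_MULTIRECV (1ull << 0)

static size_t toy_drain_completed(struct toy_context **head,
				  struct toy_context **tail,
				  void **entries, size_t max_entries)
{
	size_t n = 0;
	struct toy_context *ctx = *head;
	while (n < max_entries && ctx != NULL) {
		entries[n++] = ctx->user_op_context;  /* fi_opx_cq_fill() stand-in */
		struct toy_context *next = ctx->next;
		if (!(ctx->flags & TOY_CTX_MULTIRECV))
			free(ctx);                    /* OPX_BUF_FREE() stand-in */
		ctx = next;
	}
	*head = ctx;          /* unreported completions stay queued */
	if (ctx == NULL)
		*tail = NULL; /* fully drained: fix up the tail, as the diff does */
	return n;
}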
You may choose to be licensed under the terms of the GNU @@ -38,14 +38,16 @@ #ifdef FI_OPX_FABRIC_HFI1 #include "rdma/opx/fi_opx_hfi1_transport.h" -#define FI_OPX_FABRIC_TX_INJECT fi_opx_hfi1_tx_inject -#define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr -#define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr -#define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv -#define FI_OPX_FABRIC_TX_SENDV_RZV fi_opx_hfi1_tx_sendv_rzv -#define FI_OPX_FABRIC_RX_RZV_RTS fi_opx_hfi1_rx_rzv_rts -#define FI_OPX_FABRIC_RX_RZV_CTS fi_opx_hfi1_rx_rzv_cts -#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput +#define FI_OPX_FABRIC_TX_INJECT fi_opx_hfi1_tx_inject +#define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr_select +#define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr_select +#define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv_select +#define FI_OPX_FABRIC_TX_SENDV_RZV fi_opx_hfi1_tx_sendv_rzv +#define FI_OPX_FABRIC_RX_RZV_RTS fi_opx_hfi1_rx_rzv_rts +#define FI_OPX_FABRIC_RX_RZV_RTS_ETRUNC fi_opx_hfi1_rx_rzv_rts_etrunc +#define FI_OPX_FABRIC_RX_RZV_CTS fi_opx_hfi1_rx_rzv_cts +#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput + #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h b/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h index 658b99bf321..249b3d1d328 100644 --- a/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h +++ b/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -41,7 +41,8 @@ #include "fi_opx_hfi1_packet.h" #include "fi_opx_timer.h" -#define FLIGHT_RECORDER_ENTRY_DATA_LEN (sizeof(union fi_opx_hfi1_packet_payload) + sizeof(union fi_opx_hfi1_packet_hdr)) +#define FLIGHT_RECORDER_ENTRY_DATA_LEN (sizeof(union fi_opx_hfi1_packet_payload) + sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)) + #ifndef FLIGHT_RECORDER_ENTRY_COUNT #define FLIGHT_RECORDER_ENTRY_COUNT (1024) #endif @@ -187,8 +188,8 @@ void flight_recorder_dump (struct flight_recorder * fr) } if (entry[i].type == FR_ENTRY_TYPE_PACKET_HDR) { fprintf(stderr, "PACKET HDR|\n"); - fi_opx_hfi1_dump_packet_hdr((union fi_opx_hfi1_packet_hdr *)entry[i].data, - "#FLIGHT_RECORDER", 0); + //fi_opx_hfi1_dump_packet_hdr((union opx_hfi1_packet_hdr *)entry[i].data, + // "#FLIGHT_RECORDER", 0); } else if (entry[i].type == FR_ENTRY_TYPE_PACKET) { flight_recorder_dump_packet_payload(entry); } else { @@ -220,7 +221,7 @@ void flight_recorder_dump (struct flight_recorder * fr) flight_recorder_init_next_entry((fr), (event_id), \ FR_ENTRY_TYPE_PACKET_HDR); \ memcpy((void *)next->data, (void *) &(packet_hdr), \ - sizeof(union fi_opx_hfi1_packet_hdr)); \ + sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); \ if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \ flight_recorder_dump((fr)); \ } diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h index b59ee0054b4..2c71240c4ab 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -52,6 +52,7 @@ #include "rdma/opx/opx_hfi1_sim.h" #include "rdma/opx/fi_opx_hfi1_version.h" +#include "rdma/opx/fi_opx_timer.h" // #define FI_OPX_TRACE 1 @@ -91,12 +92,22 @@ #define OPX_MP_EGR_DISABLE_NOT_SET (0) #define OPX_MP_EGR_DISABLE_DEFAULT (OPX_MP_EGR_DISABLE_NOT_SET) -#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) 
/* Default for payload threshold size for RZV */ -#define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */ +/* Default for payload threshold size for RZV */ +#if HAVE_CUDA +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (4096) +#elif HAVE_ROCR +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (256) +#else +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) +#endif #define OPX_RZV_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min value */ +#define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */ + +/* The PBC length to use for a single packet in a multi-packet eager send. + + This is packet payload plus the PBC plus the packet header plus + tail (16B only). -/* The total size for a single packet used in a multi-packet eager send. - This is packet payload plus 64 bytes for the PBC and packet header. All packets in a multi-packet eager send will be this size, except possibly the last one, which may be smaller. @@ -105,14 +116,24 @@ #define FI_OPX_MP_EGR_CHUNK_SIZE (4160) /* For full MP-Eager chunks, we pack 16 bytes of payload data in the - packet header. So the actual payload size for a full chunk is the - total chunk size minus 64 bytes for PBC and packet header, plus 16 - bytes for the space we use for payload data in the packet header. - Or, more simply, 48 bytes less than the total chunk size. */ -#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE (FI_OPX_MP_EGR_CHUNK_SIZE - 48) -#define FI_OPX_MP_EGR_CHUNK_CREDITS (FI_OPX_MP_EGR_CHUNK_SIZE >> 6) -#define FI_OPX_MP_EGR_CHUNK_DWS (FI_OPX_MP_EGR_CHUNK_SIZE >> 2) -#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS (FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE >> 3) + packet header. + + So the actual user payload __consumed__ for a full chunk is the + FI_OPX_MP_EGR_CHUNK_SIZE minus the PBC minus the header minus + the tail (16B only) plus 16 bytes payload packed in the header. + + The payload itself will be FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE - 16 + */ + +#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) \ + ((hfi1_type & OPX_HFI1_JKR) \ + ? (FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 64 /* hdr */ + 8 /* tail */) - 16 /* payload */)) \ + : (FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 56 /* hdr */) - 16 /* payload */))) + +#define FI_OPX_MP_EGR_CHUNK_CREDITS (FI_OPX_MP_EGR_CHUNK_SIZE >> 6) /* PACKET CREDITS TOTAL */ +#define FI_OPX_MP_EGR_CHUNK_DWS (FI_OPX_MP_EGR_CHUNK_SIZE >> 2) /* PBC DWS */ +#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type) \ + ((FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)) >> 3) /* PAYLOAD QWS CONSUMED */ #define FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL 16 #define FI_OPX_MP_EGR_XFER_BYTES_TAIL 0x0010000000000000ull @@ -183,15 +204,38 @@ static_assert(OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX >= OPX_MP_EGR_MAX_PAYLOAD_BYTES_D #define FI_OPX_HFI1_SDMA_MAX_COMP_INDEX (128) // This should what opx_ep->hfi->info.sdma.queue_size is set to. 
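As a check on the FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE() arithmetic earlier in this hunk, with the 4160-byte chunk: a 9B-header packet spends 8 bytes of PBC plus 56 bytes of header but packs 16 payload bytes back into the header, giving 4160 - (64 - 16) = 4112; a 16B-header packet spends 8 (PBC) + 64 (header) + 8 (tail), giving 4160 - (80 - 16) = 4096. Assuming the macro and the opx_hfi1_type flags are in scope, and that OPX_HFI1_WFR does not carry the OPX_HFI1_JKR bit (as the hfi1_type mask tests in this diff imply), the values can be pinned at compile time:

static_assert(FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(OPX_HFI1_WFR) == 4112,
	      "9B MP-eager chunk payload: 4160 - (8 + 56 - 16)");
static_assert(FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(OPX_HFI1_JKR) == 4096,
	      "16B MP-eager chunk payload: 4160 - (8 + 64 + 8 - 16)");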
+/* Default for payload threshold size for SDMA */ #ifndef FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (16385) + #if HAVE_CUDA + #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #elif HAVE_ROCR + #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (256) + #else + #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (16385) + #endif +#endif +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min Value */ +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX (INT_MAX-1) /* Max Value */ + +/* Default for payload threshold size for TID */ +#ifndef OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT + #if HAVE_CUDA + #define OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #elif HAVE_ROCR + #define OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #else + #define OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #endif #endif -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX (INT_MAX-1) +#define OPX_TID_MIN_PAYLOAD_BYTES_MIN (OPX_HFI1_TID_PAGESIZE) +static_assert(OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT >= OPX_TID_MIN_PAYLOAD_BYTES_MIN, + "OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT must be >= OPX_TID_MIN_PAYLOAD_BYTES_MIN!\n"); -static_assert(!(FI_OPX_HFI1_SDMA_MAX_COMP_INDEX & (FI_OPX_HFI1_SDMA_MAX_COMP_INDEX - 1)), "FI_OPX_HFI1_SDMA_MAX_COMP_INDEX must be power of 2!\n"); -static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, "FI_OPX_HFI1_SDMA_MAX_WE must be >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX!\n"); +static_assert(!(FI_OPX_HFI1_SDMA_MAX_COMP_INDEX & (FI_OPX_HFI1_SDMA_MAX_COMP_INDEX - 1)), + "FI_OPX_HFI1_SDMA_MAX_COMP_INDEX must be power of 2!\n"); +static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, + "FI_OPX_HFI1_SDMA_MAX_WE must be >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX!\n"); /* * SDMA includes 8B sdma hdr, 8B PBC, and message header. @@ -212,24 +256,105 @@ abort(); return 0; } -struct fi_opx_hfi1_txe_scb { +/* Also refer to union opx_hfi1_packet_hdr comment - union { - uint64_t qw0; /* a.k.a. 'struct hfi_pbc' */ - //struct hfi_pbc pbc; - }; - union fi_opx_hfi1_packet_hdr hdr; + SCB (Send Control Block) is 8 QW's written to PIO SOP. + + Optimally, store 8 contiguous QW's. + + Cannot define a common 9B/16B structure that is contiguous, + so send code is 9B/16B aware. + + TX SCB + ===================================================== + GENERIC 9B 16B + ========= ================== =================== +QW[0] PBC +QW[1] HDR qw_9B[0] LRH qw_16B[0] LRH +QW[2] HDR qw_9B[1] BTH qw_16B[1] LRH +QW[3] HDR qw_9B[2] BTH/KDETH qw_16B[2] BTH +QW[4] HDR qw_9B[3] KDETH qw_16B[3] BTH/KDETH +QW[5] HDR qw_9B[4] USER/SW qw_16B[4] KDETH +QW[6] HDR qw_9B[5] USER/SW qw_16B[5] USER/SW +QW[7] HDR qw_9B[6] USER/SW qw_16B[6] USER/SW + + qw_16B[7] USER/SW + +Generic example + +// faster than memcpy() for this amount of data. 
+// SCB (PIO or UREG) COPY ONLY (STORE) +static inline void fi_opx_store_scb_qw(volatile uint64_t dest[8], const uint64_t source[8]) +{ + OPX_HFI1_BAR_STORE(&dest[0], source[0]); + OPX_HFI1_BAR_STORE(&dest[1], source[1]); + OPX_HFI1_BAR_STORE(&dest[2], source[2]); + OPX_HFI1_BAR_STORE(&dest[3], source[3]); + OPX_HFI1_BAR_STORE(&dest[4], source[4]); + OPX_HFI1_BAR_STORE(&dest[5], source[5]); + OPX_HFI1_BAR_STORE(&dest[6], source[6]); + OPX_HFI1_BAR_STORE(&dest[7], source[7]); +} + + +9B/16B example, must be hfi1-aware + + struct fi_opx_hfi1_txe_scb_9B model_9B = opx_ep->reliability->service.tx.hfi1.ping_model_9B; + struct fi_opx_hfi1_txe_scb_16B model_16B = opx_ep->reliability->service.tx.hfi1.ping_model_16B; + + volatile uint64_t * const scb = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + + if ((hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B))) { + OPX_HFI1_BAR_STORE(&scb[0], (model_9B.qw0 | OPX_PBC_CR(0x1, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_9B.hdr.qw_9B[0] | lrh_dlid)); + OPX_HFI1_BAR_STORE(&scb[2], (model_9B.hdr.qw_9B[1] | bth_rx)); +<...> + } else { + OPX_HFI1_BAR_STORE(&scb[0], (model_16B.qw0 | OPX_PBC_CR(1, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_16B.hdr.qw_16B[0] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[2], (model_16B.hdr.qw_16B[1] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[3], model_16B.hdr.qw_16B[2] | bth_rx); +<...> + } + +*/ + +/* 8 QWs valid in 16 QW storage. */ +struct fi_opx_hfi1_txe_scb_9B { + + union { /* 15 QWs union*/ -} __attribute__((__aligned__(8))); + /* pbc is qw0. it overlays hdr's unused_pad_9B */ + struct { + uint64_t qw0; + uint64_t qw[14]; + } __attribute__((__packed__)) __attribute__((__aligned__(8))); + union opx_hfi1_packet_hdr hdr; /* 1 QW unused + 7 QWs 9B header + 7 QWs unused*/ -struct fi_opx_hfi1_rxe_hdr { + } __attribute__((__packed__)) __attribute__((__aligned__(8))); - union fi_opx_hfi1_packet_hdr hdr; - uint64_t rhf; + uint64_t pad; /* 1 QW pad (to 16 QWs) */ +} __attribute__((__aligned__(8))) __attribute__((packed)); -} __attribute__((__aligned__(64))); +/* 9 QWs valid in 16 QW storage. */ +struct fi_opx_hfi1_txe_scb_16B { + uint64_t qw0; /* PBC */ + union opx_hfi1_packet_hdr hdr; /* 8 QWs 16B header + 7 QWs currently unused */ +} __attribute__((__aligned__(8))) __attribute__((packed)); +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == sizeof(struct fi_opx_hfi1_txe_scb_16B)), "storage for scbs should match"); +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == (sizeof(uint64_t)*16)), "16 qw scb storage"); + +/* Storage for a scb. 
Use HFI1 type to access the correct structure */ +union opx_hfi1_txe_scb_union { + struct fi_opx_hfi1_txe_scb_9B scb_9B; + struct fi_opx_hfi1_txe_scb_16B scb_16B; +} __attribute__((__aligned__(8))) __attribute__((packed)); + +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == sizeof(union opx_hfi1_txe_scb_union)), "storage for scbs should match"); +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_16B) == sizeof(union opx_hfi1_txe_scb_union)), "storage for scbs should match"); @@ -388,7 +513,7 @@ struct fi_opx_hfi1_context { } info; int fd; - uint16_t lid; + uint32_t lid; struct _hfi_ctrl * ctrl; //struct hfi1_user_info_dep user_info; enum opx_hfi1_type hfi_hfi1_type; @@ -416,6 +541,11 @@ struct fi_opx_hfi1_context { } daos_info; int64_t ref_cnt; + size_t status_lasterr; + time_t network_lost_time; + union fi_opx_timer_stamp link_status_timestamp; + union fi_opx_timer_state link_status_timer; + uint64_t status_check_next_usec; }; struct fi_opx_hfi1_context_internal { @@ -485,12 +615,12 @@ void fi_opx_consume_credits(union fi_opx_hfi1_pio_state *pio_state, size_t count } #define FI_OPX_HFI1_CREDITS_IN_USE(pio_state) fi_opx_credits_in_use(&pio_state) -#define FI_OPX_HFI1_UPDATE_CREDITS(pio_state, pio_credits_addr) fi_opx_update_credits(&pio_state, pio_credits_addr); +#define FI_OPX_HFI1_UPDATE_CREDITS(pio_state, pio_credits_addr) fi_opx_update_credits(&pio_state, pio_credits_addr) #define FI_OPX_HFI1_PIO_SCB_HEAD(pio_scb_base, pio_state) fi_opx_pio_scb_base(pio_scb_base, &pio_state) #define FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, force_credit_return, credits_needed) fi_opx_credits_avail(&pio_state, force_credit_return, credits_needed) #define FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) fi_opx_reliability_credits_avail(&pio_state) #define FI_OPX_HFI1_CONSUME_CREDITS(pio_state, count) fi_opx_consume_credits(&pio_state, count) -#define FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state) FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1); +#define FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state) FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1) __OPX_FORCE_INLINE__ @@ -516,7 +646,7 @@ int fi_opx_hfi1_get_lid_local_unit(uint16_t lid) } __OPX_FORCE_INLINE__ -bool fi_opx_hfi_is_intranode(uint16_t lid) +bool opx_lid_is_intranode(uint16_t lid) { if (fi_opx_global.hfi_local_info.lid == lid) { return true; @@ -525,6 +655,19 @@ bool fi_opx_hfi_is_intranode(uint16_t lid) return fi_opx_hfi1_get_lid_local(lid); } +__OPX_FORCE_INLINE__ +bool opx_lrh_is_intranode(union opx_hfi1_packet_hdr *hdr, const enum opx_hfi1_type hfi1_type) +{ + uint32_t lid_be; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lid_be = hdr->lrh_9B.slid; + } else { + lid_be = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + } + return opx_lid_is_intranode(lid_be); +} + struct fi_opx_hfi1_context * fi_opx_hfi1_context_open (struct fid_ep *ep, uuid_t unique_job_key); int init_hfi1_rxe_state (struct fi_opx_hfi1_context * context, @@ -537,7 +680,10 @@ void fi_opx_init_hfi_lookup(); */ #define FI_OPX_SHM_FIFO_SIZE (1024) #define FI_OPX_SHM_BUFFER_MASK (FI_OPX_SHM_FIFO_SIZE-1) -#define FI_OPX_SHM_PACKET_SIZE (FI_OPX_HFI1_PACKET_MTU + sizeof(struct fi_opx_hfi1_stl_packet_hdr)) + + +#define FI_OPX_SHM_PACKET_SIZE (FI_OPX_HFI1_PACKET_MTU + sizeof(union opx_hfi1_packet_hdr)) + #ifndef NDEBUG #define OPX_BUF_FREE(x) \ @@ -622,4 +768,64 @@ void opx_print_context(struct fi_opx_hfi1_context *context) FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context ref_cnt %#lX \n",context->ref_cnt); } +void opx_reset_context(struct fi_opx_ep * opx_ep); + 
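[Editor's note] The 16B path above reassembles a 24-bit LID from two LRH bitfields (a 20-bit slid plus a 4-bit slid20 extension). Below is a minimal sketch, not part of the patch, ignoring the byte-order conversions the real code performs; the opx_example_* names are hypothetical:

#include <assert.h>
#include <stdint.h>

/* Split a 24-bit LID the way the lrh_16B bitfields carry it:
 * low 20 bits in .slid, high 4 bits in .slid20 */
static inline uint32_t opx_example_lid_low20(uint32_t lid) { return lid & 0x0FFFFF; }
static inline uint32_t opx_example_lid_high4(uint32_t lid) { return (lid >> 20) & 0xF; }

int main(void)
{
	const uint32_t lid = 0x9ABCDE;                /* arbitrary 24-bit LID */
	uint32_t slid   = opx_example_lid_low20(lid); /* 0xABCDE -> lrh_16B.slid   */
	uint32_t slid20 = opx_example_lid_high4(lid); /* 0x9     -> lrh_16B.slid20 */

	/* opx_lrh_is_intranode() reverses the split: slid20 << 20 | slid */
	assert(((slid20 << 20) | slid) == lid);
	return 0;
}

This is also why struct fi_opx_hfi1_context widens lid from uint16_t to uint32_t in this patch: a 24-bit 16B LID no longer fits in 16 bits.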
+#define OPX_CONTEXT_STATUS_CHECK_INTERVAL_USEC 250000 /* 250 ms */ + +__OPX_FORCE_INLINE__ +uint64_t opx_get_hw_status(struct fi_opx_hfi1_context *context) +{ + struct hfi1_status *status = + (struct hfi1_status *) context->ctrl->base_info.status_bufbase; + + return((status->dev & (HFI1_STATUS_INITTED | HFI1_STATUS_CHIP_PRESENT | HFI1_STATUS_HWERROR)) + | (status->port & (HFI1_STATUS_IB_READY | HFI1_STATUS_IB_CONF))); +} + +#define OPX_HFI1_HW_CHIP_STATUS (HFI1_STATUS_CHIP_PRESENT | HFI1_STATUS_INITTED) +#define OPX_HFI1_IB_STATUS (HFI1_STATUS_IB_CONF | HFI1_STATUS_IB_READY) + +/* The linkup duration for a system should allow the time needed + to complete 3 LNI passes, which is: + 50 seconds for a passive copper channel, or + 65 seconds for an optical channel. + (We add 5 seconds of margin.) */ +#define OPX_LINK_DOWN_MAX_SEC 70.0 + +__OPX_FORCE_INLINE__ +size_t fi_opx_context_check_status(struct fi_opx_hfi1_context *context) +{ + size_t err = FI_SUCCESS; + uint64_t status = opx_get_hw_status(context); + + /* Fatal chip-related errors */ + if (!((status & OPX_HFI1_HW_CHIP_STATUS) == OPX_HFI1_HW_CHIP_STATUS) || + (status & HFI1_STATUS_HWERROR)) { + err = FI_ENETUNREACH; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_CTRL, "HFI1 chip error detected\n"); + abort(); + return(err); + } else if (!((status & OPX_HFI1_IB_STATUS) == OPX_HFI1_IB_STATUS)) { + err = FI_ENETDOWN; + if (err != context->status_lasterr) { + context->network_lost_time = time(NULL); + } else { + time_t now = time(NULL); + + if (difftime(now, context->network_lost_time) > OPX_LINK_DOWN_MAX_SEC) + { + fprintf(stderr, "Link has been down more than 70s. Aborting\n"); + abort(); + return(err); + } + } + } + + if (err != FI_SUCCESS) { + context->status_lasterr = err; /* record error */ + } + + return err; +} + #endif /* _FI_PROV_OPX_HFI1_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h index cd2310f0be3..300340ec1aa 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h @@ -40,7 +40,7 @@ __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_put( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload,
struct iovec *iov, const uint64_t op64, @@ -113,12 +121,20 @@ size_t opx_hfi1_dput_write_header_and_payload_atomic_fetch( uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH | + (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); + hdr->qw_9B[5] = key; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH | (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); - tx_hdr->qw[5] = key; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + hdr->qw_16B[6] = key; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } size_t dput_bytes = payload_bytes - sizeof(struct fi_opx_hfi1_dput_fetch); @@ -184,7 +200,7 @@ void opx_hfi1_dput_write_payload_atomic_compare_fetch( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t op64, @@ -200,12 +216,20 @@ size_t opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH | + (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); + hdr->qw_9B[5] = key; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH | (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); - tx_hdr->qw[5] = key; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + hdr->qw_16B[6] = key; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } size_t dput_bytes = payload_bytes - sizeof(struct fi_opx_hfi1_dput_fetch); size_t dput_bytes_half = dput_bytes >> 1; @@ -242,7 +266,7 @@ size_t opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_get( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t dt64, @@ -251,12 +275,20 @@ size_t opx_hfi1_dput_write_header_and_payload_get( uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_GET | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_GET | + (dt64 << 16) | (payload_bytes << 48); + hdr->qw_9B[5] = rma_request_vaddr; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_GET | (dt64 << 16) | (payload_bytes << 48); - tx_hdr->qw[5] = rma_request_vaddr; - tx_hdr->qw[6] = 
fi_opx_dput_rbuf_out(*rbuf); + hdr->qw_16B[6] = rma_request_vaddr; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } if (tx_payload) { assert(!iov); @@ -289,7 +321,7 @@ size_t opx_hfi1_dput_write_header_and_payload_get( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_rzv( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t op64, @@ -300,11 +332,18 @@ size_t opx_hfi1_dput_write_header_and_payload_rzv( uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | (opcode) | (payload_bytes << 48); - tx_hdr->qw[5] = target_byte_counter_vaddr; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | (opcode) | (payload_bytes << 48); + hdr->qw_9B[5] = target_byte_counter_vaddr; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | (opcode) | (payload_bytes << 48); + hdr->qw_16B[6] = target_byte_counter_vaddr; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } if (tx_payload) { assert(!iov); @@ -325,7 +364,7 @@ size_t opx_hfi1_dput_write_header_and_payload_rzv( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint32_t opcode, @@ -347,50 +386,63 @@ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { uint64_t psn = (uint64_t) htonl((uint32_t)psn_orig); - tx_hdr->qw[0] = opx_ep->rx->tx.dput.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.dput.hdr.qw[1] | bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.dput.hdr.qw[2] | psn; - tx_hdr->qw[3] = opx_ep->rx->tx.dput.hdr.qw[3]; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[2] | psn; + hdr->qw_9B[3] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[3]; + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + hdr->qw_16B[0] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[3] | psn; + hdr->qw_16B[4] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[4]; + } switch(opcode) { case FI_OPX_HFI_DPUT_OPCODE_RZV: case FI_OPX_HFI_DPUT_OPCODE_RZV_TID: case FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG: return opx_hfi1_dput_write_header_and_payload_rzv( - opx_ep, tx_hdr, tx_payload, iov, + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, opcode, target_byte_counter_vaddr, sbuf, - sbuf_iface, sbuf_device, rbuf); + sbuf_iface, sbuf_device, rbuf, hfi1_type); 
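/* Editorial note, not part of the patch: the recurring qw_9B[4..6] vs
   qw_16B[5..7] index shift in these dput helpers follows from the LRH
   size: a 9B LRH occupies one quadword while a 16B LRH occupies two, so
   every field after the LRH lands one quadword later in the 16B layout
   (see the SCB table in fi_opx_hfi1.h above). */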
break; case FI_OPX_HFI_DPUT_OPCODE_GET: return opx_hfi1_dput_write_header_and_payload_get( - opx_ep, tx_hdr, tx_payload, iov, + opx_ep, hdr, tx_payload, iov, dt64, payload_bytes, rma_request_vaddr, - sbuf, sbuf_iface, sbuf_device, rbuf); + sbuf, sbuf_iface, sbuf_device, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_PUT: return opx_hfi1_dput_write_header_and_payload_put( - opx_ep, tx_hdr, tx_payload, + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, - key, sbuf, sbuf_iface, sbuf_device, rbuf); + key, sbuf, sbuf_iface, sbuf_device, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH: return opx_hfi1_dput_write_header_and_payload_atomic_fetch( - opx_ep, tx_hdr, tx_payload, iov, op64, dt64, + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, key, fetch_vaddr, rma_request_vaddr, bytes_sent, sbuf, - sbuf_iface, sbuf_device, rbuf); + sbuf_iface, sbuf_device, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH: return opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( - opx_ep, tx_hdr, tx_payload, iov, op64, dt64, + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, key, fetch_vaddr, rma_request_vaddr, bytes_sent, sbuf, sbuf_iface, - sbuf_device, cbuf, cbuf_iface, cbuf_device, rbuf); + sbuf_device, cbuf, cbuf_iface, cbuf_device, rbuf, hfi1_type); break; default: FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -402,7 +454,7 @@ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, const uint32_t opcode, const int64_t psn_orig, @@ -423,20 +475,21 @@ size_t opx_hfi1_dput_write_header_and_payload( uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - return opx_hfi1_dput_write_packet(opx_ep, tx_hdr, tx_payload, NULL, + return opx_hfi1_dput_write_packet(opx_ep, hdr, tx_payload, NULL, opcode, psn_orig, lrh_dws, op64, dt64, lrh_dlid, bth_rx, payload_bytes, key, fetch_vaddr, target_byte_counter_vaddr, rma_request_vaddr, bytes_sent, sbuf, sbuf_iface, sbuf_device, - cbuf, cbuf_iface, cbuf_device, rbuf); + cbuf, cbuf_iface, cbuf_device, rbuf, hfi1_type); } __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_iov(struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, struct iovec *iov, const uint32_t opcode, const uint16_t lrh_dws, @@ -452,19 +505,20 @@ size_t opx_hfi1_dput_write_header_and_iov(struct fi_opx_ep *opx_ep, uint64_t bytes_sent, uint8_t **sbuf, uint8_t **cbuf, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { /* When we're just setting the IOV * 1. Use a PSN of 0, because the caller will set that later * 2. 
The sbuf/cbuf iface and device are not used, so just pass in system/0 */ - return opx_hfi1_dput_write_packet(opx_ep, tx_hdr, NULL, iov, opcode, 0, + return opx_hfi1_dput_write_packet(opx_ep, hdr, NULL, iov, opcode, 0, lrh_dws, op64, dt64, lrh_dlid, bth_rx, payload_bytes, key, fetch_vaddr, target_byte_counter_vaddr, rma_request_vaddr, bytes_sent, sbuf, FI_HMEM_SYSTEM, 0ul, cbuf, FI_HMEM_SYSTEM, 0ul, - rbuf); + rbuf, hfi1_type); } #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h index 64e654c9dfe..a2d9f06b924 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h @@ -95,26 +95,33 @@ #define OPX_PBC_JKR_L2COMPRESSED_SHIFT 19 #define OPX_PBC_JKR_L2COMPRESSED_MASK 0x1 +/* The 16B ICRC/TAIL and pad qwords are necessary for PIO but the values are not used. + Use a poison value for pad for debug - it should not be in receive memory */ +#define OPX_JKR_16B_PAD_QWORD (uint64_t)0xDEAD00BEEF11DEAD + +/* 16B headers spill past the SOP cacheline by 1 qword. There's room for + payload in that 2nd non-SOP cacheline */ +#define OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS (FI_OPX_CACHE_LINE_QWS - 1) + /* Fields that unused on JKR (zero will be OR'd) */ #define OPX_PBC_JKR_UNUSED 0UL -#define OPX_PBC_JKR_DLID(_dlid) (((unsigned long long)(_dlid & OPX_PBC_JKR_DLID_MASK) << OPX_PBC_JKR_DLID_SHIFT) << OPX_PBC_MSB_SHIFT) -#define OPX_PBC_JKR_SCTXT(_ctx) (((unsigned long long)(_ctx & OPX_PBC_JKR_SCTXT_MASK) << OPX_PBC_JKR_SCTXT_SHIFT) << OPX_PBC_MSB_SHIFT) +#define OPX_PBC_JKR_DLID(_dlid) (((unsigned long long)(_dlid & OPX_PBC_JKR_DLID_MASK) << OPX_PBC_JKR_DLID_SHIFT) << OPX_MSB_SHIFT) +#define OPX_PBC_JKR_SCTXT(_ctx) (((unsigned long long)(_ctx & OPX_PBC_JKR_SCTXT_MASK) << OPX_PBC_JKR_SCTXT_SHIFT) << OPX_MSB_SHIFT) #define OPX_PBC_JKR_L2COMPRESSED(_c) OPX_PBC_JKR_UNUSED /* unused until 16B headers are optimized */ #define OPX_PBC_JKR_PORTIDX(_pidx) (((OPX_JKR_PHYS_PORT_TO_INDEX(_pidx)) & OPX_PBC_JKR_PORT_MASK) << OPX_PBC_JKR_PORT_SHIFT) #define OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(_dlid) OPX_PBC_JKR_DLID(htons(_dlid >> 16)) +#define OPX_PBC_JKR_INSERT_NON9B_ICRC (1<<24) #ifndef NDEBUG __OPX_FORCE_INLINE__ uint32_t opx_pbc_jkr_l2type(unsigned _type) { - /* 16B not supported yet */ - assert(_type == OPX_PBC_JKR_L2TYPE_9B); return (_type & OPX_PBC_JKR_L2TYPE_MASK) << OPX_PBC_JKR_L2TYPE_SHIFT; } #define OPX_PBC_JKR_L2TYPE(_type) opx_pbc_jkr_l2type(_type) #else -#define OPX_PBC_JKR_L2TYPE(_type) ((OPX_PBC_JKR_L2TYPE_9B & OPX_PBC_JKR_L2TYPE_MASK) << OPX_PBC_JKR_L2TYPE_SHIFT) /* 16B not supported yet */ +#define OPX_PBC_JKR_L2TYPE(_type) ((_type & OPX_PBC_JKR_L2TYPE_MASK) << OPX_PBC_JKR_L2TYPE_SHIFT) #endif #define OPX_PBC_JKR_RUNTIME(_dlid, _pidx) OPX_PBC_JKR_UNUSED @@ -187,6 +194,18 @@ static inline int opx_bth_rc2_val() #define OPX_BTH_JKR_RC2(_rc2) ((_rc2 & OPX_BTH_JKR_RC2_MASK) << OPX_BTH_JKR_RC2_SHIFT) #define OPX_BTH_JKR_RC2_VAL opx_bth_rc2_val() + +/* LRH */ +#define OPX_LRH_JKR_16B_DLID_MASK_16B 0x0FFFFF +#define OPX_LRH_JKR_16B_DLID_SHIFT_16B OPX_MSB_SHIFT + +#define OPX_LRH_JKR_16B_DLID20_MASK_16B 0xF00000 +#define OPX_LRH_JKR_16B_DLID20_SHIFT_16B (20 - 12) // shift right 20 (dlid bits) and left 12 (lrh bits) + +#define OPX_LRH_JKR_16B_RX_MASK_16B 0xFF +#define OPX_LRH_JKR_16B_RX_SHIFT_16B (7*8) // 7 bytes + + /* RHF */ /* JKR * @@ -204,8 +223,8 @@ static inline int opx_bth_rc2_val() #define OPX_JKR_RHF_SEQ_NOT_MATCH(_seq, _rhf) (_seq != (_rhf & 0x0F00000000000000ul)) #define OPX_JKR_RHF_SEQ_INCREMENT(_seq) 
((_seq < 0x0D00000000000000ul) * _seq + 0x0100000000000000ul) -#define OPX_JKR_IS_ERRORED_RHF(_rhf) (_rhf & 0x8000000000000000ul) -#define OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf) (_seq == (_rhf & 0x0F00000000000000ul)) +#define OPX_JKR_IS_ERRORED_RHF(_rhf, _hfi1_type) (_rhf & 0x8000000000000000ul) /* does not check RHF.KHdrLenErr */ +#define OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) (_seq == (_rhf & 0x0F00000000000000ul)) #define OPX_JKR_RHF_SEQ_INIT_VAL (0x0100000000000000ul) #define OPX_JKR_RHF_IS_USE_EGR_BUF(_rhf) ((_rhf & 0x00008000ul) == 0x00008000ul) @@ -238,10 +257,11 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -#define OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) \ - opx_jkr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) +#define OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) \ + opx_jkr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) // Common to both JKR/WFR @@ -250,22 +270,122 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, #define OPX_JKR_RHF_RCV_TYPE_OTHER(_rhf) ((_rhf & 0x00006000ul) != 0x00000000ul) /* Common (jkr) handler to WFR/JKR 9B (for now) */ -int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr); +int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -__OPX_FORCE_INLINE__ int opx_jkr_rhf_check_header(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr) +__OPX_FORCE_INLINE__ int opx_jkr_9B_rhf_check_header(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { /* RHF error */ - if (OFI_UNLIKELY(OPX_JKR_IS_ERRORED_RHF(rhf_rcvd))) return 1; /* error */ + if (OFI_UNLIKELY(OPX_JKR_IS_ERRORED_RHF(rhf_rcvd, OPX_HFI1_JKR))) return 1; /* error */ /* Bad packet header */ if (OFI_UNLIKELY((!OPX_JKR_RHF_IS_USE_EGR_BUF(rhf_rcvd)) && - (ntohs(hdr->stl.lrh.pktlen) > 0x15) && + (ntohs(hdr->lrh_9B.pktlen) > 0x15) && !(OPX_JKR_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)))) - return opx_jkr_rhf_error_handler(rhf_rcvd, hdr); /* error */ + return opx_jkr_rhf_error_handler(rhf_rcvd, hdr, hfi1_type); /* error */ else return 0; /* no error*/ } -#define OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) opx_jkr_rhf_check_header(_rhf_rcvd, _hdr) +__OPX_FORCE_INLINE__ int opx_jkr_16B_rhf_check_header(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) +{ + /* RHF error */ + if (OFI_UNLIKELY(OPX_JKR_IS_ERRORED_RHF(rhf_rcvd, OPX_HFI1_JKR))) return 1; /* error */ + + /* Bad packet header */ + if (OFI_UNLIKELY((!OPX_JKR_RHF_IS_USE_EGR_BUF(rhf_rcvd)) && + (hdr->lrh_16B.pktlen > 0x9) && + !(OPX_JKR_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)))) + return opx_jkr_rhf_error_handler(rhf_rcvd, hdr, hfi1_type); /* error */ + else + return 0; /* no error*/ +} + +#define OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr, _hfi1_type) ((_hfi1_type & OPX_HFI1_JKR_9B) ? 
\ + opx_jkr_9B_rhf_check_header(_rhf_rcvd, _hdr, _hfi1_type) : opx_jkr_16B_rhf_check_header(_rhf_rcvd, _hdr, _hfi1_type)) + +union opx_jkr_pbc { + uint64_t raw64b; + uint32_t raw32b[2]; + + __le64 qw; + __le32 dw[2]; + __le16 w[1]; + + struct { + __le64 LengthDWs:12; + __le64 Vl:4; + __le64 PortIdx:2; + __le64 Reserved_2:1; + __le64 L2Compressed:1; + __le64 L2Type:2; + __le64 Fecnd:1; + __le64 TestBadLcrc:1; + __le64 InsertNon9bIcrc:1; + __le64 CreditReturn:1; + __le64 InsertHcrc:2; + __le64 Reserved_1:1; + __le64 TestEbp:1; + __le64 Sc4:1; + __le64 Intr:1; + __le64 Dlid: 24; + __le64 SendCtxt: 8; + }; + +}; + +#ifndef NDEBUG + #define OPX_PRINT_RHF(a) opx_print_rhf((union opx_jkr_rhf)(a),__func__,__LINE__) +#else + #define OPX_PRINT_RHF(a) +#endif + +union opx_jkr_rhf { + uint64_t qw; + uint32_t dw[2]; + uint16_t w[4]; + struct { + uint64_t PktLen:12; + uint64_t RcvType:3; + uint64_t UseEgrBfr:1; + uint64_t EgrIndex:14; + uint64_t Rsvd:1; + uint64_t KHdrLenErr:1; + uint64_t EgrOffset:12; + uint64_t HdrqOffset:9; + uint64_t L2Type9bSc4:1; + uint64_t L2Type:2; + uint64_t RcvSeq:4; + uint64_t RcvPort:2; + uint64_t SendPacing:1; + uint64_t RheValid:1; + }; +}; + + +static inline void opx_print_rhf(union opx_jkr_rhf rhf, const char* func, const unsigned line) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: %s \n", func, line, __func__); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RheValid = %#x\n", func, line, rhf.RheValid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.SendPacing = %#x\n", func, line, rhf.SendPacing); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RcvPort = %#x\n", func, line, rhf.RcvPort); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RcvSeq = %#x\n", func, line, rhf.RcvSeq); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s:%u: RHF.L2Type = %#x%s\n", func, line, rhf.L2Type, + (rhf.L2Type == 0x3 ? " 9B": + (rhf.L2Type == 0x2 ? " 16B": + (rhf.L2Type == 0x1 ? " 10B": + (rhf.L2Type == 0x0 ?
" 8B":" INVALID"))))); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.L2Type9bSc4 = %#x\n", func, line, rhf.L2Type9bSc4); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.HdrqOffset = %#x\n", func, line, rhf.HdrqOffset); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.EgrOffset = %#x\n", func, line, rhf.EgrOffset); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.Rsvd = %#x\n", func, line, rhf.Rsvd); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.KHdrLenErr = %#x\n", func, line, rhf.KHdrLenErr); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.EgrIndex = %#x\n", func, line, rhf.EgrIndex); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.UseEgrBfr = %#x\n", func, line, rhf.UseEgrBfr); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RcvType = %#x\n", func, line, rhf.RcvType); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.PktLen = %#x\n", func, line, rhf.PktLen); +} #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index 87a5a65d9f5..f7d8ef70fb2 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -53,6 +53,7 @@ #define FI_OPX_ADDR_SEP_RX_MAX (4) #define FI_OPX_HFI1_PACKET_MTU (8192) #define OPX_HFI1_TID_PAGESIZE (PAGE_SIZE) /* assume 4K, no hugepages*/ + #define FI_OPX_HFI1_PACKET_IMM (16) /* opcodes (0x00..0xBF) are reserved */ @@ -63,17 +64,38 @@ #define FI_OPX_HFI_BTH_OPCODE_ATOMIC (0xC4) #define FI_OPX_HFI_BTH_OPCODE_ACK (0xC5) #define FI_OPX_HFI_BTH_OPCODE_UD (0xC6) /* unreliabile datagram */ -/* opcodes (0xC7..0xEF) are unused */ -#define FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH (0xF7) -#define FI_OPX_HFI_BTH_OPCODE_MSG_INJECT (0xF8) -#define FI_OPX_HFI_BTH_OPCODE_MSG_EAGER (0xF9) -#define FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST (0xFA) -#define FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS (0xFB) -#define FI_OPX_HFI_BTH_OPCODE_TAG_INJECT (0xFC) -#define FI_OPX_HFI_BTH_OPCODE_TAG_EAGER (0xFD) -#define FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST (0xFE) -#define FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS (0xFF) - +/* opcodes (0xC7..0xEE) are unused */ + +#define FI_OPX_HFI_BTH_OPCODE_CQ_BIT (0x01) +#define FI_OPX_HFI_BTH_OPCODE_TAG_BIT (0x02) +#define FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) \ + (opcode & ~(FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT)) +#define FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) \ + (opcode & ~(FI_OPX_HFI_BTH_OPCODE_CQ_BIT)) +#define FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) \ + ((opcode & FI_OPX_HFI_BTH_OPCODE_CQ_BIT) ? FI_REMOTE_CQ_DATA : 0) +#define FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode) \ + ((opcode & FI_OPX_HFI_BTH_OPCODE_TAG_BIT) ? FI_TAGGED : FI_MSG) +#define FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) \ + ((opcode & FI_OPX_HFI_BTH_OPCODE_TAG_BIT) ? 
1 : 0) + +#define FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH (0xEF) +#define FI_OPX_HFI_BTH_OPCODE_MSG_INJECT (0xF0) +#define FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_INJECT | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_INJECT (FI_OPX_HFI_BTH_OPCODE_MSG_INJECT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_INJECT | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_MSG_EAGER (0xF4) +#define FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_EAGER | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_EAGER (FI_OPX_HFI_BTH_OPCODE_MSG_EAGER | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_EAGER | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST (0xF8) +#define FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST (FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS (0xFC) +#define FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS (FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) static const char* FI_OPX_HFI_BTH_LOW_OPCODE_STRINGS[] = { /* opcodes (0x00..0xBF) are reserved */ @@ -86,17 +108,46 @@ static const char* FI_OPX_HFI_BTH_LOW_OPCODE_STRINGS[] = { "FI_OPX_HFI_BTH_OPCODE_UD " }; static const char* FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS[] = { - /* opcodes (0xC7..0xEF) are unused */ - "FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH ", - "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ", - "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ", - "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ", - "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ", - "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT ", - "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER ", - "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST ", - "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS ", - "INVALID BTH OPCODE " }; + /* opcodes (0xC7..0xEE) are unused */ + "FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH ", + "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ", + "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT ", + "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ ", + "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ", + "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER ", + "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ ", + "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ", + "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST ", + "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ ", + "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ", + "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS ", + "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ ", + "INVALID BTH OPCODE " }; + +OPX_COMPILE_TIME_ASSERT((FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH == (FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ - sizeof(FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS)/sizeof(char*) + 2)), "FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH must be first in the high opcode array, or dependent code conditionals need updated"); 
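/* Editorial note, not part of the patch: the reworked opcode space encodes
   variants in the low bits (bit 0 is FI_OPX_HFI_BTH_OPCODE_CQ_BIT, bit 1 is
   FI_OPX_HFI_BTH_OPCODE_TAG_BIT) over base opcodes aligned to multiples of
   4 (0xF0 INJECT, 0xF4 EAGER, 0xF8 MP_EAGER_FIRST, 0xFC RZV_RTS). For
   example, TAG_EAGER_CQ = 0xF4 | 0x02 | 0x01 = 0xF7, and the highest
   opcode, TAG_RZV_RTS_CQ = 0xFC | 0x03 = 0xFF, which the assert below
   pins down. */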
+OPX_COMPILE_TIME_ASSERT((FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ == 0xFF), "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ must be last in the high opcode array, or dependent code conditionals need updated"); + +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ^ FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT and FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ^ FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER and FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ^ FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST and FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ^ FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS and FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_INJECT ^ FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT and FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_EAGER ^ FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER and FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST ^ FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST and FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS ^ FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS and FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); + +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ^ FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT and FI_OPX_HFI_BTH_OPCODE_TAG_INJECT must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ^ FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER and FI_OPX_HFI_BTH_OPCODE_TAG_EAGER must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ^ FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), 
"FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST and FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ^ FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS and FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); static inline const char* opx_hfi1_bth_opcode_to_string(uint16_t opcode) { @@ -105,24 +156,24 @@ static inline const char* opx_hfi1_bth_opcode_to_string(uint16_t opcode) (opcode <= (uint16_t) FI_OPX_HFI_BTH_OPCODE_UD)) { return FI_OPX_HFI_BTH_LOW_OPCODE_STRINGS[opcode-FI_OPX_HFI_BTH_OPCODE_INVALID]; } else if ((opcode >= (uint16_t) FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH) && - (opcode <= (uint16_t) FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)) { + (opcode <= (uint16_t) FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ)) { return FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS[opcode-FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH]; } return FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS[sizeof(FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS)/sizeof(char*)-1]; /* INVALID */ } -#define FI_OPX_HFI1_PACKET_SLID(packet_hdr) \ - (((packet_hdr).qw[0] & 0xFFFF000000000000ul) >> 48) -#define FI_OPX_HFI1_PACKET_PSN(packet_hdr) \ - (((packet_hdr)->stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ - ? ntohl((packet_hdr)->stl.bth.psn) & 0x00FFFFFF \ - : (packet_hdr)->reliability.psn) -#define FI_OPX_HFI1_PACKET_ORIGIN_TX(packet_hdr) \ - (((packet_hdr)->stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ - ? (packet_hdr)->dput.target.origin_tx \ + +#define FI_OPX_HFI1_PACKET_ORIGIN_TX(packet_hdr) \ + (((packet_hdr)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ + ? (packet_hdr)->dput.target.origin_tx \ : (packet_hdr)->reliability.origin_tx) +#define FI_OPX_HFI1_PACKET_PSN(packet_hdr) \ + (((packet_hdr)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ + ? ntohl((packet_hdr)->bth.psn) & 0x00FFFFFF \ + : (packet_hdr)->reliability.psn) + #define FI_OPX_HFI_UD_OPCODE_FIRST_INVALID (0x00) #define FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING (0x01) #define FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK (0x02) @@ -214,7 +265,9 @@ static inline const char* opx_hfi1_dput_opcode_to_string(uint8_t opcode) #define HFI_KHDR_KVER_SHIFT 30 #define HFI_KHDR_KVER_MASK 0x3 -struct fi_opx_hfi1_stl_packet_hdr { + +/* "Legacy" header with 9DWs of KDETH */ +struct fi_opx_hfi1_stl_packet_hdr_9B { /* == quadword 0 == */ union { @@ -226,8 +279,8 @@ struct fi_opx_hfi1_stl_packet_hdr { uint16_t dlid; /* lrh.w[1] - big-endian! */ uint16_t pktlen; /* lrh.w[2] - big-endian! */ uint16_t slid; /* lrh.w[3] - big-endian! */ - } __attribute__((packed)); - } lrh; + } __attribute__((__packed__)); + } lrh_9B; /* == quadword 1 == */ union { @@ -245,7 +298,7 @@ struct fi_opx_hfi1_stl_packet_hdr { /* == quadword 2 == */ uint32_t psn; /* bth.dw[2] ..... 
the 'psn' field is unused for 'eager' packets -----> reliability::psn, etc */ - } __attribute__((packed)); + } __attribute__((__packed__)); } bth; union { @@ -259,11 +312,87 @@ struct fi_opx_hfi1_stl_packet_hdr { uint16_t jkey; /* kdeth.w[2] */ uint16_t hcrc; /* kdeth.w[3] */ uint32_t unused; /* kdeth.dw[2] -----> immediate data (32b) */ - } __attribute__((packed)); + } __attribute__((__packed__)); } kdeth; /* == quadword 4,5,6 == */ - uint64_t unused[3]; + uint64_t kdeth_sw[3]; + +} __attribute__((__packed__)); + +/* "Legacy" header with 9DWs of KDETH */ +struct fi_opx_hfi1_stl_packet_hdr_16B { + + /* == quadword 0,1 == */ + union { + struct { + __le64 qw0; + __le64 qw1; + }; + __le64 qw[2]; + __le32 dw[4]; + __le16 w[8]; + struct { /* 16B header */ + __le32 slid:20; /* dw[0] qw[0]*/ + /* This is the packet length and is in units of flits (QWs) for 8B, 10B and 16B + formats, but in units of DWs for 9B formats.*/ + __le32 pktlen:11; + __le32 b:1; + + __le32 dlid:20; /* dw[1] */ + __le32 sc:5; + __le32 rc:3; + __le32 f:1; + __le32 l2:2; + __le32 lt:1; + + __le32 l4:8; /* dw[2] qw[1] */ + __le32 slid20:4; + __le32 dlid20:4; + __le32 pkey:16; + + __le32 entropy:16; /* dw[3] */ + __le32 age:3; + __le32 cspec:5; + __le32 r:8; + }; + }lrh_16B; + + /* == quadword 2 == */ + union { + uint32_t dw[3]; + uint16_t w[6]; + uint8_t hw[12]; + struct { + uint8_t opcode; /* bth.hw[0] */ + uint8_t bth_1; /* bth.hw[1] */ + uint16_t pkey; /* bth.w[1] - big-endian! */ + uint8_t ecn; /* bth.hw[4] (FECN, BECN, (CSPEC and RC2 for JKR) and reserved) */ + uint8_t qp; /* bth.hw[5] */ + uint8_t unused; /* bth.hw[6] -----> inject::message_length, send::xfer_bytes_tail */ + uint8_t rx; /* bth.hw[7] */ + + /* == quadword 3 == */ + uint32_t psn; /* bth.dw[2] ..... the 'psn' field is unused for 'eager' packets -----> reliability::psn, etc */ + } __attribute__((__packed__)); + } bth; + + union { + uint32_t dw[3]; + uint16_t w[6]; + uint8_t hw[12]; + struct { + uint32_t offset_ver_tid; /* kdeth.dw[0] .... the 'offset' field is unused for 'eager' packets */ + + /* == quadword 4 == */ + uint16_t jkey; /* kdeth.w[2] */ + uint16_t hcrc; /* kdeth.w[3] */ + uint32_t unused; /* kdeth.dw[2] -----> immediate data (32b) */ + } __attribute__((__packed__)); + } kdeth; + + /* == quadword 5,6,7 == */ + uint64_t kdeth_sw[3]; } __attribute__((__packed__)); @@ -337,9 +466,10 @@ struct fi_opx_hfi1_stl_packet_hdr { #define FI_OPX_PKT_RZV_FLAGS_NONCONTIG (1ul) #define FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK (FI_OPX_PKT_RZV_FLAGS_NONCONTIG << FI_OPX_PKT_RZV_FLAGS_SHIFT) +#if 0 #ifndef NDEBUG static inline -void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, +void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr_9B * hdr, const char * fn, const unsigned ln) { #if __GNUC__ > 9 @@ -351,10 +481,10 @@ void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, #endif fprintf(stderr, "%s():%u ==== dump stl packet header @ %p [%016lx %016lx %016lx %016lx]\n", fn, ln, hdr, qw[0], qw[1], qw[2], qw[3]); - fprintf(stderr, "%s():%u .lrh.flags ............. 0x%04hx\n", fn, ln, hdr->lrh.flags); - fprintf(stderr, "%s():%u .lrh.dlid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh.dlid, hdr->lrh.dlid, ntohs(hdr->lrh.dlid)); - fprintf(stderr, "%s():%u .lrh.pktlen ............ 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh.pktlen, hdr->lrh.pktlen, ntohs(hdr->lrh.pktlen)); - fprintf(stderr, "%s():%u .lrh.slid .............. 
0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh.slid, hdr->lrh.slid, ntohs(hdr->lrh.slid)); + fprintf(stderr, "%s():%u .lrh.flags ............. 0x%04hx\n", fn, ln, hdr->lrh_9B.flags); + fprintf(stderr, "%s():%u .lrh.dlid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.dlid, hdr->lrh_9B.dlid, ntohs(hdr->lrh.dlid)); + fprintf(stderr, "%s():%u .lrh.pktlen ............ 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.pktlen, hdr->lrh_9B.pktlen, ntohs(hdr->lrh.pktlen)); + fprintf(stderr, "%s():%u .lrh.slid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.slid, hdr->lrh_9B.slid, ntohs(hdr->lrh.slid)); fprintf(stderr, "%s():%u\n", fn, ln); fprintf(stderr, "%s():%u .bth.opcode ............ 0x%02x \n", fn, ln, hdr->bth.opcode); fprintf(stderr, "%s():%u .bth.bth_1 ............. 0x%02x \n", fn, ln, hdr->bth.bth_1); @@ -374,7 +504,7 @@ void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, return; } #endif - +#endif /** @@ -383,17 +513,17 @@ void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, * The HFI1 packet header is consumed in many places and sometimes overloaded * for cache and memory allocation reasons. */ -union fi_opx_hfi1_packet_hdr { +union fi_opx_hfi1_packet_hdr_9B { uint64_t qw[7]; - struct fi_opx_hfi1_stl_packet_hdr stl; + struct fi_opx_hfi1_stl_packet_hdr_9B stl; struct { /* == quadword 0 == */ uint16_t reserved_0[3]; - uint16_t slid; + uint16_t _slid; /* == quadword 1 == */ uint64_t reserved_1; @@ -413,7 +543,7 @@ union fi_opx_hfi1_packet_hdr { struct { /* == quadword 0 == */ uint16_t reserved_0[3]; - uint16_t slid; /* used for FI_DIRECTED_RECV; identifies the node - big-endian! */ + uint16_t _slid; /* used for FI_DIRECTED_RECV; identifies the node - big-endian! */ /* == quadword 1 == */ uint64_t reserved_1; @@ -724,7 +854,7 @@ union fi_opx_hfi1_packet_hdr { struct { /* == quadword 0 == */ uint16_t reserved_0[3]; - uint16_t slid; /* stl.lrh.slid */ + uint16_t _slid; /* stl.lrh.slid */ /* == quadword 1 == */ uint64_t reserved_1; @@ -746,16 +876,482 @@ union fi_opx_hfi1_packet_hdr { } __attribute__((__packed__)) service; /* "reliability service" */ } __attribute__((__aligned__(8))); -static_assert(((offsetof(union fi_opx_hfi1_packet_hdr, rendezvous.flags) % 8) * 8) == FI_OPX_PKT_RZV_FLAGS_SHIFT, - "struct fi_opx_hfi1_packet_hdr.rendezvous.flags offset inconsistent with FLAGS_SHIFT!"); -static inline -fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union fi_opx_hfi1_packet_hdr * const hdr) { +static_assert(((offsetof(union fi_opx_hfi1_packet_hdr_9B, rendezvous.flags) % 8) * 8) == FI_OPX_PKT_RZV_FLAGS_SHIFT, + "struct opx_hfi1_packet_hdr.rendezvous.flags offset inconsistent with FLAGS_SHIFT!"); + + + +/* + HEADER UNION RX POLL + ===================== ============= + 9B 16B 9B 16B + ========= ========== ===== ===== +QW[0] (pad) LRH <-- | <-- RX header union pointer +QW[1] LRH LRH | | +QW[2] BTH BTH | | <- RX common OPX headers +QW[3] BTH/KDETH BTH/KDETH | | +QW[4] KDETH KDETH | | +QW[5] USER/SW USER/SW | | +QW[6] USER/SW USER/SW | | +QW[7] USER/SW USER/SW | | + RHF RHF + + (*) HDRQ entries are 128 bytes (16 quadwords) and include HEADER + RHF + + In RX POLL, pull SLID, DLID and PKTLEN out of 9B/16B LRH. + All other RX stack can use the common OPX headers to access OPX fields. +*/ + +/** + * \brief Converged HFI1 packet header for 9B & 16B (JKR) + * + * The HFI1 packet header is consumed in many places and sometimes overloaded + * for cache and memory allocation reasons. 
+ */ +union opx_hfi1_packet_hdr { + /* STL UNION */ + union opx_hfi1_stl_packet_hdr { + struct { + uint64_t qw0; + struct fi_opx_hfi1_stl_packet_hdr_9B hdr_9B; /* 9B legacy w/ 9 DW KDETH */ + uint64_t qwn[7]; /* 9B(+) QW's */ + } __attribute__((__packed__)) fi_opx_hfi1_stl_packet_hdr_9BP; /* 9B(+14 DWs of KDETH ) */ + struct { + struct fi_opx_hfi1_stl_packet_hdr_16B hdr_16B; /* 16B legacy w/ 9 DW KDETH */ + uint64_t qwn[7]; /* 16B(+) QW's */ + } __attribute__((__packed__)) fi_opx_hfi1_stl_packet_hdr_16BP; /* 16B(+14 DWs of KDETH */ + } __attribute__((__packed__)) stl; /* for alignment/sizes*/ + + /* QUADWORD UNION */ + struct { + uint64_t unused_pad_9B; + uint64_t qw_9B[7]; /* 9B QW's */ + uint64_t qw_9BP[7]; /* 9B(+) QW's */ + }; + uint64_t qw_16B[15]; /* 16B QW's */ + + /* Standard (new) Headers - LRH, BTH, KDETH, SW defined (KDETH) + 15 quadwords */ + struct { + /* LRH union for (padded) 9B and 16B LRH */ + union { + struct { + uint64_t unused_pad_qw0; + union { + uint64_t qw[1]; /* 9B LRH is 1 quadword */ + + struct { /* 9B LRH */ + uint16_t flags; + uint16_t dlid; + uint16_t pktlen; + uint16_t slid; + } __attribute__((__packed__)); + } lrh_9B; + }; + union { + __le64 qw[2]; /* 16B is 2 quadwords */ + + struct { /* 16B LRH */ + __le32 slid:20; + /* This is the packet length and is in units of flits (QWs) for 8B, 10B and 16B + formats, but in units of DWs for 9B formats.*/ + __le32 pktlen:11; + __le32 b:1; + + __le32 dlid:20; + __le32 sc:5; + __le32 rc:3; + __le32 f:1; + __le32 l2:2; + __le32 lt:1; + + __le32 l4:8; + __le32 slid20:4; + __le32 dlid20:4; + __le32 pkey:16; + + __le32 entropy:16; + __le32 age:3; + __le32 cspec:5; + __le32 r:8; + }; + } lrh_16B; + } ; + + /* QW[2-3] BTH 1 1/2 quadwords, 3 dwords */ + struct { + uint8_t opcode; + uint8_t bth_1; + uint16_t pkey; + uint8_t ecn; /* (FECN, BECN, (CSPEC and RC2 for JKR) and reserved) */ + uint8_t qp; + uint8_t unused; + uint8_t rx; + + /* QW[3] starts */ + uint32_t psn; + } __attribute__((__packed__)) bth; + + /* QW[3-4] KDETH 1 1/2 quadwords, 3 dwords */ + struct { + uint32_t offset_ver_tid; + + /* QW[4] starts */ + uint16_t jkey; + uint16_t hcrc; + uint32_t unused; + } __attribute__((__packed__)) kdeth; + + /* QW[5-7] 9B SW defined */ + /* QW[8-14] 9B(+) SW defined */ + /* QW[5-14] 16B SW defined */ + uint64_t sw_defined[10]; + } __attribute__((__packed__)); + + + /* OPX headers + * + * overlay/redefine some standard header fields + * and the SW defined header */ + + + /* OPX RELIABILITY HEADER */ + struct { + uint64_t reserved[3]; /* QW[0-2] */ + + /* QW[3] BTH/KDETH (psn,offset_ver_tid)*/ + uint32_t psn : 24; + uint32_t origin_tx : 8; + uint8_t unused; /* WHY? 
unused but zeroed in model */ + uint8_t reserved_1[3]; + + uint64_t reserved_n[10]; /* QW[4-14] KDETH/SW */ + + } __attribute__((__packed__)) reliability; + + + /* OPX MATCH HEADER */ + struct { + uint64_t reserved[3]; /* QW[0-2] */ + + /* QW[3] BTH/KDETH (psn) */ + uint8_t reserved_0[3]; + uint8_t origin_tx; /* used for FI_DIRECTED_RECV; identifies the endpoint on the node */ + uint32_t reserved_1; + + /* QW[4] KDETH (unused) */ + uint32_t reserved_2; + uint32_t ofi_data; /* used for FI_RX_CQ_DATA */ + + uint64_t reserved_3[2]; /* QW[5-6] SW */ + + uint64_t ofi_tag; /* QW[7] SW last 9B quadword */ + uint64_t reserved_n[7]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) match; + + + /* OPX INJECT HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t message_length; /* only need 5 bits; maximum inject message size is 16 bytes */ + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH*/ + uint64_t reserved_3[2]; + + /* QW[5-6] SW */ + union { + uint8_t app_data_u8[16]; + uint16_t app_data_u16[8]; + uint32_t app_data_u32[4]; + uint64_t app_data_u64[2]; + }; + + uint64_t reserved_n[8]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) inject; + + + /* OPX SEND HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t xfer_bytes_tail; /* only need 4 bits; maximum tail size is 8 bytes (or is it 7?) */ + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH*/ + uint64_t reserved_3[2]; + + /* QW[5] SW */ + uint16_t unused[3]; + uint16_t payload_qws_total; /* TODO - use stl.lrh.pktlen instead (num dws); only need 11 bits; maximum number of payload qw is 10240 / 8 = 1280 */ + + /* QW[6] SW */ + uint64_t xfer_tail; + + uint64_t reserved_n[8]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) send; + + /* OPX MP EAGER 1ST HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t xfer_bytes_tail; /* Maximum tail size is 16 bytes */ + uint8_t reserved_2; + + /* QW[3] BTH/KDETH (offset_ver_tid) */ + uint32_t reserved_3; + uint32_t payload_bytes_total; /* Total length of payload across all mp-eager packets */ + + /* QW[4] KDETH */ + uint64_t reserved_4; + + /* QW[5-6] SW */ + uint64_t xfer_tail[2]; + + uint64_t reserved_n[8]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) mp_eager_first; + + /* OPX MP EAGER NTH HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t xfer_bytes_tail; /* Maximum tail size is 16 bytes */ + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH */ + uint64_t reserved_3[2]; + + /* QW[5-6] SW */ + uint64_t xfer_tail[2]; + + /* QW[7] SW last 9B quadword */ + uint32_t payload_offset; + uint32_t mp_egr_uid; + + uint64_t reserved_n[7]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) mp_eager_nth; + + /* OPX RENDEZVOUS HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t origin_rx; + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH */ + uint64_t reserved_3[2]; + + /* QW[5] SW */ + uint16_t origin_rs; + uint8_t flags; + uint8_t unused[3]; + uint16_t niov; /* number of non-contiguous buffers */ + + /* QW[6] SW */ + uint64_t message_length; /* total length in bytes of all non-contiguous buffers and immediate data */ + + uint64_t reserved_n[8]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) rendezvous; + + /* OPX CTS HEADER */ + struct { + uint64_t reserved[2]; /* 
QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t origin_rx; + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH */ + uint64_t reserved_3[2]; + + /* QW[5-7] SW */ + union { + uint8_t opcode; + struct { + /* QW[5] SW */ + uint8_t opcode; + uint8_t unused0; + uint16_t unused1; + uint16_t ntidpairs; /* number of tidpairs described in the packet payload */ + uint16_t niov; /* number of non-contiguous buffers described in the packet payload */ + + /* QW[6-7] SW */ + uintptr_t origin_byte_counter_vaddr; + uintptr_t target_context_vaddr; + } vaddr; + struct { + /* QW[5] SW */ + uint8_t opcode; + uint8_t unused0; + uint16_t unused1; + uint8_t dt; + uint8_t op; + uint16_t niov; /* number of non-contiguous buffers described in the packet payload */ + + /* QW[6-7] SW */ + uintptr_t rma_request_vaddr; + uint64_t key; + } mr; + struct { + /* QW[5] SW */ + uint8_t opcode; + uint8_t unused0; + uint16_t unused1; + uint8_t unused2; + uint8_t unused3; + uint16_t unused4; /* number of non-contiguous buffers described in the packet payload */ + + /* QW[6-7] SW */ + uintptr_t completion_counter; + uint64_t bytes_to_fence; + } fence; + } target; + + uint64_t reserved_n[7]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) cts; + + /* OPX DPUT HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t origin_rx; + uint8_t reserved_o2; + + /* QW[3] BTH/KDETH */ + uint64_t reserved_3; + + /* QW[4] KDETH/SW */ + uint64_t reserved_4; + + /* QW[5,6,7] KDETH/SW */ + union { + /* Common fields */ + struct { + /* QW[5] KDETH/SW */ + uint8_t opcode; + uint8_t origin_tx; + uint8_t dt; + uint8_t op; + uint16_t last_bytes; + uint16_t bytes; + + /* QW[6,7] SW */ + uint64_t reserved[2]; /* op-specific */ + }; + + struct { + /* QW[5] SW */ + uint64_t reserved; /* Common fields */ + + /* QW[6] SW */ + uintptr_t rma_request_vaddr; + /* QW[7] SW */ + uintptr_t rbuf; + } get; + + struct { + /* QW[5] SW */ + uint64_t reserved; /* Common fields */ + + /* QW[6] SW */ + uintptr_t completion_vaddr; /* struct fi_opx_rzv_completion * */ + /* QW[7] SW */ + uintptr_t rbuf; + } rzv; + + struct { + /* QW[5] SW */ + uint64_t reserved; /* Common fields */ + + /* QW[6,7] SW */ + uintptr_t key; + uintptr_t offset; + } mr; + + struct { + /* QW[5] SW */ + uint64_t reserved; /* Common fields */ + + /* QW[6,7] SW */ + uintptr_t completion_counter; + uint64_t bytes_to_fence; + } fence; + } target; + + uint64_t reserved_n[7]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) dput; + + /* OPX UD HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t opcode; + uint8_t reserved_2; + + uint64_t reserved_n[12]; /* QW[3-14] SW */ + + } __attribute__((__packed__)) ud; + + /* OPX SERVICE HEADER */ + struct { + uint64_t reserved[3]; /* QW[0-2] */ + + /* QW[3] BTH/KDETH (psn,offset_ver_tid) */ + uint32_t range_count; + uint8_t origin_reliability_rx; + uint8_t reserved_1[3]; + + /* QW[4] KDETH (unused) */ + uint32_t reserved_2; + uint32_t unused; /* WHY? 
unused but zeroed in model */ + + /* QW[5-7] SW */ + uint64_t psn_count; + uint64_t psn_start; + uint64_t key; /* fi_opx_reliability_service_flow_key */ + + uint64_t reserved_n[7]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) service; /* "reliability service" */ +} __attribute__((__packed__)) __attribute__((__aligned__(8))); + +static_assert(sizeof(union opx_hfi1_packet_hdr) == sizeof(uint64_t[15]), + "sizeof(union opx_hfi1_packet_hdr) must be 15 qwords!"); + + +static inline +fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union opx_hfi1_packet_hdr * const hdr, + const uint64_t slid) { const union fi_opx_uid uid = { .endpoint_id = hdr->reliability.origin_tx, /* node-scoped endpoint id */ - .lid = hdr->match.slid /* job-scoped node id */ + .lid_3B = 0, + .lid = slid /* job-scoped node id */ }; return uid.fi; @@ -763,31 +1359,39 @@ fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union fi_opx_hfi1_packet_hdr * co static inline size_t -fi_opx_hfi1_packet_hdr_message_length (const union fi_opx_hfi1_packet_hdr * const hdr) +fi_opx_hfi1_packet_hdr_message_length (const union opx_hfi1_packet_hdr * const hdr) { size_t message_length = 0; - switch (hdr->stl.bth.opcode) { + switch (hdr->bth.opcode) { case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ: message_length = hdr->inject.message_length; break; case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER: case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ: message_length = hdr->send.xfer_bytes_tail + hdr->send.payload_qws_total * sizeof(uint64_t); break; case FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST: case FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST: + case FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ: message_length = hdr->mp_eager_first.payload_bytes_total & FI_OPX_HFI1_KDETH_VERSION_OFF_MASK; break; case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS: case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS: + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ: //assert(hdr->rendezvous.niov == 1); message_length = hdr->rendezvous.message_length; break; default: fprintf(stderr, "%s:%s():%d abort. 
hdr->stl.bth.opcode = %02x (%u)\n", - __FILE__, __func__, __LINE__, hdr->stl.bth.opcode, - hdr->stl.bth.opcode); + __FILE__, __func__, __LINE__, hdr->bth.opcode, + hdr->bth.opcode); abort(); break; } @@ -795,6 +1399,202 @@ fi_opx_hfi1_packet_hdr_message_length (const union fi_opx_hfi1_packet_hdr * cons return message_length; } +#ifndef NDEBUG + +#define OPX_JKR_PRINT_16B_PBC(a) opx_jkr_print_16B_pbc((a),__func__) +#define OPX_JKR_PRINT_16B_LRH(a,b) opx_jkr_print_16B_lrh((a),(b),__func__) +#define OPX_JKR_PRINT_16B_BTH(a,b) opx_jkr_print_16B_bth((a),(b),__func__) + +void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func); +void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func); +void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func); + + +static inline +void fi_opx_hfi1_dump_stl_packet_hdr (const union opx_hfi1_packet_hdr * hdr, + const enum opx_hfi1_type hfi1_type, + const char * fn, const unsigned ln) +{ + +#if __GNUC__ > 9 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Waddress-of-packed-member" +#endif + const uint64_t * const qw = (uint64_t *)hdr; +#if __GNUC__ > 9 +#pragma GCC diagnostic pop +#endif + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u ==== dump stl packet header @ %p [%016lx %016lx %016lx %016lx]\n", fn, ln, hdr, qw[0], qw[1], qw[2], qw[3]); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.flags ............. 0x%04hx\n", fn, ln, hdr->lrh_9B.flags); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.dlid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.dlid, hdr->lrh_9B.dlid, ntohs(hdr->lrh_9B.dlid)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.pktlen ............ 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.pktlen, hdr->lrh_9B.pktlen, ntohs(hdr->lrh_9B.pktlen)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.slid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.slid, hdr->lrh_9B.slid, ntohs(hdr->lrh_9B.slid)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u\n", fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.opcode ............ 0x%02x \n", fn, ln, hdr->bth.opcode); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.bth_1 ............. 0x%02x \n", fn, ln, hdr->bth.bth_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.pkey .............. 0x%04hx \n", fn, ln, hdr->bth.pkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.ecn ............... 0x%02x \n", fn, ln, hdr->bth.ecn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.qp ................ 0x%02x \n", fn, ln, hdr->bth.qp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.unused ............ 0x%02x \n", fn, ln, hdr->bth.unused); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.rx ................ 0x%02x \n", fn, ln, hdr->bth.rx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u\n", fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.psn ............... 0x%08x \n", fn, ln, hdr->bth.psn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.offset_ver_tid .. 0x%08x\n", fn, ln, hdr->kdeth.offset_ver_tid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u\n", fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.jkey ............ 0x%04hx\n", fn, ln, hdr->kdeth.jkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.hcrc ............ 
0x%04hx\n", fn, ln, hdr->kdeth.hcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.unused .......... 0x%08x\n", fn, ln, hdr->kdeth.unused); + + return; +} + +static inline +void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, + const enum opx_hfi1_type hfi1_type, + const char * fn, const unsigned ln) +{ + const uint64_t * const qw = (uint64_t *)hdr; + const pid_t pid = getpid(); + //fi_opx_hfi1_dump_stl_packet_hdr (hdr, hfi1_type, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u ==== dump packet header @ %p [%016lx %016lx %016lx %016lx]\n", pid, fn, ln, hdr, qw[0], qw[1], qw[2], qw[3]); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.flags ........... 0x%04hx\n", pid, fn, ln, hdr->lrh_9B.flags); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.dlid ............ 0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.dlid, hdr->lrh_9B.dlid, ntohs(hdr->lrh_9B.dlid)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.pktlen .......... 0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.pktlen, hdr->lrh_9B.pktlen, ntohs(hdr->lrh_9B.pktlen)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.slid ............ 0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.slid, hdr->lrh_9B.slid, ntohs(hdr->lrh_9B.slid)); + } else { + OPX_JKR_PRINT_16B_LRH(hdr->qw_16B[0], hdr->qw_16B[1]); + OPX_JKR_PRINT_16B_BTH(hdr->qw_16B[2], hdr->qw_16B[3]); + } + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .stl.bth.opcode ...... 0x%02x (%s)\n", pid, fn, ln, + hdr->bth.opcode, opx_hfi1_bth_opcode_to_string((uint16_t)hdr->bth.opcode)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.bth_1 .......... 0x%02x \n", pid, fn, ln, hdr->bth.bth_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.pkey .......... 0x%04hx\n", pid, fn, ln, hdr->bth.pkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.ecn .......... 0x%02x \n", pid, fn, ln, hdr->bth.ecn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.qp .......... 0x%02x \n", pid, fn, ln, hdr->bth.qp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.unused .......... 0x%02x \n", pid, fn, ln, hdr->bth.unused); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.rx .......... 0x%02x \n", pid, fn, ln, hdr->bth.rx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.psn .......... 0x%08x \n", pid, fn, ln, hdr->bth.psn); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .kdeth.offset_ver_tid. 0x%08x \n", pid, fn, ln, hdr->kdeth.offset_ver_tid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .jkey .......... 0x%04hx \n", pid, fn, ln, hdr->kdeth.jkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .hcrc .......... 0x%04hx \n", pid, fn, ln, hdr->kdeth.hcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .unused .......... 0x%08x \n", pid, fn, ln, hdr->kdeth.unused); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u ofi_tag, last 9B QW... 
0x%16.16lx\n", pid, fn, ln, hdr->qw_9B[6]); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .match.origin_tx ..... 0x%02x \n", pid, fn, ln, hdr->match.origin_tx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .match.ofi_data ...... 0x%08x \n", pid, fn, ln, hdr->match.ofi_data); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .match.ofi_tag ....... 0x%016lx \n", pid, fn, ln, hdr->match.ofi_tag); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.message_length 0x%04x \n", pid, fn, ln, hdr->inject.message_length); + + switch (hdr->bth.opcode) { + case FI_OPX_HFI_BTH_OPCODE_UD: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .ud.opcode ... 0x%02x (%s) \n", pid, fn, ln, + hdr->ud.opcode, opx_hfi1_ud_opcode_to_string(hdr->ud.opcode)); + break; + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.message_length ... 0x%02x \n", pid, fn, ln, hdr->inject.message_length); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.app_data_u64[0] .. 0x%016lx \n", pid, fn, ln, hdr->inject.app_data_u64[0]); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.app_data_u64[1] .. 0x%016lx \n", pid, fn, ln, hdr->inject.app_data_u64[1]); + break; + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.xfer_bytes_tail .... 0x%02x \n", pid, fn, ln, hdr->send.xfer_bytes_tail); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.payload_qws_total .. 0x%04x \n", pid, fn, ln, hdr->send.payload_qws_total); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.xfer_tail .......... 0x%016lx \n", pid, fn, ln, hdr->send.xfer_tail); + break; + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS: /* calculate (?) total bytes to be transfered */ + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ: /* calculate (?) total bytes to be transfered */ + case FI_OPX_HFI_BTH_OPCODE_RZV_CTS: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.origin .......... 0x%x \n", pid, fn, ln, hdr->cts.origin_rx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.target.vaddr.ntidpairs .......... 0x%x \n", pid, fn, ln, hdr->cts.target.vaddr.ntidpairs); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.target.opcode .......... 
0x%x (%s) \n", pid, fn, ln, + hdr->cts.target.opcode, opx_hfi1_dput_opcode_to_string(hdr->cts.target.opcode)); + break; + default: + break; + } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u ==== QWs 4-7 : [%016lx %016lx %016lx %016lx]\n", pid, fn, ln, qw[4], qw[5], qw[6], qw[7]); + + return; +} + +#else +// Disable the macros +#define OPX_JKR_PRINT_16B_PBC(a) +#define OPX_JKR_PRINT_16B_LRH(a,b) +#define OPX_JKR_PRINT_16B_BTH(a,b) + +void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func); +void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func); +void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func); + +static inline +void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, + const enum opx_hfi1_type hfi1_type, + const char * fn, const unsigned ln) +{ + return; +} + +#endif + +#ifdef OPX_JKR_DEBUG +#define OPX_DEBUG_PRINT_HDR(__hdr,__hfi1_type) \ + if (__hfi1_type & OPX_HFI1_JKR) { \ + OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ + __hdr->qw_16B[1]); \ + OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ + __hdr->qw_16B[3]); \ + } else { \ + fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ + __func__, __LINE__);\ + } + +#define OPX_DEBUG_PRINT_PBC_HDR(__pbc,__hdr,__hfi1_type) \ + if (__hfi1_type & OPX_HFI1_JKR) { \ + OPX_JKR_PRINT_16B_PBC(__pbc); \ + OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ + __hdr->qw_16B[1]); \ + OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ + __hdr->qw_16B[3]); \ + } else { \ + fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ + __func__, __LINE__);\ + } + +#define OPX_DEBUG_PRINT_PBC_HDR_QW(q0,q1,q2,q3,q4,__hfi1_type) \ + if (__hfi1_type & OPX_HFI1_JKR) { \ + OPX_JKR_PRINT_16B_PBC(q0); \ + OPX_JKR_PRINT_16B_LRH(q1,q2); \ + OPX_JKR_PRINT_16B_BTH(q3,q4); \ + } + +#else + +#define OPX_DEBUG_PRINT_HDR(__hdr,__hfi1_type) +#define OPX_DEBUG_PRINT_PBC_HDR(__pbc,__hdr,__hfi1_type) +#define OPX_DEBUG_PRINT_PBC_HDR_QW(q0,q1,q2,q3,q4,__hfi1_type) + +#endif + + union cacheline { uint64_t qw[8]; uint32_t dw[16]; @@ -852,46 +1652,82 @@ struct fi_opx_hmem_iov { - (4 * sizeof(uint32_t))) \ / sizeof(uint32_t)) +#define OPX_IMMEDIATE_BYTE_COUNT_SHIFT (5) +#define OPX_IMMEDIATE_BYTE_COUNT_MASK (0xE0) +#define OPX_IMMEDIATE_QW_COUNT_SHIFT (2) +#define OPX_IMMEDIATE_QW_COUNT_MASK (0x1C) +#define OPX_IMMEDIATE_BLOCK_SHIFT (1) +#define OPX_IMMEDIATE_BLOCK_MASK (0x02) +#define OPX_IMMEDIATE_TAIL_SHIFT (0) +#define OPX_IMMEDIATE_TAIL_MASK (0x01) +#define OPX_IMMEDIATE_TAIL_BYTE_COUNT (7) + union fi_opx_hfi1_rzv_rts_immediate_info { uint64_t qw0; struct { - uint8_t byte_count; /* only need 3 bits (0..7 bytes) */ - uint8_t qw_count; /* only need 3 bits (0..7 quadwords) */ - uint8_t block_count; /* only need 1 bits (0 or 1) */ - uint8_t end_block_count;/* only need 1 bits (0 or 1) */ - uint32_t unused; + uint8_t tail_bytes[7]; + uint8_t count; }; }; -union fi_opx_hfi1_packet_payload { - uint8_t byte[FI_OPX_HFI1_PACKET_MTU]; +static_assert(sizeof(((union fi_opx_hfi1_rzv_rts_immediate_info *)0)->tail_bytes) == OPX_IMMEDIATE_TAIL_BYTE_COUNT, + "sizeof(immediate_info->tail_bytes) must be equal to OPX_IMMEDIATE_TAIL_BYTE_COUNT!"); + +/* Cache "blocked" payloads in 16B are currently "tricky". + * The sender will always send 1 QW of header after SOP so STORE'ing + * a full cacheline block is not possible. The payload will + * arrive cacheline aligned in the eager buffer but not in the + * same "blocks" as written. 
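The OPX_IMMEDIATE_* shifts and masks above pack what used to be four separate byte-wide counters (byte/qw/block/tail) into the single "count" byte of union fi_opx_hfi1_rzv_rts_immediate_info. A minimal sketch of packing and unpacking that byte, assuming only the macros defined above; the opx_immediate_* helper names are hypothetical, not provider API:

static inline uint8_t opx_immediate_count_pack(const uint8_t byte_count, /* 0..7 trailing bytes */
					       const uint8_t qw_count,   /* 0..7 quadwords */
					       const uint8_t block,      /* 0 or 1 full block */
					       const uint8_t tail)       /* 0 or 1 tail qw */
{
	return (uint8_t) ((byte_count << OPX_IMMEDIATE_BYTE_COUNT_SHIFT) |
			  (qw_count << OPX_IMMEDIATE_QW_COUNT_SHIFT) |
			  (block << OPX_IMMEDIATE_BLOCK_SHIFT) |
			  (tail << OPX_IMMEDIATE_TAIL_SHIFT));
}

static inline uint8_t opx_immediate_byte_count(const uint8_t count)
{
	return (count & OPX_IMMEDIATE_BYTE_COUNT_MASK) >> OPX_IMMEDIATE_BYTE_COUNT_SHIFT;
}

static inline uint8_t opx_immediate_qw_count(const uint8_t count)
{
	return (count & OPX_IMMEDIATE_QW_COUNT_MASK) >> OPX_IMMEDIATE_QW_COUNT_SHIFT;
}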
+ * + * For example, contiguous rzv: + * + * STORE(tag + 7 qw's of CACHELINE 0 unused[1], not unused[2] as in 9B above) + * optionally STORE(icrc/tail) if no more immediate data + * + * STORE(full block of immediate fragment unaligned data) + * STORE(full block of immediate data) + * STORE(full block of immediate end data) + * STORE(icrc/tail) + */ + +struct opx_payload_rzv_contig { + /* ==== CACHE LINE 0 ==== */ + + uintptr_t src_vaddr; + uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ + uint64_t src_device_id; + uint64_t src_iface; + uint64_t immediate_info; + uintptr_t origin_byte_counter_vaddr; + uint64_t unused; + + /* ==== CACHE LINE 1 (WFR/9B only) ==== */ union { struct { - /* ==== CACHE LINE 0 ==== */ + uint8_t immediate_byte[8]; + uint64_t immediate_qw[7]; + }; - uintptr_t src_vaddr; - uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ - uint64_t src_device_id; - uint64_t src_iface; - uint64_t immediate_info; - uintptr_t origin_byte_counter_vaddr; - uint64_t unused[2]; + union cacheline cache_line_1; + }; - /* ==== CACHE LINE 1 ==== */ - union { - struct { - uint8_t immediate_byte[8]; - uint64_t immediate_qw[7]; - }; + /* ==== CACHE LINE 2-127 ==== */ - union cacheline cache_line_1; - }; + union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; - /* ==== CACHE LINE 2-127 ==== */ +}; - union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; +/* 9B and common payload structure */ +union fi_opx_hfi1_packet_payload { + uint8_t byte[FI_OPX_HFI1_PACKET_MTU]; + uint64_t qw[FI_OPX_HFI1_PACKET_MTU>>3]; + union { + struct { + uint64_t contig_9B_padding; + struct opx_payload_rzv_contig contiguous; + }; + struct opx_payload_rzv_contig contiguous_16B; - } contiguous; struct { /* ==== CACHE LINE 0 ==== */ @@ -921,7 +1757,6 @@ union fi_opx_hfi1_packet_payload { /* ==== CACHE LINE 1 ==== */ uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; } tid_cts; - } __attribute__((__aligned__(32))); static_assert(sizeof(union fi_opx_hfi1_packet_payload) <= FI_OPX_HFI1_PACKET_MTU, @@ -942,7 +1777,6 @@ static_assert((offsetof(union fi_opx_hfi1_packet_payload, tid_cts.tidpairs) + "If you added/removed fields in struct tid_cts, you need to adjust FI_OPX_MAX_DPUT_TIDPAIRS!"); - struct fi_opx_hfi1_ue_packet_slist; struct fi_opx_hfi1_ue_packet { /* == CACHE LINE 0 == */ @@ -968,17 +1802,18 @@ struct fi_opx_hfi1_ue_packet { uint32_t unused_cacheline0; - /* == CACHE LINE 1 == */ + /* == CACHE LINE 1, 2 == */ uint64_t unused_cacheline1; - union fi_opx_hfi1_packet_hdr hdr; + union opx_hfi1_packet_hdr hdr; - /* == CACHE LINE 2 == */ - union fi_opx_hfi1_packet_payload payload; + /* == CACHE LINE 3 == */ + union fi_opx_hfi1_packet_payload payload; } __attribute__((__packed__)) __attribute__((aligned(64))); static_assert(offsetof(struct fi_opx_hfi1_ue_packet, unused_cacheline1) == 64, "struct fi_opx_hfi1_ue_packet->unused_cacheline1 should be aligned on cache boundary!"); -static_assert(offsetof(struct fi_opx_hfi1_ue_packet, payload) == 128, + +static_assert(offsetof(struct fi_opx_hfi1_ue_packet, payload) == 192, "struct fi_opx_hfi1_ue_packet->payload should be aligned on cache boundary!"); struct fi_opx_hfi1_ue_packet_slist { @@ -1087,6 +1922,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_hfi1_ue_packet_slist_remove_item (struct fi return next_item; } +#if 0 static inline void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr, const char * fn, const unsigned ln) { @@ -1131,6 +1967,8 
@@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr break; case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ: fprintf(stderr, "(%d) %s():%u .inject.message_length .............. 0x%02x\n", pid, fn, ln, hdr->inject.message_length); fprintf(stderr, "(%d) %s():%u .inject.app_data_u64[0] 0x%016lx\n", @@ -1140,6 +1978,8 @@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr break; case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER: case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ: fprintf(stderr, "(%d) %s():%u .send.xfer_bytes_tail ............... 0x%02x\n", pid, fn, ln, hdr->send.xfer_bytes_tail); fprintf(stderr, "(%d) %s():%u .send.payload_qws_total 0x%04x\n", @@ -1197,6 +2037,8 @@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr break; case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS: case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS: /* calculate (?) total bytes to be transfered */ + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ: /* calculate (?) total bytes to be transfered */ break; default: break; @@ -1206,5 +2048,6 @@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr return; } +#endif #endif /* _FI_PROV_OPX_HFI1_PACKET_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h index 72ae427de94..081058eb5da 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h @@ -92,7 +92,8 @@ unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep *opx_ep, volatile uint64_t *rhe_ptr, volatile uint32_t *rhf_ptr, const uint32_t rhf_msb, const uint32_t rhf_lsb, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); __OPX_FORCE_INLINE__ void fi_opx_hfi1_update_hdrq_head_register(struct fi_opx_ep *opx_ep, const uint64_t hdrq_offset) @@ -107,12 +108,16 @@ void fi_opx_hfi1_update_hdrq_head_register(struct fi_opx_ep *opx_ep, const uint6 __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_ud_eager_packet(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, - const uint64_t rhf) + const union opx_hfi1_packet_hdr *const hdr, + const uint64_t rhf, + const uint64_t slid, + const uint64_t dlid, + const uint16_t pktlen, + const enum opx_hfi1_type hfi1_type) { /* "eager" packet - has payload */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); - const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,hfi1_type); + const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf,hfi1_type); const uint8_t *const __attribute__((unused)) payload = (uint8_t *)((uintptr_t)opx_ep->rx->egrq.base_addr + (uintptr_t)egrbfr_index * (uintptr_t)opx_ep->rx->egrq.elemsz + @@ -120,13 +125,6 @@ void fi_opx_hfi1_handle_ud_eager_packet(struct fi_opx_ep *opx_ep, assert(payload != NULL); - /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t __attribute__((unused)) total_bytes_to_copy = - (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t 
__attribute__((unused)) payload_bytes_to_copy = - total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); - /* currently no eager UD packets are defined */ fprintf(stderr, "%s:%s():%d bad ud eager packet; abort.\n", __FILE__, __func__, __LINE__); @@ -142,7 +140,8 @@ void fi_opx_hfi1_handle_ud_eager_packet(struct fi_opx_ep *opx_ep, static void fi_opx_hfi1_handle_ud_ping(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr, + const uint64_t slid) { struct fi_opx_reliability_service *service = opx_ep->reliability->state.service; @@ -165,7 +164,7 @@ void fi_opx_hfi1_handle_ud_ping(struct fi_opx_ep *opx_ep, ->pending_rx_reliability_pool); assert(ping_op != NULL); ping_op->ud_opcode = hdr->ud.opcode; - ping_op->slid = (uint64_t)hdr->stl.lrh.slid; + ping_op->slid = slid; ping_op->rx = (uint64_t)hdr->service.origin_reliability_rx; ping_op->key.key = hdr->service.key; ping_op->psn_count = hdr->service.psn_count; @@ -185,7 +184,7 @@ void fi_opx_hfi1_handle_ud_ping(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_ud_ack(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { const uint64_t key = hdr->service.key; const uint64_t psn_count = hdr->service.psn_count; @@ -198,7 +197,7 @@ void fi_opx_hfi1_handle_ud_ack(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_ud_nack(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { const uint64_t key = hdr->service.key; const uint64_t psn_count = hdr->service.psn_count; @@ -211,15 +210,19 @@ void fi_opx_hfi1_handle_ud_nack(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_handle_ud_packet(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, - const uint64_t rhf) + const uint64_t rhf, + const uint64_t slid, + const uint64_t dlid, + const uint16_t pktlen, + const enum opx_hfi1_type hfi1_type) { - if (OFI_LIKELY(!OPX_RHF_IS_USE_EGR_BUF(rhf))) { - /* "header only" packet - no payload */ + /* "header only" packet - no payload */ + if (OFI_LIKELY(!OPX_RHF_IS_USE_EGR_BUF(rhf, hfi1_type))) { switch(hdr->ud.opcode) { case FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING: - fi_opx_hfi1_handle_ud_ping(opx_ep, hdr); + fi_opx_hfi1_handle_ud_ping(opx_ep, hdr, slid); break; case FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK: fi_opx_hfi1_handle_ud_ack(opx_ep, hdr); @@ -248,12 +251,12 @@ unsigned fi_opx_hfi1_handle_ud_packet(struct fi_opx_ep *opx_ep, fprintf(stderr, "%s:%s():%d bad ud header packet; abort.\n", __FILE__, __func__, __LINE__); abort(); - }; + } } else { - fi_opx_hfi1_handle_ud_eager_packet(opx_ep, hdr, rhf); + fi_opx_hfi1_handle_ud_eager_packet(opx_ep, hdr, rhf, slid, dlid, pktlen, hfi1_type); } - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; @@ -264,7 +267,7 @@ unsigned fi_opx_hfi1_handle_ud_packet(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_error_inject(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf) { @@ -273,11 +276,11 @@ unsigned fi_opx_hfi1_error_inject(struct 
fi_opx_ep *opx_ep, * Error injection .. purposefully drop packet */ if (OFI_UNLIKELY(FI_OPX_RELIABILITY_RX_DROP_PACKET(&opx_ep->reliability->state, hdr))) { - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,OPX_HFI1_TYPE); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; - if (OPX_RHF_IS_USE_EGR_BUF(rhf)) { /* eager */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); + if (OPX_RHF_IS_USE_EGR_BUF(rhf,OPX_HFI1_TYPE)) { /* eager */ + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,OPX_HFI1_TYPE); const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { OPX_HFI1_BAR_STORE(opx_ep->rx->egrq.head_register, @@ -296,27 +299,30 @@ unsigned fi_opx_hfi1_error_inject(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, - uint8_t *origin_rx, const uint64_t rhf) + uint8_t *origin_rx, const uint64_t rhf, + const uint64_t slid, + const uint16_t pktlen, + const enum opx_hfi1_type hfi1_type) { /* * Check for 'reliability' exceptions */ - const uint64_t slid = hdr->stl.lrh.slid; const uint64_t origin_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(hdr); const uint64_t psn = FI_OPX_HFI1_PACKET_PSN(hdr); + if (OFI_UNLIKELY(fi_opx_reliability_rx_check(&opx_ep->reliability->state, slid, origin_tx, psn, origin_rx) == FI_OPX_RELIABILITY_EXCEPTION)) { - if (!OPX_RHF_IS_USE_EGR_BUF(rhf)) { + if (!OPX_RHF_IS_USE_EGR_BUF(rhf,hfi1_type)) { /* no payload */ fi_opx_reliability_rx_exception(&opx_ep->reliability->state, slid, - origin_tx, psn, &opx_ep->ep_fid, hdr, NULL); + origin_tx, psn, &opx_ep->ep_fid, hdr, NULL, pktlen, hfi1_type); } else { /* has payload */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); - const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,hfi1_type); + const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf,hfi1_type); const uint8_t *const payload = (uint8_t *)((uintptr_t)opx_ep->rx->egrq.base_addr + (uintptr_t)egrbfr_index * @@ -326,7 +332,7 @@ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, assert(payload != NULL); fi_opx_reliability_rx_exception(&opx_ep->reliability->state, slid, origin_tx, psn, &opx_ep->ep_fid, hdr, - payload); + payload, pktlen, hfi1_type); const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { @@ -336,7 +342,7 @@ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, } } - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; fi_opx_hfi1_update_hdrq_head_register(opx_ep, hdrq_offset); @@ -348,40 +354,43 @@ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, const int lock_required, const enum ofi_reliability_kind reliability, const uint8_t origin_rx, - const uint64_t rhf) + const uint64_t rhf, + const enum opx_hfi1_type 
hfi1_type, + const uint64_t slid, + const uint16_t pktlen) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "================ received a packet from the fabric\n"); - if (!OPX_RHF_IS_USE_EGR_BUF(rhf)) { - if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { + if (!OPX_RHF_IS_USE_EGR_BUF(rhf,hfi1_type)) { + if (OFI_LIKELY(FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { /* "header only" packet - no payload */ fi_opx_ep_rx_process_header(&opx_ep->ep_fid, hdr, NULL, 0, FI_TAGGED, - FI_OPX_HFI_BTH_OPCODE_TAG_INJECT, + opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); - - } else if (opcode > FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { + lock_required, reliability, + hfi1_type, slid); + } else if (FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) { /* all other "tag" packets */ fi_opx_ep_rx_process_header_tag(&opx_ep->ep_fid, hdr, NULL, 0, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); } else { fi_opx_ep_rx_process_header_msg(&opx_ep->ep_fid, hdr, NULL, 0, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); } } else { /* "eager" packet - has payload */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); - const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,hfi1_type); + const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf,hfi1_type); const uint8_t *const payload = (uint8_t *)((uintptr_t)opx_ep->rx->egrq.base_addr + (uintptr_t)egrbfr_index * (uintptr_t)opx_ep->rx->egrq.elemsz + @@ -390,31 +399,43 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, assert(payload != NULL); /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = - (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = - total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy; + size_t payload_bytes_to_copy; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(pktlen); + total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = + total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + } else { + lrh_pktlen_le = pktlen; + total_bytes_to_copy = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing tail/icrc QW*/ + payload_bytes_to_copy = + total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + } - if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)) { + if (OFI_LIKELY(FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)) { fi_opx_ep_rx_process_header( &opx_ep->ep_fid, hdr, (const union fi_opx_hfi1_packet_payload *const)payload, - payload_bytes_to_copy, FI_TAGGED, FI_OPX_HFI_BTH_OPCODE_TAG_EAGER, + payload_bytes_to_copy, FI_TAGGED, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); - } else if (opcode > FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) { /* all other "tag" packets */ + lock_required, reliability, + hfi1_type, + slid); + } else if (FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) { /* all other "tag" packets */ fi_opx_ep_rx_process_header_tag(&opx_ep->ep_fid, hdr, payload, payload_bytes_to_copy, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, 
reliability, hfi1_type, slid); } else { fi_opx_ep_rx_process_header_msg(&opx_ep->ep_fid, hdr, payload, payload_bytes_to_copy, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); } const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { @@ -426,7 +447,7 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, FLIGHT_RECORDER_PACKET_HDR(opx_ep->fr, FR_EVENT_HFI1_POLL_ONCE, hdr); } - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; fi_opx_hfi1_update_hdrq_head_register(opx_ep, hdrq_offset); @@ -449,14 +470,12 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, opx_ep->reliability->state.rx, psn - opx_ep->reliability->service.preemptive_ack_rate + 1, /* psn_start */ opx_ep->reliability->service.preemptive_ack_rate, /* psn_count */ - hdr, origin_rx); - - } else if (hdr->stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA && - ((ntohl(hdr->stl.bth.psn) & 0x80000000) || - (hdr->dput.target.opcode == FI_OPX_HFI_DPUT_OPCODE_PUT))) { - /* Send preemptive ACKs on Rendezvous FI_OPX_HFI_DPUT_OPCODE_PUT or - * on the final packet of a Rendezvous SDMA writev (the high bit - * of the PSN - the Acknowledge Request bit - is set) + hdr, origin_rx, slid, hfi1_type); + + } else if (hdr->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA && + (ntohl(hdr->bth.psn) & 0x80000000)) { + /* Send preemptive ACKs on Rendezvous Data packets when + * the high bit of the PSN - the Acknowledge Request bit - is set */ uint32_t psn_count = MAX(MIN(opx_ep->reliability->service.preemptive_ack_rate, psn), 1); assert(psn >= psn_count - 1); @@ -466,16 +485,10 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, opx_ep->reliability->state.rx, psn - psn_count + 1, /* psn_start */ psn_count, /* psn_count */ - hdr, origin_rx); + hdr, origin_rx, slid, hfi1_type); } } -/* - * ============================================================================ - * Write CSR software trigger from host software by writing MISC_GPIO_OUT = 0x4 - * ============================================================================ -*/ - /* * ============================================================================ * THIS IS THE HFI POLL FUNCTION @@ -484,7 +497,8 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, const enum ofi_reliability_kind reliability, - const uint64_t hdrq_mask) + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const uint64_t local_hdrq_mask = (hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME) ? @@ -496,60 +510,94 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, volatile uint32_t *rhf_ptr = opx_ep->rx->hdrq.rhf_base + hdrq_offset; const uint64_t rhf_rcvd = *((volatile uint64_t *)rhf_ptr); + uint32_t slid, dlid; + uint16_t pktlen; const uint64_t rhf_seq = opx_ep->rx->state.hdrq.rhf_seq; /* The software must look at the RHF.RcvSeq. * If it detects the next sequence number in the entry, the new header * was written into memory. Otherwise, do not process RHF - no packet. 
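The RHF sequence check described in the comment above is the whole synchronization contract between the HFI's DMA engine and the poll loop. A sketch of the pattern follows, with the caveat that the RcvSeq bit position and the 1..13 wrap range shown here are assumptions about a WFR-style RHF; the provider's real, hardware-type-aware logic lives behind OPX_RHF_SEQ_MATCH() and OPX_RHF_SEQ_INCREMENT():

static inline uint64_t example_rhf_rcvseq(const uint64_t rhf)
{
	return (rhf >> 28) & 0x0Full;		/* assumed RcvSeq field position */
}

static inline uint64_t example_rhf_seq_increment(const uint64_t seq)
{
	return (seq < 13) ? (seq + 1) : 1;	/* assumed 1..13 wrap */
}

/* Poll pattern: parse the header-queue entry only after the expected
 * sequence number shows up in the RHF, then advance the expectation. */
static inline int example_rhf_try_advance(uint64_t *expected_seq, const uint64_t rhf_rcvd)
{
	if (example_rhf_rcvseq(rhf_rcvd) != *expected_seq) {
		return 0;	/* no new packet written yet */
	}
	*expected_seq = example_rhf_seq_increment(*expected_seq);
	return 1;		/* header fully written; safe to read */
}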
*/ - if (OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd)) { + if (OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type)) { const uint32_t rhf_msb = rhf_rcvd >> 32; +#ifdef OPX_JKR_DEBUG FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "OPX_RHF_SEQ_MATCH = %d rhf_rcvd = %#lx rhf_seq = %#lx\n", - OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd), rhf_rcvd, rhf_seq); + OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type), rhf_rcvd, rhf_seq); +#endif const uint64_t hdrq_offset_dws = (rhf_msb >> 12) & 0x01FFu; - uint32_t *pkt = (uint32_t *)rhf_ptr - FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS + - 2 + /* rhf field size in dw */ - hdrq_offset_dws; - - const union fi_opx_hfi1_packet_hdr *const hdr = (union fi_opx_hfi1_packet_hdr *)pkt; + uint32_t *pkt; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + assert(hdrq_offset_dws); /* need padding before this header */ + pkt = (uint32_t *)rhf_ptr - FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS + + 2 /* rhf field size in dw */ + - 2 /* sizeof(uint64_t) in dw, offset back to align + for the 9B padding in the header union */ + + hdrq_offset_dws; + } else { + assert(((union opx_jkr_rhf)rhf_rcvd).L2Type == 0x2); + pkt = (uint32_t *)rhf_ptr - FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS + + 2 /* rhf field size in dw */ + + hdrq_offset_dws; + /* Assert we got full expected kdeth split header. + * In the future, we may handle this so this is + * not part of OPX_RHF_CHECK_HEADER */ + assert(!(((union opx_jkr_rhf)rhf_rcvd).KHdrLenErr)); + } - const uint8_t opcode = hdr->stl.bth.opcode; + const union opx_hfi1_packet_hdr *const hdr = (union opx_hfi1_packet_hdr *)pkt; + const uint8_t opcode = hdr->bth.opcode; /* If there's an RHF/RHE error or a bad header detected, handle the error and return */ - if(OPX_RHF_CHECK_HEADER(rhf_rcvd, hdr)) { + if(OPX_RHF_CHECK_HEADER(rhf_rcvd, hdr, hfi1_type)) { const uint32_t rhf_lsb = rhf_rcvd & 0xFFFFFFFF; volatile uint64_t *rhe_ptr = opx_ep->rx->hdrq.rhe_base; - return fi_opx_hfi1_handle_poll_error(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr); + return fi_opx_hfi1_handle_poll_error(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr, hfi1_type); } + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = (uint32_t)hdr->lrh_9B.slid; + pktlen = (uint32_t)hdr->lrh_9B.pktlen; /* pass it down unchanged. lower layers handle BE/LE */ + dlid = (uint32_t)hdr->lrh_9B.dlid; + } else { + slid = htons((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid)); /* BE for lower layers */ + pktlen = (uint16_t) hdr->lrh_16B.pktlen; /* pass it down unchanged. lower layers handle BE/LE */ + dlid = htons(((hdr->lrh_16B.dlid20 << 20) | (hdr->lrh_16B.dlid))); /* BE for lower layers */ + } + + if (OFI_UNLIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_UD)) { assert(reliability == OFI_RELIABILITY_KIND_ONLOAD); /* * process "unreliable datagram" packets first - before all the * software reliability protocol checks. 
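The slid/dlid math just above is the subtle part of the 9B/16B split: a 16B LRH carries each LID as a 20-bit low field plus a 4-bit high field, while a 9B LRH carries a single big-endian 16-bit LID. A sketch of the reassembly, mirroring the expressions above; the helper name is hypothetical:

static inline uint32_t example_16b_lid(const uint32_t lid_low20, const uint32_t lid_high4)
{
	return (lid_high4 << 20) | lid_low20;	/* up to 24 bits, host byte order */
}

/* e.g. slid = htons(example_16b_lid(hdr->lrh_16B.slid, hdr->lrh_16B.slid20));
 * the htons() presents the result big-endian so lower layers can treat
 * 9B and 16B sources uniformly. */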
*/ - return fi_opx_hfi1_handle_ud_packet(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd); + return fi_opx_hfi1_handle_ud_packet(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd, + slid, dlid, pktlen, hfi1_type); } uint8_t origin_rx; /* - * check for software reliability events - */ + * check for software reliability events + */ /* This error inject call will compile out in optimized builds */ unsigned rc = fi_opx_hfi1_error_inject(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd); if (OFI_UNLIKELY(rc != -1)) { return rc; } + rc = fi_opx_hfi1_handle_reliability(opx_ep, hdr, rhf_seq, - hdrq_offset, &origin_rx, rhf_rcvd); + hdrq_offset, &origin_rx, rhf_rcvd, slid, pktlen, hfi1_type); if (OFI_UNLIKELY(rc != -1)) { return rc; } + fi_opx_hfi1_handle_packet(opx_ep, opcode, hdr, rhf_seq, - hdrq_offset, lock_required, reliability, origin_rx, rhf_rcvd); + hdrq_offset, lock_required, reliability, origin_rx, rhf_rcvd, + hfi1_type, slid, pktlen); return 1; /* one packet was processed */ } return 0; @@ -561,23 +609,34 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, * ============================================================================ */ static inline -void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) +void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); uint64_t pos; struct opx_shm_packet* packet = opx_shm_rx_next(&opx_ep->rx->shm, &pos); - union fi_opx_hfi1_packet_hdr * hdr = (packet) ? - (union fi_opx_hfi1_packet_hdr *) packet->data : NULL; + union opx_hfi1_packet_hdr * hdr = (packet) ? + (union opx_hfi1_packet_hdr *) packet->data : NULL; + uint32_t slid; while (hdr != NULL) { - const uint8_t opcode = hdr->stl.bth.opcode; + const uint8_t opcode = hdr->bth.opcode; uint32_t origin_reliability_rx = hdr->service.origin_reliability_rx; /* DAOS HFI Rank Support: */ if (!opx_ep->daos_info.hfi_rank_enabled) { - assert(hdr->stl.lrh.dlid == opx_ep->rx->self.uid.lid); - assert(hdr->stl.bth.rx == opx_ep->rx->self.hfi1_rx || - hdr->stl.bth.rx == opx_ep->rx->self.reliability_rx); +#ifndef NDEBUG + uint32_t dlid __attribute__ ((unused)); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + dlid = hdr->lrh_9B.dlid; + } else { + dlid = htons((hdr->lrh_16B.dlid20 << 20) | (hdr->lrh_16B.dlid)); + } + + assert(dlid == opx_ep->rx->self.uid.lid); + assert(hdr->bth.rx == opx_ep->rx->self.hfi1_rx || + hdr->bth.rx == opx_ep->rx->self.reliability_rx); +#endif } else { /* DAOS Persistent Address Support: * No Context Resource Management Framework is supported by OPX to @@ -596,8 +655,16 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) * change due to support for Persistent Addressing. The only reliable field * in the fi_addr is the hfi1_unit. 
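Both the HFI eager path above and the shm path below convert the LRH packet length into payload bytes, and the units differ by wire format: 9B reports big-endian 4-byte words covering header + payload + ICRC, while 16B reports 8-byte flits including the tail/ICRC quadword. A sketch of that arithmetic, assuming the stl header struct sizes from fi_opx_hfi1_packet.h and ntohs() from <arpa/inet.h>; the function names are illustrative only:

static inline size_t example_payload_bytes_9B(const uint16_t pktlen_be)
{
	const size_t total = ((size_t) ntohs(pktlen_be) - 1) * 4;	/* drop the trailing icrc word */
	return total - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B);
}

static inline size_t example_payload_bytes_16B(const uint16_t pktlen_flits)
{
	const size_t total = ((size_t) pktlen_flits - 1) * 8;	/* drop the tail/icrc quadword */
	return total - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B);
}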
*/ - assert(hdr->stl.lrh.dlid == opx_ep->rx->self.uid.lid); +#ifndef NDEBUG + uint32_t dlid __attribute__ ((unused)); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + dlid = hdr->lrh_9B.dlid; + } else { + dlid = htons(hdr->lrh_16B.dlid20 << 20 | hdr->lrh_16B.dlid); + } + assert(dlid == opx_ep->rx->self.uid.lid); +#endif /* origin_reliability_rx is HFI rank instead of HFI rx */ origin_reliability_rx = packet->origin_rank; @@ -610,14 +677,22 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) opx_ep->daos_info.rank, opx_ep->rx->shm.segment_key); } - if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = hdr->lrh_9B.slid; + } else { + slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + } + + if (FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { fi_opx_ep_rx_process_header(ep, hdr, NULL, 0, FI_TAGGED, - FI_OPX_HFI_BTH_OPCODE_TAG_INJECT, + opcode, (const uint8_t) origin_reliability_rx, OPX_INTRANODE_TRUE, lock_required, - OFI_RELIABILITY_KIND_NONE); + OFI_RELIABILITY_KIND_NONE, + hfi1_type, + slid); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_UD) { const uint8_t ud_opcode = hdr->ud.opcode; @@ -642,17 +717,28 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) const uint8_t * const payload = (uint8_t *)(hdr+1); /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy; + size_t payload_bytes_to_copy; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + } else { + lrh_pktlen_le = hdr->lrh_16B.pktlen; + total_bytes_to_copy = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing tail/icrc QW*/ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + } - if (opcode >= FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { + if (FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) { fi_opx_ep_rx_process_header_tag(ep, hdr, payload, payload_bytes_to_copy, opcode, (const uint8_t) origin_reliability_rx, OPX_INTRANODE_TRUE, - lock_required, OFI_RELIABILITY_KIND_NONE); + lock_required, OFI_RELIABILITY_KIND_NONE, + hfi1_type, slid); } else { @@ -660,72 +746,16 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) payload_bytes_to_copy, opcode, (const uint8_t) origin_reliability_rx, OPX_INTRANODE_TRUE, - lock_required, OFI_RELIABILITY_KIND_NONE); + lock_required, OFI_RELIABILITY_KIND_NONE, + hfi1_type, slid); } } opx_shm_rx_advance(&opx_ep->rx->shm, (void *)hdr, pos); packet = opx_shm_rx_next(&opx_ep->rx->shm, &pos); - hdr = (packet) ? 
(union fi_opx_hfi1_packet_hdr *) packet->data : NULL; - } -} - - - -__OPX_FORCE_INLINE__ -void fi_opx_hfi1_poll_many (struct fid_ep *ep, - const int lock_required, - const uint64_t caps, - const enum ofi_reliability_kind reliability, - const uint64_t hdrq_mask) -{ - /* All callers to this function should have already obtained the necessary lock */ - assert(!lock_required); - - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - - static const unsigned hfi1_poll_max = 256; - unsigned hfi1_poll_count = 0; - unsigned packets = 0; - - - if ((caps & FI_LOCAL_COMM) || (caps == 0)) { - fi_opx_shm_poll_many(ep, 0); - } - - if ((caps & FI_REMOTE_COMM) || (caps == 0)) { - do { - packets = fi_opx_hfi1_poll_once(ep, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask); - } while ((packets > 0) && (hfi1_poll_count++ < hfi1_poll_max)); - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) { /* compile-time constant expression */ - - struct fi_opx_reliability_service *service = opx_ep->reliability->state.service; - - union fi_opx_timer_state *timer = &service->tx.timer; - union fi_opx_timer_stamp *timestamp = &service->tx.timestamp; - uint64_t compare = fi_opx_timer_now(timestamp, timer); - - //TODO: There needs to be feedback from the replay buffer pool into this following if as well - // If the pool is getting full, then send pings out more frequently - - if (OFI_UNLIKELY(compare > service->usec_next)) { - // Drain all coalesced pings - fi_opx_hfi_rx_reliablity_process_requests(ep, PENDING_RX_RELIABLITY_COUNT_MAX); - fi_reliability_service_ping_remote(ep, service); - // Fetch the timer again as it could have taken us a while to get through reliability - fi_opx_timer_now(timestamp, timer); - service->usec_next = fi_opx_timer_next_event_usec(timer, timestamp, service->usec_max); - }// End timer fired - - - } + hdr = (packet) ? + (union opx_hfi1_packet_hdr *) packet->data : NULL; } - - fi_opx_compiler_msync_writes(); //Workaround for STL-62043 - - - return; } __OPX_FORCE_INLINE__ @@ -750,7 +780,8 @@ void fi_opx_hfi1_poll_sdma_completion(struct fi_opx_ep *opx_ep) hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]->errcode = entry->errcode; hfi->info.sdma.queued_entries[hfi->info.sdma.done_index] = NULL; - assert(entry->status == COMPLETE || entry->status == FREE); + assert(entry->status == COMPLETE || entry->status == FREE || + (entry->status == ERROR && entry->errcode != ECOMM)); // If it is a network error, retry ++hfi->info.sdma.available_counter; hfi->info.sdma.done_index = (hfi->info.sdma.done_index + 1) % (queue_size); if (hfi->info.sdma.done_index == hfi->info.sdma.fill_index) { @@ -763,7 +794,107 @@ void fi_opx_hfi1_poll_sdma_completion(struct fi_opx_ep *opx_ep) "===================================== SDMA POLL COMPLETE\n"); } +__OPX_FORCE_INLINE__ +int opx_is_rhf_empty(struct fi_opx_ep *opx_ep, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) +{ + const uint64_t local_hdrq_mask = (hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME) ? 
+ opx_ep->hfi->info.rxe.hdrq.rx_poll_mask : + hdrq_mask; + const uint64_t hdrq_offset = opx_ep->rx->state.hdrq.head & local_hdrq_mask; + volatile uint32_t *rhf_ptr = opx_ep->rx->hdrq.rhf_base + hdrq_offset; + const uint64_t rhf_rcvd = *((volatile uint64_t *)rhf_ptr); + const uint64_t rhf_seq = opx_ep->rx->state.hdrq.rhf_seq; + + if (!OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type)) { + return 1; + } + return 0; +} +__OPX_FORCE_INLINE__ +void opx_handle_events(struct fi_opx_ep *opx_ep, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) +{ + uint64_t events = *(uint64_t *)(opx_ep->hfi->ctrl->base_info.events_bufbase); + if (events & HFI1_EVENT_FROZEN) { + /* reset context only if RHF queue is empty */ + if (opx_is_rhf_empty(opx_ep, hdrq_mask, hfi1_type)) { + opx_reset_context(opx_ep); + opx_hfi_ack_events(opx_ep->hfi->fd, events); + } else { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "Context frozen: Not resetting because packets are present in receive queue\n"); + } + } +} +__OPX_FORCE_INLINE__ +void fi_opx_hfi1_poll_many (struct fid_ep *ep, + const int lock_required, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) +{ + /* All callers to this function should have already obtained the necessary lock */ + assert(!lock_required); + + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + + static const unsigned hfi1_poll_max = 256; + unsigned hfi1_poll_count = 0; + unsigned packets = 0; + + + if ((caps & FI_LOCAL_COMM) || (caps == 0)) { + fi_opx_shm_poll_many(ep, 0, hfi1_type); + } + + if ((caps & FI_REMOTE_COMM) || (caps == 0)) { + do { + packets = fi_opx_hfi1_poll_once(ep, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, hfi1_type); + } while ((packets > 0) && (hfi1_poll_count++ < hfi1_poll_max)); + + struct fi_opx_reliability_service *service = &opx_ep->reliability->service; + union fi_opx_timer_state *timer = &service->tx.timer; + union fi_opx_timer_stamp *timestamp = &service->tx.timestamp; + uint64_t compare = fi_opx_timer_now(timestamp, timer); + + //TODO: There needs to be feedback from the replay buffer pool into this following if as well + // If the pool is getting full, then send pings out more frequently + + if (OFI_UNLIKELY(compare > service->usec_next)) { + // Drain all coalesced pings + fi_opx_hfi_rx_reliablity_process_requests(ep, PENDING_RX_RELIABLITY_COUNT_MAX); + fi_reliability_service_ping_remote(ep, service); + // Fetch the timer again as it could have taken us a while to get through reliability + compare = fi_opx_timer_now(timestamp, timer); + service->usec_next = fi_opx_timer_next_event_usec(timer, timestamp, service->usec_max); + } // End timer fired + + struct fi_opx_hfi1_context *context = opx_ep->hfi; + timer = &context->link_status_timer; + timestamp = &context->link_status_timestamp; + + if (OFI_UNLIKELY(compare > context->status_check_next_usec)) { + int prev_link_status = context->status_lasterr; + int err = fi_opx_context_check_status(context); + // check for hfi event if link is moving from down to up + if ((prev_link_status != FI_SUCCESS) && (err == FI_SUCCESS)) { // check for hfi event if + context->status_lasterr = FI_SUCCESS; /* clear error */ + opx_handle_events(opx_ep, hdrq_mask, hfi1_type); + } + context->status_check_next_usec = fi_opx_timer_next_event_usec(timer, timestamp, OPX_CONTEXT_STATUS_CHECK_INTERVAL_USEC); + } + } + + fi_opx_compiler_msync_writes(); //Workaround for STL-62043 + + + return; +} #endif /* 
_FI_PROV_OPX_HFI1_PROGRESS_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h index 692a0ed625f..76ce0b47455 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h @@ -48,9 +48,11 @@ #define OPX_SDMA_REPLAY_DATA_IOV_COUNT (1) #define OPX_SDMA_REPLAY_IOV_COUNT (OPX_SDMA_REPLAY_DATA_IOV_COUNT + 1) #define OPX_SDMA_HFI_MAX_IOVS_PER_WRITE (64) + OPX_COMPILE_TIME_ASSERT((OPX_SDMA_HFI_MAX_IOVS_PER_WRITE + 1) == OPX_DEBUG_COUNTERS_WRITEV_MAX, "OPX_DEBUG_COUNTERS_WRITEV_MAX should be OPX_SDMA_HFI_MAX_IOVS_PER_WRITE + 1!\n"); + // Driver limit of the number of TIDs that can be used in a single SDMA request #define OPX_SDMA_MAX_TIDS_PER_REQUEST (1024) @@ -60,6 +62,7 @@ OPX_COMPILE_TIME_ASSERT((OPX_SDMA_HFI_MAX_IOVS_PER_WRITE + 1) == OPX_DEBUG_COUNT #define OPX_SDMA_MEMINFO_SIZE (136) #define OPX_SDMA_MEMINFO_SIZE_QWS (OPX_SDMA_MEMINFO_SIZE >> 3) + OPX_COMPILE_TIME_ASSERT((OPX_SDMA_MEMINFO_SIZE & 0x7) == 0, "OPX_SDMA_MEMINFO_SIZE must be a multiple of 8!"); #ifdef OPX_HMEM OPX_COMPILE_TIME_ASSERT(sizeof(struct sdma_req_meminfo) == OPX_SDMA_MEMINFO_SIZE, @@ -67,6 +70,7 @@ OPX_COMPILE_TIME_ASSERT(sizeof(struct sdma_req_meminfo) == OPX_SDMA_MEMINFO_SIZE #endif + static const uint16_t OPX_SDMA_REQ_SET_MEMINFO[2] = {0, #ifdef OPX_HMEM ((uint16_t) 1) << HFI1_SDMA_REQ_MEMINFO_SHIFT @@ -95,8 +99,7 @@ struct fi_opx_hfi1_sdma_header_vec { #endif } hmem; }; - - struct fi_opx_hfi1_txe_scb scb; + union opx_hfi1_txe_scb_union scb; }; static const size_t OPX_SDMA_REQ_INFO_OFFSET[2] = { @@ -130,7 +133,7 @@ struct opx_sdma_request { /* ==== CACHELINE 1 ==== */ struct iovec iovecs[OPX_SDMA_REQUEST_IOVS]; - struct fi_opx_hfi1_sdma_header_vec header_vec; // 72 bytes or 208 bytes (OPX_HMEM) + struct fi_opx_hfi1_sdma_header_vec header_vec; // 72 bytes 9B or 80 bytes 16B, plus 136 bytes (OPX_HMEM) }; OPX_COMPILE_TIME_ASSERT(offsetof(struct opx_sdma_request, iovecs) == FI_OPX_CACHE_LINE_SIZE, "Offset of opx_sdma_request->iovecs should start at cacheline 1!"); @@ -467,7 +470,7 @@ __OPX_FORCE_INLINE__ int opx_hfi1_sdma_enqueue_request(struct fi_opx_ep *opx_ep, void *requester, enum opx_sdma_comp_state *requester_comp_state, - struct fi_opx_hfi1_txe_scb *source_scb, + union opx_hfi1_txe_scb_union *source_scb, struct iovec *iovs, const uint16_t num_iovs, const uint16_t num_packets, @@ -510,13 +513,21 @@ int opx_hfi1_sdma_enqueue_request(struct fi_opx_ep *opx_ep, /* Set the Acknowledge Request Bit if we're only sending one packet */ uint64_t set_ack_bit = (num_packets == 1) ? 
(uint64_t)htonl(0x80000000) : 0; - request->header_vec.scb = *source_scb; - request->header_vec.scb.hdr.qw[2] |= ((uint64_t)kdeth << 32) | set_ack_bit; - request->header_vec.scb.hdr.qw[4] |= (last_packet_bytes << 32); - request->iovecs[0].iov_len = OPX_SDMA_REQ_HDR_SIZE[set_meminfo]; request->iovecs[0].iov_base = req_info; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + request->header_vec.scb.scb_9B = (source_scb->scb_9B); + request->header_vec.scb.scb_9B.hdr.qw_9B[2] |= ((uint64_t)kdeth << 32) | set_ack_bit; + request->header_vec.scb.scb_9B.hdr.qw_9B[4] |= (last_packet_bytes << 32); + request->iovecs[0].iov_len = OPX_SDMA_REQ_HDR_SIZE[set_meminfo]; + } else { + request->header_vec.scb.scb_16B = (source_scb->scb_16B); + request->header_vec.scb.scb_16B.hdr.qw_16B[3] |= ((uint64_t)kdeth << 32) | set_ack_bit; + request->header_vec.scb.scb_16B.hdr.qw_16B[5] |= (last_packet_bytes << 32); + request->iovecs[0].iov_len = OPX_SDMA_REQ_HDR_SIZE[set_meminfo] + 8; // extra QWORD in 16B LRH + } + for (int i = 0; i < num_iovs; ++i) { request->iovecs[i + 1] = iovs[i]; } @@ -550,7 +561,7 @@ int opx_hfi1_sdma_enqueue_replay(struct fi_opx_ep *opx_ep, FI_OPX_HFI1_SDMA_REQ_HEADER_EAGER_FIXEDBITS, replay->hmem_iface, replay->hmem_device, - 0, // last packet bytes unused for replays + replay->scb.scb_9B.hdr.dput.target.bytes, // last packet bytes 0 // kdeth tid info unused for replays ); } @@ -588,7 +599,12 @@ uint16_t opx_hfi1_sdma_register_replays(struct fi_opx_ep *opx_ep, uint32_t fragsize = 0; for (int i = 0; i < we->num_packets; ++i) { fragsize = MAX(fragsize, we->packets[i].length); - we->packets[i].replay->scb.hdr.qw[2] |= (uint64_t)htonl((uint32_t)psn); + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + we->packets[i].replay->scb.scb_9B.hdr.qw_9B[2] |= (uint64_t)htonl((uint32_t)psn); + } else { + we->packets[i].replay->scb.scb_16B.hdr.qw_16B[3] |= (uint64_t)htonl((uint32_t)psn); + } + we->packets[i].replay->sdma_we_use_count = we->bounce_buf.use_count; we->packets[i].replay->sdma_we = replay_back_ptr; we->packets[i].replay->hmem_iface = we->hmem.iface; diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 6dd6aea096f..344ab734e4b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -75,8 +75,8 @@ // Function for performing FI_INJECT_COMPLETIONs. 
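The SDMA request path above now branches on the wire format for every header touch: the KDETH/PSN quadword is qw_9B[2] for 9B headers but qw_16B[3] for 16B, and a 16B LRH adds one quadword to the header iovec. A minimal sketch of that union-plus-format dispatch, using assumed layouts rather than the OPX structs:

#include <stdint.h>
#include <stdio.h>

enum hdr_fmt { FMT_9B, FMT_16B };

/* Hypothetical header layouts: 16B carries one extra LRH quadword. */
union pkt_hdr {
	uint64_t qw_9B[7];
	uint64_t qw_16B[8];
};

/* Stamp a PSN into whichever quadword the wire format uses. */
static void stamp_psn(union pkt_hdr *hdr, enum hdr_fmt fmt, uint32_t psn)
{
	if (fmt == FMT_9B)
		hdr->qw_9B[2] |= psn;
	else
		hdr->qw_16B[3] |= psn;
}

/* First-iovec length: base header size, plus 8 bytes for the 16B LRH. */
static size_t hdr_iov_len(enum hdr_fmt fmt, size_t base)
{
	return (fmt == FMT_16B) ? base + 8 : base;
}

int main(void)
{
	union pkt_hdr hdr = { .qw_16B = { 0 } };
	stamp_psn(&hdr, FMT_16B, 0x42);
	printf("iov_len=%zu qw3=%llx\n", hdr_iov_len(FMT_16B, 64),
	       (unsigned long long)hdr.qw_16B[3]);
	return 0;
}

(The real code byte-swaps the PSN with htonl before OR-ing it in; the sketch omits that to keep the dispatch shape visible.)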
__OPX_FORCE_INLINE__ -void fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, - void *context, +ssize_t fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, + void *user_context, const size_t len, const int lock_required, const uint64_t tag, @@ -91,25 +91,35 @@ void fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, /* initialize the completion entry */ struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - assert(context); - assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ assert((caps & (FI_TAGGED | FI_MSG)) != (FI_TAGGED | FI_MSG)); - union fi_opx_context * opx_context = (union fi_opx_context *)context; - opx_context->flags = FI_SEND | (caps & (FI_TAGGED | FI_MSG)); - opx_context->len = len; - opx_context->buf = NULL; /* receive data buffer */ - opx_context->byte_counter = 0; - opx_context->tag = tag; - opx_context->next = NULL; + + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->flags = FI_SEND | (caps & (FI_TAGGED | FI_MSG)); + context->len = len; + context->buf = NULL; /* receive data buffer */ + context->byte_counter = 0; + context->tag = tag; + context->next = NULL; if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== TX CQ COMPLETION QUEUED\n"); - fi_opx_context_slist_insert_tail(opx_context, opx_ep->tx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->tx->cq_completed_ptr); + + return FI_SUCCESS; } // faster than memcpy() for this amount of data. -// DOES NOT SUPPORT SCB (PIO or UREG) COPY (SIM) -static inline void fi_opx_copy_cacheline(volatile uint64_t dest[8], uint64_t source[8]) +// DOES NOT SUPPORT SCB (PIO or UREG) (does not support SIM/BAR) +// Unstructured copy - for payloads or other memcpy replacement +__OPX_FORCE_INLINE__ +void fi_opx_copy_cacheline(uint64_t dest[8], uint64_t source[8]) { dest[0] = source[0]; dest[1] = source[1]; @@ -122,9 +132,65 @@ static inline void fi_opx_copy_cacheline(volatile uint64_t dest[8], uint64_t sou } +// faster than memcpy() for this amount of data. 
+// DOES NOT SUPPORT SCB (PIO or UREG) (does not support SIM/BAR) +// Structured copy - for headers +static inline void fi_opx_copy_hdr9B_cacheline(struct fi_opx_hfi1_txe_scb_9B * dest, const uint64_t *source) +{ + dest->qw0 = source[0]; + dest->hdr.qw_9B[0] = source[1]; + dest->hdr.qw_9B[1] = source[2]; + dest->hdr.qw_9B[2] = source[3]; + dest->hdr.qw_9B[3] = source[4]; + dest->hdr.qw_9B[4] = source[5]; + dest->hdr.qw_9B[5] = source[6]; + dest->hdr.qw_9B[6] = source[7]; +} + +static inline void fi_opx_copy_hdr16B_cacheline(struct fi_opx_hfi1_txe_scb_16B * dest, const uint64_t *source) +{ + dest->qw0 = source[0]; + dest->hdr.qw_16B[0] = source[1]; + dest->hdr.qw_16B[1] = source[2]; + dest->hdr.qw_16B[2] = source[3]; + dest->hdr.qw_16B[3] = source[4]; + dest->hdr.qw_16B[4] = source[5]; + dest->hdr.qw_16B[5] = source[6]; + dest->hdr.qw_16B[6] = source[7]; + dest->hdr.qw_16B[7] = source[8]; // cacheline + 1 spillover + dest->hdr.qw_16B[8] = 0UL; + dest->hdr.qw_16B[9] = 0UL; + dest->hdr.qw_16B[10] = 0UL; + dest->hdr.qw_16B[11] = 0UL; + dest->hdr.qw_16B[12] = 0UL; + dest->hdr.qw_16B[13] = 0UL; + dest->hdr.qw_16B[14] = 0UL; +} + + +static inline void fi_opx_copy_hdr16B_2cacheline(struct fi_opx_hfi1_txe_scb_16B * dest, const uint64_t *source) +{ + dest->qw0 = source[0]; + dest->hdr.qw_16B[0] = source[1]; + dest->hdr.qw_16B[1] = source[2]; + dest->hdr.qw_16B[2] = source[3]; + dest->hdr.qw_16B[3] = source[4]; + dest->hdr.qw_16B[4] = source[5]; + dest->hdr.qw_16B[5] = source[6]; + dest->hdr.qw_16B[6] = source[7]; + dest->hdr.qw_16B[7] = source[8]; + dest->hdr.qw_16B[8] = source[9]; + dest->hdr.qw_16B[9] = source[10]; + dest->hdr.qw_16B[10] = source[11]; + dest->hdr.qw_16B[11] = source[12]; + dest->hdr.qw_16B[12] = source[13]; + dest->hdr.qw_16B[13] = source[14]; + dest->hdr.qw_16B[14] = source[15]; +} + // faster than memcpy() for this amount of data. // SCB (PIO or UREG) COPY ONLY (STORE) -static inline void fi_opx_copy_scb(volatile uint64_t dest[8], uint64_t source[8]) +static inline void fi_opx_store_scb_qw(volatile uint64_t dest[8], const uint64_t source[8]) { OPX_HFI1_BAR_STORE(&dest[0], source[0]); OPX_HFI1_BAR_STORE(&dest[1], source[1]); @@ -136,10 +202,67 @@ static inline void fi_opx_copy_scb(volatile uint64_t dest[8], uint64_t source[8] OPX_HFI1_BAR_STORE(&dest[7], source[7]); } + +// Use this to fill out an SCB before the data is copied to local storage. +// (The local copy is usually used for setting up replay buffers or for log +// messages.) +static inline void fi_opx_store_and_copy_scb_9B(volatile uint64_t scb[8], + struct fi_opx_hfi1_txe_scb_9B *local, + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, uint64_t d6, uint64_t d7) +{ + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], d6); + OPX_HFI1_BAR_STORE(&scb[7], d7); + local->qw0 = d0; + local->hdr.qw_9B[0] = d1; + local->hdr.qw_9B[1] = d2; + local->hdr.qw_9B[2] = d3; + local->hdr.qw_9B[3] = d4; + local->hdr.qw_9B[4] = d5; + local->hdr.qw_9B[5] = d6; + local->hdr.qw_9B[6] = d7; +} + // Use this to fill out an SCB before the data is copied to local storage. // (The local copy is usually used for setting up replay buffers or for log // messages.) 
-static inline void fi_opx_set_scb(volatile uint64_t scb[8], uint64_t local[8], +static inline void fi_opx_store_and_copy_scb_16B(volatile uint64_t scb[8], + struct fi_opx_hfi1_txe_scb_16B *local, + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, uint64_t d6, uint64_t d7) +{ + + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], d6); + OPX_HFI1_BAR_STORE(&scb[7], d7); + + local->qw0 = d0; + local->hdr.qw_16B[0] = d1; + local->hdr.qw_16B[1] = d2; + local->hdr.qw_16B[2] = d3; + local->hdr.qw_16B[3] = d4; + local->hdr.qw_16B[4] = d5; + local->hdr.qw_16B[5] = d6; + local->hdr.qw_16B[6] = d7; + +} +// Use this to fill out a payload before the data is copied to local storage. +// (The local copy is usually used for setting up replay buffers or for log +// messages.) +// +// Common to 9B/16B for temporary local storage (generic QW[] scb's) +static inline void fi_opx_store_and_copy_qw(volatile uint64_t scb[8], uint64_t local[8], uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, uint64_t d4, uint64_t d5, uint64_t d6, uint64_t d7) { @@ -193,14 +316,97 @@ void fi_opx_duff_copy(char *to, const char *from, int64_t len) { } } +// Use this to fill out an SCB before the data is copied to local storage. +// (The local copy is usually used for setting up replay buffers or for log +// messages.) +// +// Different from fi_opx_store_and_copy_qw because it moves < 1 QW of data +// into the correct qw for 9B headers +static inline void fi_opx_store_and_copy_qw_9B(volatile uint64_t scb[8], + uint64_t local[8], + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, const void *buf, size_t len, uint64_t d7) +{ + // less than a qw to store + local[6] = 0; + // the purpose of this is to quickly copy the contents of buf into + // the 6th DWORD of the SCB and the local copy. + if (len > 7) { + fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); + abort(); + } else if (len > 0) { + fi_opx_duff_copy((char*)&local[6], buf, len); + } + + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], local[6]); + OPX_HFI1_BAR_STORE(&scb[7], d7); + local[0] = d0; + local[1] = d1; + local[2] = d2; + local[3] = d3; + local[4] = d4; + local[5] = d5; +// local[6] = d6; + local[7] = d7; +} + // Use this to fill out an SCB before the data is copied to local storage. // (The local copy is usually used for setting up replay buffers or for log // messages.) // -// This version embeds up to 16 bytes of immediate data into the SCB. +// Different from fi_opx_store_and_copy_qw because it moves < 1 QW of data +// into the correct qw for 16B headers +static inline void fi_opx_store_and_copy_qw_16B(volatile uint64_t scb[8], + uint64_t local[8], + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, uint64_t d6, const void *buf, size_t len) +{ + // less than a qw to store + local[7] = 0; + // the purpose of this is to quickly copy the contents of buf into + // the 7th DWORD of the SCB and the local copy. 
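fi_opx_duff_copy(), used below for these sub-quadword tails, is a switch-based unrolled byte copy in the spirit of Duff's device; a generic sketch of the technique (not the OPX implementation, which also handles longer lengths):

/* Jump into the unrolled chain and fall through: copies len (0..7) bytes. */
static void duff_copy7(char *to, const char *from, int len)
{
	switch (len) {
	case 7: to[6] = from[6]; /* fall through */
	case 6: to[5] = from[5]; /* fall through */
	case 5: to[4] = from[4]; /* fall through */
	case 4: to[3] = from[3]; /* fall through */
	case 3: to[2] = from[2]; /* fall through */
	case 2: to[1] = from[1]; /* fall through */
	case 1: to[0] = from[0]; break;
	default: break;
	}
}

For tiny, variable lengths the compiler turns this into a handful of byte moves with no libc call overhead, which is the point of preferring it over memcpy() here.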
+ if (len > 7) { + fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); + abort(); + } else if (len > 0) { + fi_opx_duff_copy((char*)&local[7], buf, len); + } + + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], d6); + OPX_HFI1_BAR_STORE(&scb[7], local[7]); + local[0] = d0; + local[1] = d1; + local[2] = d2; + local[3] = d3; + local[4] = d4; + local[5] = d5; + local[6] = d6; +// local[7] = d7; +} + +// Use this to fill out a PIO SOP SCB before the data is copied to local +// storage. (The local copy is usually used for setting up replay buffers +// or for log messages.) +// +// These versions embed up to 16 bytes of immediate data into the SCB. +// Header only - no additional payload expected - +// 9B is one call/one cacheline __OPX_FORCE_INLINE__ -void fi_opx_set_scb_special(volatile uint64_t scb[8], uint64_t local[8], +void fi_opx_store_inject_and_copy_scb_9B(volatile uint64_t scb[8], + uint64_t *local, uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, uint64_t d4, const void *buf, size_t len, uint64_t d7) { @@ -306,39 +512,157 @@ void fi_opx_set_scb_special(volatile uint64_t scb[8], uint64_t local[8], local[7] = d7; } -// Use this to fill out an SCB before the data is copied to local storage. -// (The local copy is usually used for setting up replay buffers or for log -// messages.) -static inline void fi_opx_set_scb_special2(volatile uint64_t scb[8], uint64_t local[8], - uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, - uint64_t d4, uint64_t d5, const void *buf, size_t len, uint64_t d7) +// Use this to fill out a PIO SOP SCB before the data is copied to local +// storage. (The local copy is usually used for setting up replay buffers +// or for log messages.) +// +// These versions embed up to 16 bytes of immediate data into the SCB. +// Header only - no additional payload expected - +// 16B is two calls/two cachelines, second cacheline is padded out +__OPX_FORCE_INLINE__ +void fi_opx_store_inject_and_copy_scb_16B(volatile uint64_t scb[8], + uint64_t *local, + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, const void *buf, size_t len) { - local[6] = 0; - memcpy((void*)&local[6], buf, len); - OPX_HFI1_BAR_STORE(&scb[0], d0); - OPX_HFI1_BAR_STORE(&scb[1], d1); - OPX_HFI1_BAR_STORE(&scb[2], d2); - OPX_HFI1_BAR_STORE(&scb[3], d3); - OPX_HFI1_BAR_STORE(&scb[4], d4); - OPX_HFI1_BAR_STORE(&scb[5], d5); - OPX_HFI1_BAR_STORE(&scb[6], local[6]); - OPX_HFI1_BAR_STORE(&scb[7], d7); + // the purpose of this is to quickly copy the contents of buf into + // the 5th and 6th DWORDs of the SCB and the local copy.
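The length switch that follows is, in effect, "zero two quadwords, then copy up to 16 immediate bytes over them", unrolled per length so the fast path never calls into libc; a minimal equivalent under that assumption:

#include <stdint.h>
#include <string.h>

/* Assumed-equivalent form: zero local[6..7], then overlay len (<= 16)
 * immediate bytes starting at local[6]. */
static void set_immediate_tail(uint64_t local[8], const void *buf, size_t len)
{
	local[6] = 0;
	local[7] = 0;
	if (len)
		memcpy(&local[6], buf, len);
}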
+ switch (len) { + case 0: + local[6] = 0; + local[7] = 0; + break; + case 1: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 1); + local[7] = 0; + break; + case 2: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 2); + local[7] = 0; + break; + case 3: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 3); + local[7] = 0; + break; + case 4: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 4); + local[7] = 0; + break; + case 5: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 5); + local[7] = 0; + break; + case 6: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 6); + local[7] = 0; + break; + case 7: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 7); + local[7] = 0; + break; + case 8: + local[6] = *((uint64_t*)buf); + local[7] = 0; + break; + case 9: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 9); + break; + case 10: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 10); + break; + case 11: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 11); + break; + case 12: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 12); + break; + case 13: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 13); + break; + case 14: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 14); + break; + case 15: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 15); + break; + case 16: + local[6] = *((uint64_t*)buf); + local[7] = *((uint64_t*)buf+1); + break; + default: + fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); + break; + } + + //OPX_DEBUG_PRINT_PBC_HDR_QW(d0, d1, d2, d3, d4, OPX_HFI1_JKR); + + // 1st cacheline PIO SOP + OPX_HFI1_BAR_STORE(&scb[0], d0); //pbc + OPX_HFI1_BAR_STORE(&scb[1], d1); //lrh + OPX_HFI1_BAR_STORE(&scb[2], d2); //lrh + OPX_HFI1_BAR_STORE(&scb[3], d3); //bth + OPX_HFI1_BAR_STORE(&scb[4], d4); //bth + kdeth + OPX_HFI1_BAR_STORE(&scb[5], d5); //kdeth + OPX_HFI1_BAR_STORE(&scb[6], local[6]); //data 1 + OPX_HFI1_BAR_STORE(&scb[7], local[7]); //data 2 + local[0] = d0; local[1] = d1; local[2] = d2; local[3] = d3; local[4] = d4; local[5] = d5; - // local[6] = d6; - local[7] = d7; + // local[6] + // local[7] +} + +__OPX_FORCE_INLINE__ +void fi_opx_store_inject_and_copy_scb2_16B(volatile uint64_t scb[8], + uint64_t *local, uint64_t d8) +{ + // 2nd cacheline PIO (only) padded out + + OPX_HFI1_BAR_STORE(&scb[0], d8); // tag + OPX_HFI1_BAR_STORE(&scb[1], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[2], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[3], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[4], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[5], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[6], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[7], OPX_JKR_16B_PAD_QWORD); + + local[8] = d8; } +void fi_opx_hfi1_rx_rzv_rts_etrunc (struct fi_opx_ep *opx_ep, + const union opx_hfi1_packet_hdr * const hdr, + const uint8_t u8_rx, + uintptr_t origin_byte_counter_vaddr, + const unsigned is_intranode, + const enum ofi_reliability_kind reliability, + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type); + void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, - const void * const hdr, const void * const payload, + const union opx_hfi1_packet_hdr * const hdr, const void * const payload, const uint8_t u8_rx, const uint64_t niov, uintptr_t origin_byte_counter_vaddr, - union fi_opx_context *const target_context, + struct opx_context *const target_context, const uintptr_t dst_vaddr, const enum fi_hmem_iface dst_iface, const 
uint64_t dst_device, @@ -348,11 +672,12 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, uint8_t opcode, const unsigned is_intranode, const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx); + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type); union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ep, struct fi_opx_mr * opx_mr, - const void * const hdr, const void * const payload, + const union opx_hfi1_packet_hdr * const hdr, const void * const payload, size_t payload_bytes_to_copy, const uint8_t u8_rx, const uint8_t origin_rs, const uint32_t niov, @@ -366,7 +691,8 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx void (*completion_action)(union fi_opx_hfi1_deferred_work * work_state), const unsigned is_intranode, const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type ); union fi_opx_hfi1_deferred_work; @@ -437,7 +763,6 @@ struct fi_opx_hfi1_dput_params { }; }; }; - OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_hfi1_dput_params, compare_iov) & 7) == 0, "compare_iov not 8-byte aligned!"); struct fi_opx_hfi1_rx_rzv_rts_params { @@ -543,9 +868,10 @@ union fi_opx_hfi1_deferred_work { int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work); void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint8_t u8_rx, - const uint32_t u32_extended_rx); + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type); int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work *work); int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work *work); @@ -593,17 +919,18 @@ __OPX_FORCE_INLINE__ void fi_opx_force_credit_return(struct fid_ep *ep, fi_addr_t dest_addr, const uint64_t dest_rx, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t pbc_dws = 16; - const uint16_t lrh_dws = htons(pbc_dws-1); + const uint64_t pbc_dws = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 16 : 20; + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ - const uint64_t force_credit_return = OPX_PBC_CR(0x1); + const uint64_t force_credit_return = OPX_PBC_CR(0x1, hfi1_type); /* * Write the 'start of packet' (hw+sw header) 'send control block' @@ -620,33 +947,72 @@ void fi_opx_force_credit_return(struct fid_ep *ep, * credits will be returned soon naturally anyway, and sending a no-op packet * forcing a credit return would just add unnecessary traffic. */ + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 
1 : 2; + uint64_t available_credits = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1); - while (OFI_UNLIKELY(available_credits < 1)) { + while (OFI_UNLIKELY(available_credits < credits_needed)) { if (loop++ & 0x10) { opx_ep->tx->pio_state->qw0 = pio_state.qw0; return; } FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); - available_credits = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1); + available_credits = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed); } volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; /* WHY BOTHER? TODO: REMOVE */ - fi_opx_set_scb(scb, tmp, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | ((uint64_t)FI_OPX_HFI_UD_OPCODE_RELIABILITY_NOOP << 48) - | (uint64_t)FI_OPX_HFI_BTH_OPCODE_UD, - opx_ep->tx->send.hdr.qw[2], - opx_ep->tx->send.hdr.qw[3], - opx_ep->tx->send.hdr.qw[4], - 0, 0); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | ((uint64_t)FI_OPX_HFI_UD_OPCODE_RELIABILITY_NOOP << 48) + | (uint64_t)FI_OPX_HFI_BTH_OPCODE_UD, + opx_ep->tx->send_9B.hdr.qw_9B[2], + opx_ep->tx->send_9B.hdr.qw_9B[3], + opx_ep->tx->send_9B.hdr.qw_9B[4], + 0, 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | ((uint64_t)FI_OPX_HFI_UD_OPCODE_RELIABILITY_NOOP << 48) + | (uint64_t)FI_OPX_HFI_BTH_OPCODE_UD, + opx_ep->tx->send_16B.hdr.qw_16B[3], + opx_ep->tx->send_16B.hdr.qw_16B[4], + opx_ep->tx->send_16B.hdr.qw_16B[5], + 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + fi_opx_store_and_copy_qw(scb_payload, local_temp, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); opx_ep->tx->pio_state->qw0 = pio_state.qw0; FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); @@ -659,7 +1025,7 @@ uint64_t fi_opx_hfi1_tx_is_intranode(struct fi_opx_ep *opx_ep, const union fi_op the source lid is the same as the destination lid) */ return ((caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) == FI_LOCAL_COMM) || (((caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) == (FI_LOCAL_COMM | FI_REMOTE_COMM)) && - (fi_opx_hfi_is_intranode(addr.uid.lid))); + 
(opx_lid_is_intranode(addr.uid.lid))); } __OPX_FORCE_INLINE__ @@ -667,15 +1033,18 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, const uint32_t data, int lock_required, const uint64_t dest_rx, + uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; const uint64_t bth_rx = dest_rx << 56; - const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(addr.fi); + const uint64_t lrh_dlid_9B = FI_OPX_ADDR_TO_HFI1_LRH_DLID(addr.fi); + uint32_t dlid = htons(lrh_dlid_9B >> 16); if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -683,7 +1052,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-INJECT-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); @@ -691,34 +1060,54 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, if (!hdr) return rc; #ifdef OPX_HMEM - uint64_t hmem_device; - enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); - - if (iface != FI_HMEM_SYSTEM) { - opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, - buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); - buf = opx_ep->hmem_copy_buf; - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode - .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.inject); + if (buf && len) { + uint64_t hmem_device; + enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); + + if (iface != FI_HMEM_SYSTEM) { + opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.inject); + } } #endif - hdr->qw[0] = opx_ep->tx->inject.hdr.qw[0] | lrh_dlid; - hdr->qw[1] = opx_ep->tx->inject.hdr.qw[1] | bth_rx | (len << 48) | - ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->tx->inject_9B.hdr.qw_9B[0] | lrh_dlid_9B; - hdr->qw[2] = opx_ep->tx->inject.hdr.qw[2]; - - hdr->qw[3] = opx_ep->tx->inject.hdr.qw[3] | (((uint64_t)data) << 32); - - hdr->qw[4] = 0; - hdr->qw[5] = 0; - fi_opx_hfi1_memcpy8((void*)&hdr->qw[4], buf, len); - - hdr->qw[6] = tag; + hdr->qw_9B[1] = opx_ep->tx->inject_9B.hdr.qw_9B[1] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)); + hdr->qw_9B[2] = opx_ep->tx->inject_9B.hdr.qw_9B[2]; + + hdr->qw_9B[3] = opx_ep->tx->inject_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + + hdr->qw_9B[4] = 0; + hdr->qw_9B[5] = 0; + fi_opx_hfi1_memcpy8((void*)&hdr->qw_9B[4], buf, len); + + hdr->qw_9B[6] = tag; + } else { + hdr->qw_16B[0] = opx_ep->tx->inject_16B.hdr.qw_16B[0] | + ((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B); + hdr->qw_16B[1] = opx_ep->tx->inject_16B.hdr.qw_16B[1] | + (((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->inject_16B.hdr.qw_16B[2] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)); + + hdr->qw_16B[3] = opx_ep->tx->inject_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->inject_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + hdr->qw_16B[5] = 0; + hdr->qw_16B[6] = 0; + fi_opx_hfi1_memcpy8((void*)&hdr->qw_16B[5], buf, len); + hdr->qw_16B[7] = tag; + } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); @@ -736,10 +1125,13 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1)) { + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < + credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; - if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1) { + + if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < credits_needed) { return -FI_EAGAIN; } } @@ -749,7 +1141,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, int64_t psn; psn = fi_opx_reliability_get_replay(ep, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-INJECT-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); @@ -759,58 +1151,84 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } #ifdef OPX_HMEM - uint64_t hmem_device; - enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); + if (buf && len) { + uint64_t hmem_device; + enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); - if (iface != FI_HMEM_SYSTEM) { - opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, - buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); - buf = opx_ep->hmem_copy_buf; - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi - .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.inject); + if (iface != FI_HMEM_SYSTEM) { + opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.inject); + } } #endif volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8] = {0}; + uint64_t local_temp[16] = {0}; - fi_opx_set_scb_special(scb, tmp, - opx_ep->tx->inject.qw0 | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->inject.hdr.qw[0] | lrh_dlid, + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_inject_and_copy_scb_9B(scb, local_temp, + opx_ep->tx->inject_9B.qw0 | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid_9B, hfi1_type), + opx_ep->tx->inject_9B.hdr.qw_9B[0] | lrh_dlid_9B, - opx_ep->tx->inject.hdr.qw[1] | bth_rx | (len << 48) | - ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT), + opx_ep->tx->inject_9B.hdr.qw_9B[1] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)), - opx_ep->tx->inject.hdr.qw[2] | psn, - opx_ep->tx->inject.hdr.qw[3] | (((uint64_t)data) << 32), - buf, len, tag); + opx_ep->tx->inject_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->inject_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + buf, len, tag); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } else { + // 1st cacheline + fi_opx_store_inject_and_copy_scb_16B(scb, local_temp, + opx_ep->tx->inject_16B.qw0 | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid_9B, hfi1_type), + opx_ep->tx->inject_16B.hdr.qw_16B[0] | ((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B), + opx_ep->tx->inject_16B.hdr.qw_16B[1] | (((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + opx_ep->tx->inject_16B.hdr.qw_16B[2] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)), + + opx_ep->tx->inject_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->inject_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + buf, len ); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - /* consume one credit */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + // 2nd cacheline + volatile uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + fi_opx_store_inject_and_copy_scb2_16B(scb2, local_temp, tag ); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); /* save the updated txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; - replay->scb.qw0 = tmp[0]; - replay->scb.hdr.qw[0] = tmp[1]; - replay->scb.hdr.qw[1] = tmp[2]; - replay->scb.hdr.qw[2] = tmp[3]; - replay->scb.hdr.qw[3] = tmp[4]; - replay->scb.hdr.qw[4] = tmp[5]; - replay->scb.hdr.qw[5] = tmp[6]; - replay->scb.hdr.qw[6] = tmp[7]; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); + } else { + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_temp); + } - fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.uid.lid, addr.reliability_rx, dest_rx, psn_ptr, replay, reliability); + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-INJECT-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -879,7 +1297,8 @@ bool fi_opx_hfi1_fill_from_iov8(const struct iovec *iov, /* In: iovec array * return false; } -static inline void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required); +static inline void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, + const enum opx_hfi1_type hfi1_type); __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_check_credits(struct fi_opx_ep *opx_ep, @@ -908,33 +1327,162 @@ ssize_t fi_opx_hfi1_tx_check_credits(struct fi_opx_ep *opx_ep, return (ssize_t) total_credits_available; } + __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, size_t niov, - size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context, const uint32_t data, int lock_required, - const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, - const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, - const uint64_t hmem_device) +ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, + const struct iovec *iov, size_t niov, + const uint16_t lrh_dws, + const uint64_t lrh_dlid, + const uint64_t bth_rx, + size_t total_len, + const size_t payload_qws_total, + const size_t xfer_bytes_tail, + void *desc, + const union fi_opx_addr *addr, + uint64_t tag, + void *context, + const uint32_t data, + int lock_required, + const uint64_t dest_rx, + uint64_t tx_op_flags, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { - assert(lock_required == 0); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - const union fi_opx_addr addr = { .fi = dest_addr }; - const size_t xfer_bytes_tail = total_len & 0x07ul; - const size_t payload_qws_total = total_len >> 
3; - const size_t payload_qws_tail = payload_qws_total & 0x07ul; - - const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; - const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - uint16_t full_block_credits_needed = (total_len >> 6); - uint16_t total_credits_needed = 1 + /* packet header */ - full_block_credits_needed; /* full blocks */ - if(payload_qws_tail || xfer_bytes_tail) { - total_credits_needed += 1; - } + struct iovec *iov_ptr = (struct iovec *) iov; + size_t *niov_ptr = &niov; + + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, SHM -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( + &opx_ep->tx->shm, addr->hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, + opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); + + if (!hdr) return rc; + +#ifdef OPX_HMEM + /* Note: This code is duplicated in the internode and intranode + paths at points in the code where we know we'll be able to + proceed with the send, so that we don't waste cycles doing + this, only to EAGAIN because we couldn't get a SHM packet + or credits/replay/psn */ + size_t hmem_niov = 1; + struct iovec hmem_iov; + + /* If the IOVs are GPU-resident, copy all their data to the HMEM + bounce buffer, and then proceed as if we only have a single IOV + that points to the bounce buffer. */ + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + unsigned iov_total_len = 0; + for (int i = 0; i < niov; ++i) { + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, + &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + iov_total_len += iov[i].iov_len; + } + + hmem_iov.iov_base = opx_ep->hmem_copy_buf; + hmem_iov.iov_len = iov_total_len; + iov_ptr = &hmem_iov; + niov_ptr = &hmem_niov; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager_noncontig); + } +#endif + hdr->qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); + hdr->qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); + /* Fill QW 5 from the iovec */ + uint8_t *buf = (uint8_t *)&hdr->qw_9B[5]; + ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; + + if (xfer_bytes_tail) { + ssize_t tail_len = xfer_bytes_tail; + remain = total_len - tail_len; + while (false == fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(tail_len == 0); + } + hdr->qw_9B[6] = tag; + + union fi_opx_hfi1_packet_payload *const payload = + (union fi_opx_hfi1_packet_payload *)(hdr + 1); + + buf = payload->byte; + while (false == fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &remain, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(remain == 0); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); + fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); + + if (OFI_LIKELY(do_cq_completion)) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, SHM -- EAGER (end)\n"); + return rc; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, size_t niov, + size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + const size_t xfer_bytes_tail = total_len & 0x07ul; + const size_t payload_qws_total = total_len >> 3; + const size_t payload_qws_tail = payload_qws_total & 0x07ul; + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + uint16_t full_block_credits_needed = (total_len >> 6); + uint16_t total_credits_needed = 1 + /* packet header */ + full_block_credits_needed; /* full blocks */ + + if(payload_qws_tail || xfer_bytes_tail) { + total_credits_needed += 1; + } const uint64_t pbc_dws = 2 + /* pbc */ 2 + /* lhr */ @@ -942,122 +1490,368 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ ((total_credits_needed-1) << 4); - /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - const uint16_t lrh_dws = htons(pbc_dws - 1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH 
DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ struct iovec *iov_ptr = (struct iovec *) iov; size_t *niov_ptr = &niov; if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { - FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- EAGER (begin)\n"); - OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-SHM"); - uint64_t pos; - ssize_t rc; - union fi_opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( - &opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, - opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); + return fi_opx_hfi1_tx_sendv_egr_intranode(ep, + iov, niov, + lrh_dws, + lrh_dlid, + bth_rx, + total_len, + payload_qws_total, + xfer_bytes_tail, + desc, + &addr, + tag, + context, + data, + lock_required, + dest_rx, + tx_op_flags, + caps, + do_cq_completion, + iface, + hmem_device, + hfi1_type); + } - if (!hdr) return rc; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, HFI -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-HFI"); + + // Even though we're using the reliability service to pack this buffer + // we still want to make sure it will have enough credits available to send + // and allow the user to poll and quiesce the fabric some + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < 0)) { + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(ep, &opx_ep->reliability->state, addr.uid.lid, + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if(OFI_UNLIKELY(psn == -1)) { + return -FI_EAGAIN; + } #ifdef OPX_HMEM - /* Note: This code is duplicated in the internode and intranode + size_t hmem_niov = 1; + struct iovec hmem_iov; + + /* If the IOVs are GPU-resident, copy all their data to the HMEM + bounce buffer, and then proceed as if we only have a single IOV + that points to the bounce buffer. */ + if (iface != FI_HMEM_SYSTEM) { + unsigned iov_total_len = 0; + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + for (int i = 0; i < niov; ++i) { + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + iov_total_len += iov[i].iov_len; + } + + hmem_iov.iov_base = opx_ep->hmem_copy_buf; + hmem_iov.iov_len = iov_total_len; + iov_ptr = &hmem_iov; + niov_ptr = &hmem_niov; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager_noncontig); + } +#endif + ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; + + OPX_NO_16B_SUPPORT(hfi1_type); + + replay->scb.scb_9B.qw0 = opx_ep->tx->send_9B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + replay->scb.scb_9B.hdr.qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + replay->scb.scb_9B.hdr.qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); + replay->scb.scb_9B.hdr.qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2] | psn; + replay->scb.scb_9B.hdr.qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + replay->scb.scb_9B.hdr.qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); + if (xfer_bytes_tail) { + ssize_t tail_len = xfer_bytes_tail; + remain = total_len - tail_len; + while (false == fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + &replay->scb.scb_9B.hdr.qw_9B[5], /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(tail_len == 0); + } + replay->scb.scb_9B.hdr.qw_9B[6] = tag; + + remain = total_len - xfer_bytes_tail; + uint64_t *payload = replay->payload; + while (false == fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + payload, /* In: target buffer to fill */ + &remain, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + + fi_opx_reliability_client_replay_register_no_update( + &opx_ep->reliability->state, addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); + + fi_opx_reliability_service_do_replay(&opx_ep->reliability->service, replay); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + ssize_t rc; + if (OFI_LIKELY(do_cq_completion)) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-HFI"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, HFI -- EAGER (end)\n"); + + return rc; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, + const struct iovec *iov, size_t niov, + const uint16_t lrh_qws, + const uint64_t lrh_dlid, + const uint64_t bth_rx, + size_t total_len, + const size_t payload_qws_total, + const size_t xfer_bytes_tail, + void *desc, + const union fi_opx_addr *addr, + uint64_t tag, + void *context, + const uint32_t data, + int lock_required, + const uint64_t dest_rx, + uint64_t tx_op_flags, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + + struct iovec *iov_ptr = (struct iovec *) iov; + size_t *niov_ptr = &niov; + + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV 16B, SHM -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( + &opx_ep->tx->shm, addr->hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, + opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); + + if (!hdr) return rc; + +#ifdef OPX_HMEM + /* Note: This code is duplicated in the internode and intranode paths at points in the code where we know we'll be able to proceed with the send, so that we don't 
waste cycles doing this, only to EAGAIN because we couldn't get a SHM packet or credits/replay/psn */ - size_t hmem_niov = 1; - struct iovec hmem_iov; + size_t hmem_niov = 1; + struct iovec hmem_iov; - /* If the IOVs are GPU-resident, copy all their data to the HMEM + /* If the IOVs are GPU-resident, copy all their data to the HMEM bounce buffer, and then proceed as if we only have a single IOV that points to the bounce buffer. */ - if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; - unsigned iov_total_len = 0; - for (int i = 0; i < niov; ++i) { - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, - &opx_ep->hmem_copy_buf[iov_total_len], - iov[i].iov_base, iov[i].iov_len, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); - iov_total_len += iov[i].iov_len; - } - - hmem_iov.iov_base = opx_ep->hmem_copy_buf; - hmem_iov.iov_len = iov_total_len; - iov_ptr = &hmem_iov; - niov_ptr = &hmem_niov; - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode - .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.eager_noncontig); - } -#endif - hdr->qw[0] = opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw[1] = opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); - hdr->qw[2] = opx_ep->tx->send.hdr.qw[2]; - hdr->qw[3] = opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32); - hdr->qw[4] = opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48); - - /* Fill QW 5 from the iovec */ - uint8_t *buf = (uint8_t *)&hdr->qw[5]; - ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; - - if (xfer_bytes_tail) { - ssize_t tail_len = xfer_bytes_tail; - remain = total_len - tail_len; - while (false == - fi_opx_hfi1_fill_from_iov8( - iov_ptr, /* In: iovec array */ - *niov_ptr, /* In: total iovecs */ - buf, /* In: target buffer to fill */ - &tail_len, /* In/Out: buffer length to fill */ - &iov_idx, /* In/Out: start index, returns end */ - &iov_base_offset)) { /* In/Out: start offset, returns offset */ - // copy until done; - } - assert(tail_len == 0); + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + unsigned iov_total_len = 0; + for (int i = 0; i < niov; ++i) { + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, + &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + iov_total_len += iov[i].iov_len; } - hdr->qw[6] = tag; - union fi_opx_hfi1_packet_payload *const payload = - (union fi_opx_hfi1_packet_payload *)(hdr + 1); + hmem_iov.iov_base = opx_ep->hmem_copy_buf; + hmem_iov.iov_len = iov_total_len; + iov_ptr = &hmem_iov; + niov_ptr = &hmem_niov; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager_noncontig); + } +#endif + hdr->qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20); + hdr->qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); + hdr->qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); + + /* Fill QW 6 from the iovec */ + uint8_t *buf = (uint8_t *)&hdr->qw_16B[6]; + ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; - buf = payload->byte; + if (xfer_bytes_tail) { + ssize_t tail_len = xfer_bytes_tail; + remain = total_len - tail_len; while (false == fi_opx_hfi1_fill_from_iov8( - iov_ptr, /* In: iovec array */ - *niov_ptr, /* In: total iovecs */ - buf, /* In: target buffer to fill */ - &remain, /* In/Out: buffer length to fill */ - &iov_idx, /* In/Out: start index, returns end */ - &iov_base_offset)) { /* In/Out: start offset, returns offset */ + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ // copy until done; } - assert(remain == 0); - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); - fi_opx_shm_poll_many(&opx_ep->ep_fid, 0); + assert(tail_len == 0); + } + hdr->qw_16B[7] = tag; - if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, - lock_required, tag, caps); - } + union fi_opx_hfi1_packet_payload *const payload = + (union fi_opx_hfi1_packet_payload *)(hdr + 1); - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); - FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- EAGER (end)\n"); - return FI_SUCCESS; + buf = payload->byte; + while (false == + fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &remain, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(remain == 0); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); + fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); + + if (OFI_LIKELY(do_cq_completion)) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV 16B, SHM -- EAGER (end)\n"); + return rc; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, + size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + const size_t xfer_bytes_tail = total_len & 0x07ul; + 
const size_t payload_qws_total = total_len >> 3; + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + const uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + /* 16B PBC is dws */ + const uint64_t pbc_dws = + /* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth */ + + /* PIO is everything else */ + 2 + /* kdeth9 remaining 2 dws */ + //--------------------- header split point KDETH 9 DWS + (payload_qws_total << 1) + /* one packet payload */ + 2 ; /* ICRC/tail 1 qws/2 dws */ + + /* Descriptive code above, but for reference most code just has: */ + /* 9 + kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + /* 2; ICRC/tail */ + + + const uint16_t total_credits_needed = (pbc_dws + 15 ) >> 4; /* round up to full blocks */ + + /* 16B LRH is qws */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + + struct iovec *iov_ptr = (struct iovec *) iov; + size_t *niov_ptr = &niov; + + if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + return fi_opx_hfi1_tx_sendv_egr_intranode_16B(ep, + iov, niov, + lrh_qws, + lrh_dlid_16B, + bth_rx, + total_len, + payload_qws_total, + xfer_bytes_tail, + desc, + &addr, + tag, + context, + data, + lock_required, + dest_rx, + tx_op_flags, + caps, + do_cq_completion, + iface, + hmem_device, + hfi1_type); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, HFI -- EAGER (begin)\n"); + "===================================== SENDV 16B, HFI -- EAGER (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-HFI"); // Even though we're using the reliability service to pack this buffer // we still want to make sure it will have enough credits available to send // and allow the user to poll and quiesce the fabric some union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); if (OFI_UNLIKELY(total_credits_available < 0)) { return -FI_ENOBUFS; @@ -1068,7 +1862,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz int64_t psn; psn = fi_opx_reliability_get_replay(ep, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } @@ -1101,17 +1895,22 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz #endif ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; - replay->scb.qw0 = opx_ep->tx->send.qw0 | - OPX_PBC_LEN(pbc_dws) | - OPX_PBC_CR(opx_ep->tx->force_credit_return) | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid); - replay->scb.hdr.qw[0] = opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - replay->scb.hdr.qw[1] = opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); - replay->scb.hdr.qw[2] = opx_ep->tx->send.hdr.qw[2] | psn; - replay->scb.hdr.qw[3] = opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32); - replay->scb.hdr.qw[4] = opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48); + OPX_NO_9B_SUPPORT(hfi1_type); + + replay->scb.scb_16B.qw0 = opx_ep->tx->send_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid; //OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid_16B, hfi1_type); + + replay->scb.scb_16B.hdr.qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); + replay->scb.scb_16B.hdr.qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] |((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + replay->scb.scb_16B.hdr.qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); + replay->scb.scb_16B.hdr.qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3] | psn; + replay->scb.scb_16B.hdr.qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + replay->scb.scb_16B.hdr.qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); if (xfer_bytes_tail) { ssize_t tail_len = xfer_bytes_tail; remain = total_len - tail_len; @@ -1119,7 +1918,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz fi_opx_hfi1_fill_from_iov8( iov_ptr, /* In: iovec array */ *niov_ptr, /* In: total iovecs */ - &replay->scb.hdr.qw[5], /* In: target buffer to fill */ + &replay->scb.scb_16B.hdr.qw_16B[6], /* In: target buffer to fill */ &tail_len, /* In/Out: buffer length to fill */ &iov_idx, /* In/Out: start index, returns end */ &iov_base_offset)) { /* In/Out: start offset, returns offset */ @@ -1127,7 +1926,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz } assert(tail_len == 0); } - replay->scb.hdr.qw[6] = tag; + replay->scb.scb_16B.hdr.qw_16B[7] = tag; remain = total_len - xfer_bytes_tail; uint64_t *payload = replay->payload; @@ -1143,41 +1942,101 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz } fi_opx_reliability_client_replay_register_no_update( - &opx_ep->reliability->state, addr.uid.lid, addr.reliability_rx, - dest_rx, psn_ptr, replay, reliability); - - if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, - lock_required, tag, caps); - } + &opx_ep->reliability->state, addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); fi_opx_reliability_service_do_replay(&opx_ep->reliability->service, replay); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + ssize_t rc; + if (OFI_LIKELY(do_cq_completion)) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, HFI -- EAGER (end)\n"); + "===================================== SENDV 16B, HFI -- EAGER (end)\n"); - - return FI_SUCCESS; 
+ return rc; } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, - const void *buf, - size_t len, - void *desc, - fi_addr_t dest_addr, - uint64_t tag, - void * context, - const uint32_t data, - int lock_required, - const uint64_t dest_rx, - const uint64_t caps, +ssize_t fi_opx_hfi1_tx_sendv_egr_select(struct fid_ep *ep, + const struct iovec *iov, size_t niov, size_t total_len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + return fi_opx_hfi1_tx_sendv_egr(ep, + iov, niov, total_len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_WFR); + } else if (hfi1_type & OPX_HFI1_JKR) { + return fi_opx_hfi1_tx_sendv_egr_16B(ep, + iov, niov, total_len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + return fi_opx_hfi1_tx_sendv_egr(ep, + iov, niov, total_len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR_9B); + } + abort(); + return (ssize_t)-1L; +} + + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, + const void *buf, + size_t len, + void *desc, + fi_addr_t dest_addr, + uint64_t tag, + void * context, + const uint32_t data, + int lock_required, + const uint64_t dest_rx, + uint64_t tx_op_flags, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; @@ -1195,14 +2054,14 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (payload_qws_total << 1); - const uint16_t lrh_dws = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, SHM -- EAGER (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-EAGER-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); @@ -1218,36 +2077,144 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, if (iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, - buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; 
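For context, the OPX_HMEM branch above stages a device-resident source buffer through the endpoint's host bounce buffer so the rest of the send path can treat it as ordinary host memory. A rough sketch of that staging pattern under stated assumptions: opx_copy_from_hmem is stubbed as a plain memcpy, and both helper names here are hypothetical, not the provider's actual API:

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Stand-in for opx_copy_from_hmem(): the real call dispatches to the
 * HMEM interface (e.g. CUDA or Level Zero) or a device-registered copy;
 * a host memcpy is used here purely for illustration. */
static void copy_from_hmem_stub(uint8_t *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
}

/* Stage a (notionally GPU-resident) source into the bounce buffer and
 * return the new send pointer; the caller then sends from host memory. */
static const void *stage_send_buffer(uint8_t *bounce_buf, const void *buf, size_t len)
{
	copy_from_hmem_stub(bounce_buf, buf, len);
	return bounce_buf;
}

The sendv variants earlier in this file apply the same idea to an iovec: each entry is copied into the bounce buffer back to back, and the array is then collapsed to a single iovec pointing at the staged copy.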
FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.eager); } #endif - hdr->qw[0] = opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw[1] = opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + hdr->qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); + hdr->qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); + + /* only if is_contiguous */ + if (OFI_LIKELY(len > 7)) { + /* safe to blindly qw-copy the first portion of the source buffer */ + hdr->qw_9B[5] = *((uint64_t *)buf); + } else { + hdr->qw_9B[5] = 0; + memcpy((void*)&hdr->qw_9B[5], buf, xfer_bytes_tail); + } + + hdr->qw_9B[6] = tag; + + union fi_opx_hfi1_packet_payload * const payload = + (union fi_opx_hfi1_packet_payload *)(hdr+1); + + memcpy((void*)payload->byte, + (const void *)((uintptr_t)buf + xfer_bytes_tail), + payload_qws_total * sizeof(uint64_t)); + + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); + + if (do_cq_completion) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-SHM"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, SHM -- EAGER (end)\n"); + + return rc; + +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, + const void *buf, + size_t len, + void *desc, + fi_addr_t dest_addr, + uint64_t tag, + void * context, + const uint32_t data, + int lock_required, + const uint64_t dest_rx, + uint64_t tx_op_flags, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device) +{ + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = htons(dest_addr >> 40); + + const size_t xfer_bytes_tail = len & 0x07ul; + const size_t payload_qws_total = len >> 3; - hdr->qw[2] = opx_ep->tx->send.hdr.qw[2]; + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + ((payload_qws_total) << 1) + + 2; /* ICRC/tail */ - hdr->qw[3] = opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32); + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ - hdr->qw[4] = opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, SHM -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-EAGER-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr * const hdr = + 
opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, + opx_ep->daos_info.rank_inst, &rc); + + if (!hdr) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-EAGER-SHM"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, SHM -- EAGER (end) - No packet available.\n"); + return rc; + } + +#ifdef OPX_HMEM + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager); + } +#endif + + hdr->qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); + hdr->qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); + hdr->qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); /* only if is_contiguous */ if (OFI_LIKELY(len > 7)) { /* safe to blindly qw-copy the first portion of the source buffer */ - hdr->qw[5] = *((uint64_t *)buf); + hdr->qw_16B[6] = *((uint64_t *)buf); } else { - hdr->qw[5] = 0; - memcpy((void*)&hdr->qw[5], buf, xfer_bytes_tail); + hdr->qw_16B[6] = 0; + memcpy((void*)&hdr->qw_16B[6], buf, xfer_bytes_tail); } - hdr->qw[6] = tag; + hdr->qw_16B[7] = tag; union fi_opx_hfi1_packet_payload * const payload = (union fi_opx_hfi1_packet_payload *)(hdr+1); @@ -1260,25 +2227,26 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); if (do_cq_completion) { - fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, - tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-SHM"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, SHM -- EAGER (end)\n"); + "===================================== SEND 16B, SHM -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; } __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, - const uint16_t lrh_dws, + const uint16_t lrh_packet_length, /* 9B dws, 16B qws and little/big-endian as required */ const uint64_t pbc_dlid, const uint64_t pbc_dws, const ssize_t len, @@ -1287,7 +2255,9 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, const uint32_t psn, const uint32_t data, const uint64_t 
tag, - const uint64_t caps) + uint64_t tx_op_flags, + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { /* * Write the 'start of packet' (hw+sw header) 'send control block' @@ -1298,46 +2268,130 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, /* only if is_contiguous */ if (OFI_LIKELY(len > 7)) { - /* safe to blindly qw-copy the first portion of the source buffer */ - fi_opx_set_scb(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48), - *((uint64_t *)buf), tag); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + /* safe to blindly qw-copy the first portion of the source buffer */ + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_packet_length << 32), + + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), + + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48), + *((uint64_t *)buf), tag); + + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_packet_length << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), + + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48), + *((uint64_t *)buf)); + } } else { - fi_opx_set_scb_special2(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw_9B(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_packet_length << 32), + + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), + + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48), + buf, xfer_bytes_tail, tag); + + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw_16B(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_packet_length << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48), + buf, + xfer_bytes_tail); - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? 
- (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), - - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48), - buf, xfer_bytes_tail, tag); + } } FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); return 1; // Consumed 1 credit + +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(struct fi_opx_ep *opx_ep, + union fi_opx_hfi1_pio_state *pio_state, + uint64_t *local_storage, + uint64_t *buf_qws, + const size_t hdr_and_payload_qws, + const uint64_t tag) +{ + assert(pio_state->credits_total - pio_state->scb_head_index); + assert(hdr_and_payload_qws <= 8); + + union fi_opx_hfi1_pio_state pio_local = *pio_state; + volatile uint64_t * scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_local); + + // spill from 1st cacheline (SOP) + OPX_HFI1_BAR_STORE(&scb_payload[0], tag); // header + local_storage[8] = tag; /* todo: pretty sure it's already there */ + + int i; + + for (i = 1; i < hdr_and_payload_qws ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], buf_qws[i-1]); + local_storage[8 + i] = buf_qws[i-1]; + } + if (hdr_and_payload_qws < 8) { /* less than a full block stored? pad it out */ + for (; i<8 ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD); + local_storage[8 + i] = OPX_JKR_16B_PAD_QWORD; + } + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + FI_OPX_HFI1_CONSUME_CREDITS(pio_local, 1); + pio_state->qw0 = pio_local.qw0; + return 1; + } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_egr_write_full_payload_blocks(struct fi_opx_ep *opx_ep, +ssize_t fi_opx_hfi1_tx_egr_store_full_payload_blocks(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, uint64_t *buf_qws, uint16_t full_block_credits_needed, @@ -1369,8 +2423,9 @@ ssize_t fi_opx_hfi1_tx_egr_write_full_payload_blocks(struct fi_opx_ep *opx_ep, OPX_HFI1_BAR_STORE(&scb_payload[5], buf_qws[5]); OPX_HFI1_BAR_STORE(&scb_payload[6], buf_qws[6]); OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]); - scb_payload += 8; - buf_qws += 8; + scb_payload += FI_OPX_CACHE_LINE_QWS; + buf_qws += FI_OPX_CACHE_LINE_QWS; + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); } FI_OPX_HFI1_CONSUME_CREDITS(pio_local, contiguous_full_blocks_to_write); @@ -1396,8 +2451,9 @@ ssize_t fi_opx_hfi1_tx_egr_write_full_payload_blocks(struct fi_opx_ep *opx_ep, OPX_HFI1_BAR_STORE(&scb_payload[5], buf_qws[5]); OPX_HFI1_BAR_STORE(&scb_payload[6], buf_qws[6]); OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]); - scb_payload += 8; - buf_qws += 8; + scb_payload += FI_OPX_CACHE_LINE_QWS; + buf_qws += FI_OPX_CACHE_LINE_QWS; + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); } FI_OPX_HFI1_CONSUME_CREDITS(pio_local, full_block_credits_needed); @@ -1410,20 +2466,25 @@ } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_egr_write_payload_tail(struct fi_opx_ep *opx_ep, +ssize_t fi_opx_hfi1_tx_egr_store_payload_tail(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, uint64_t *buf_qws, const size_t payload_qws_tail) { volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, *pio_state); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "buf_qws %p, payload_qws_tail %zu\n", + buf_qws, payload_qws_tail); + unsigned i = 0; + for (; i < payload_qws_tail; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], buf_qws[i]); + } + /* pad out the remainder of the block */ + for (; i < 8; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD); + } FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); return 1; /* Consumed 1 credit */ } @@ … @@ void fi_opx_hfi1_tx_send_egr_write_replay_data(struct fi_opx_ep *opx_ep, - replay->scb.qw0 = tmp[0]; - replay->scb.hdr.qw[0]
= tmp[1]; - replay->scb.hdr.qw[1] = tmp[2]; - replay->scb.hdr.qw[2] = tmp[3]; - replay->scb.hdr.qw[3] = tmp[4]; - replay->scb.hdr.qw[4] = tmp[5]; - replay->scb.hdr.qw[5] = tmp[6]; - replay->scb.hdr.qw[6] = tmp[7]; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_source); + else + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_source); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); uint64_t * payload = replay->payload; @@ -1457,9 +2516,9 @@ void fi_opx_hfi1_tx_send_egr_write_replay_data(struct fi_opx_ep *opx_ep, payload[i] = buf_qws[i]; } - fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.uid.lid, + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.reliability_rx, addr.hfi1_rx, psn_ptr, replay, - reliability); + reliability, hfi1_type); } __OPX_FORCE_INLINE__ @@ -1473,35 +2532,45 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; + OPX_NO_16B_SUPPORT(hfi1_type); + if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { return fi_opx_hfi1_tx_send_egr_intranode(ep, buf, len, desc, dest_addr, - tag, context, data, lock_required, dest_rx, caps, do_cq_completion, + tag, context, data, lock_required, dest_rx, tx_op_flags, caps, do_cq_completion, iface, hmem_device); } const size_t xfer_bytes_tail = len & 0x07ul; const size_t payload_qws_total = len >> 3; + const size_t payload_qws_tail = payload_qws_total & 0x07ul; - uint16_t full_block_credits_needed = (uint16_t)(payload_qws_total >> 3); + uint16_t full_block_credits_needed = (uint16_t)(payload_qws_total >> 3); const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + assert(hfi1_type != OPX_HFI1_JKR); + /* 9B PBC is dws */ const uint64_t pbc_dws = - 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (payload_qws_total << 1); + /* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - const uint16_t lrh_dws = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + /* PIO is everything else */ + (payload_qws_total << 1); /* one packet payload */ + + /* 9B LRH is dws */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ assert(lock_required == 0); @@ -1513,11 +2582,10 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - - const uint16_t total_credits_needed = - 1 + /* packet header */ - full_block_credits_needed + /* full payload blocks */ - (payload_qws_tail > 0); /* partial payload block */ + uint16_t total_credits_needed = + 1 + /* PIO SOP -- 1 credit */ + full_block_credits_needed + /* PIO full blocks -- payload */ + (payload_qws_tail > 0); /* PIO partial block -- 1 credit */ ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, 
&pio_state, total_credits_needed); if (OFI_UNLIKELY(total_credits_available < 0)) { @@ -1529,7 +2597,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, int32_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } @@ -1538,7 +2606,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, if (iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, - buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] @@ -1546,13 +2614,13 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, } #endif - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; #ifndef NDEBUG unsigned credits_consumed = #endif - fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, tmp, buf, bth_rx, lrh_dlid, + fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, lrh_dws, pbc_dlid, pbc_dws, len, xfer_bytes_tail, - payload_qws_total, psn, data, tag, caps); + payload_qws_total, psn, data, tag, tx_op_flags, caps, hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); @@ -1560,7 +2628,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, buf_qws, full_block_credits_needed, total_credits_available - 1); } @@ -1569,7 +2637,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_payload_tail(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, buf_qws + (full_block_credits_needed << 3), payload_qws_tail); } @@ -1584,18 +2652,252 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - xfer_bytes_tail, tmp, buf, payload_qws_total, reliability); + xfer_bytes_tail, local_temp, buf, payload_qws_total, reliability, hfi1_type); + + ssize_t rc; + if (OFI_LIKELY(do_cq_completion)) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-HFI"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, HFI -- EAGER (end)\n"); + + return rc; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, + const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void* context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + + OPX_NO_9B_SUPPORT(hfi1_type); + + 
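Before the intranode check that follows, it may help to see the 16B sizing arithmetic this function performs worked through once by hand. A standalone sketch; the 1032-byte send length is an illustrative assumption, and the dword counts follow the comments in the patch (one PIO credit covers one 64-byte block):

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

int main(void)
{
	const size_t payload_qws_total = 1032 >> 3; /* 129 payload qwords */
	const size_t kdeth9_qws_total  = 1;         /* trailing KDETH qword (non-SOP) */
	const size_t tail_qws_total    = 1;         /* ICRC/tail qword */

	/* pbc(2) + lrh(4) + bth(3) + kdeth(3) + software kdeth(4) dwords,
	 * then the remaining qwords expressed as dwords (x2 each) */
	const uint64_t pbc_dws = 2 + 4 + 3 + 3 + 4 +
				 (kdeth9_qws_total << 1) +
				 (payload_qws_total << 1) +
				 (tail_qws_total << 1);

	const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* LRH length excludes the pbc */

	const size_t pio_qws = kdeth9_qws_total + payload_qws_total + tail_qws_total;
	const uint16_t total_credits_needed =
		1 +                           /* PIO SOP block */
		(uint16_t)(pio_qws >> 3) +    /* full non-SOP blocks */
		((pio_qws & 0x07ul) ? 1 : 0); /* partial non-SOP block */

	assert(pbc_dws == 278 && lrh_qws == 138 && total_credits_needed == 18);
	return 0;
}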
if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + return fi_opx_hfi1_tx_send_egr_intranode_16B(ep, buf, len, desc, dest_addr, + tag, context, data, lock_required, dest_rx, tx_op_flags, caps, do_cq_completion, + iface, hmem_device); + } + + const size_t xfer_bytes_tail = len & 0x07ul; + const size_t payload_qws_total = len >> 3; + + /* 16B (RcvPktCtrl=9) has 1 QW of KDETH and 1 QW of tail in PIO (non-SOP) */ + const size_t kdeth9_qws_total = 1; + const size_t tail_qws_total = 1; + + + /* Full 64 byte/8 qword blocks -- 1 credit per block */ + uint16_t full_block_credits_needed = (uint16_t)((kdeth9_qws_total + payload_qws_total + tail_qws_total) >> 3); + /* Remaining tail qwords (< 8) after full blocks */ + size_t tail_partial_block_qws = (kdeth9_qws_total + payload_qws_total + tail_qws_total) & 0x07ul; + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + + assert(hfi1_type & OPX_HFI1_JKR); + /* 16B PBC is dws */ + const uint64_t pbc_dws = + /* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth */ + + /* PIO is everything else */ + (kdeth9_qws_total << 1) + /* kdeth9 remaining 2 dws */ + //--------------------- header split point KDETH 9 DWS + (payload_qws_total << 1) + /* one packet payload */ + (tail_qws_total << 1) ; /* tail 1 qws/2 dws */ + + /* 16B LRH is qws */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + + assert(lock_required == 0); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-EAGER-HFI"); + + /* first check for sufficient credits to inject the entire packet */ + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + uint16_t total_credits_needed = + 1 + /* PIO SOP -- 1 credit */ + full_block_credits_needed + /* PIO full blocks -- kdeth9/payload/tail */ + (tail_partial_block_qws > 0); /* PIO partial block -- 1 credit */ + + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < 0)) { + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int32_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + return -FI_EAGAIN; + } + +#ifdef OPX_HMEM + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager); + } +#endif + + uint64_t local_temp[16] = {0}; +#ifndef NDEBUG + unsigned credits_consumed = +#endif + fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + lrh_qws, pbc_dlid, pbc_dws, len, xfer_bytes_tail, + payload_qws_total, psn, data, tag, tx_op_flags, caps, hfi1_type); + + uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); + + assert(hfi1_type & OPX_HFI1_JKR); + + /* write one block of PIO non-SOP, either one full block (8 qws) or the partial qws/block */ + const size_t first_block_qws = full_block_credits_needed ? 8 : tail_partial_block_qws; + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(opx_ep, &pio_state, local_temp, buf_qws, + first_block_qws, tag); + + buf_qws = buf_qws + first_block_qws - 1 /* qws of payload, not the kdeth qword */; + /* adjust full or partial for what we just consumed */ + if (full_block_credits_needed) full_block_credits_needed--; + /* we wrote 7 qws; that counts as the partial tail */ + else tail_partial_block_qws = 0; + + + if (OFI_LIKELY(full_block_credits_needed)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, full_block_credits_needed, + total_credits_available - 2); + } + + if (OFI_LIKELY(tail_partial_block_qws)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, + buf_qws + (full_block_credits_needed << 3), + tail_partial_block_qws - 1); // (tail_partial_block_qws-1) data + 1 QW ICRC + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + +#ifndef NDEBUG + assert(credits_consumed == total_credits_needed); +#endif + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, + xfer_bytes_tail, local_temp, buf, payload_qws_total, reliability, hfi1_type); + ssize_t rc; if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, len, - lock_required, tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, HFI -- EAGER (end)\n"); + "===================================== SEND 16B, HFI -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_select(struct fid_ep *ep, + const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void* context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + return fi_opx_hfi1_tx_send_egr(ep, + buf, len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_WFR); + } else if (hfi1_type & OPX_HFI1_JKR) { + return fi_opx_hfi1_tx_send_egr_16B(ep, + buf, len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + 
do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + return fi_opx_hfi1_tx_send_egr(ep, + buf, len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR_9B); + } + abort(); + return (ssize_t)-1L; } /* @@ -1605,7 +2907,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, @@ -1616,7 +2918,9 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ const uint32_t psn, const uint32_t data, const uint64_t tag, - const uint64_t caps) + uint64_t tx_op_flags, + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { /* * Write the 'start of packet' (hw+sw header) 'send control block' @@ -1627,18 +2931,39 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ /* For a multi-packet eager, the *first* packet's payload length should always be > 15 bytes, so we should be safe to blindly copy 2 qws out of buf */ - fi_opx_set_scb(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST), - opx_ep->tx->send.hdr.qw[2] | (payload_bytes_total << 32) | psn, - opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32), - *((uint64_t *)buf), - *((uint64_t *)buf + 1), tag); + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST)), + opx_ep->tx->send_9B.hdr.qw_9B[2] | (payload_bytes_total << 32) | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + *((uint64_t *)buf), + *((uint64_t *)buf + 1), tag); + } else { + uint32_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST)), + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn | (payload_bytes_total << 32), + opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + *((uint64_t *)buf), + *((uint64_t *)buf + 1)); + } + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -1646,6 +2971,44 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ return 1; /* Consumed 1 credit */ } +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(struct fi_opx_ep *opx_ep, + union fi_opx_hfi1_pio_state *pio_state, + uint64_t *local_storage, + const uint64_t tag, + const size_t payload_after_header_qws, + uint64_t *buf_qws) +{ + union fi_opx_hfi1_pio_state pio_local = *pio_state; + volatile uint64_t * scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_local); + assert(payload_after_header_qws <= 7); + + // spill from 1st cacheline (SOP) + OPX_HFI1_BAR_STORE(&scb_payload[0], tag); // header + local_storage[8] = tag; + + int i = 1; /* start past the hdr qword */ + + /* store remaining buffer */ + for (; i <= payload_after_header_qws ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], buf_qws[i-1]); + local_storage[8 + i] = buf_qws[i-1]; + } + /* store padding if needed */ + for (; i <= 7 ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD); + local_storage[8 + i] = OPX_JKR_16B_PAD_QWORD; + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + FI_OPX_HFI1_CONSUME_CREDITS(pio_local, 1); + pio_state->qw0 = pio_local.qw0; + return 1; + +} + /* * Write the nth packet header of a multi-packet eager send where the remaining payload data is 
This means we'll use all 16 bytes of tail space in the packet header, and @@ -1654,7 +3017,7 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, @@ -1664,20 +3027,40 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(struct fi_opx_ep *opx_ep, const ssize_t xfer_bytes_tail, const uint32_t payload_offset, const uint32_t psn, - const uint32_t mp_egr_uid) + const uint32_t mp_egr_uid, + const enum opx_hfi1_type hfi1_type) { volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, *pio_state); - fi_opx_set_scb(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3], - *((uint64_t *)buf), - *((uint64_t *)buf + 1), - (((uint64_t) mp_egr_uid) << 32) | payload_offset); + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3], + *((uint64_t *)buf), + *((uint64_t *)buf + 1), + (((uint64_t) mp_egr_uid) << 32) | payload_offset); + } else { + uint32_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4], + *((uint64_t *)buf), + *((uint64_t *)buf + 1)); + } FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -1692,7 +3075,7 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, @@ -1702,44 +3085,80 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(struct fi_opx_e const ssize_t xfer_bytes_tail, const uint32_t payload_offset, const uint32_t psn, - const uint32_t mp_egr_uid) + const uint32_t mp_egr_uid, + const enum opx_hfi1_type hfi1_type) { volatile uint64_t * const scb = 
FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, *pio_state); - fi_opx_set_scb_special(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3], - buf, xfer_bytes_tail, - (((uint64_t) mp_egr_uid) << 32) | payload_offset); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_inject_and_copy_scb_9B(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3], + buf, xfer_bytes_tail, + (((uint64_t) mp_egr_uid) << 32) | payload_offset); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + return 1; /* Consumed 1 credit */ + } else { + uint32_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); - FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + // 1st cacheline + fi_opx_store_inject_and_copy_scb_16B(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4], + buf, xfer_bytes_tail); - return 1; /* Consumed 1 credit */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); + + // 2nd cacheline + volatile uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, *pio_state); + + fi_opx_store_inject_and_copy_scb2_16B(scb2, local_storage, (((uint64_t) mp_egr_uid) << 32) | payload_offset ); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); + + opx_ep->tx->pio_state->qw0 = pio_state->qw0; + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + return 2; /* Consumed 2 credits */ + } } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, - void **buf, - const uint64_t payload_bytes_total, - const void *desc, - uint8_t *hmem_bounce_buf, - const uint64_t pbc_dlid, - const uint64_t bth_rx, - const uint64_t lrh_dlid, - const union fi_opx_addr addr, - uint64_t tag, - const uint32_t data, - int lock_required, - const uint64_t caps, - const enum ofi_reliability_kind reliability, - uint32_t *psn_out, - const enum fi_hmem_iface iface, - const uint64_t hmem_device) +ssize_t fi_opx_hfi1_tx_send_mp_egr_first_common(struct fi_opx_ep *opx_ep, + void **buf, + const uint64_t payload_bytes_total, + const void *desc, + uint8_t *hmem_bounce_buf, + const uint64_t pbc_dlid, + const uint64_t bth_rx, + const uint64_t lrh_dlid, + const union fi_opx_addr addr, + uint64_t tag, + const uint32_t data, + int 
lock_required, + uint64_t tx_op_flags, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + uint32_t *psn_out, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); @@ -1748,6 +3167,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-FIRST-HFI"); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, FI_OPX_MP_EGR_CHUNK_CREDITS); if (OFI_UNLIKELY(total_credits_available < 0)) { return -FI_ENOBUFS; @@ -1759,12 +3179,12 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } - *psn_out = psn; /* This will be the UID used in the remaining packets */ + *psn_out = psn; /* This will be the UID used in the remaining packets */ #ifdef OPX_HMEM /* If the source buf resides in GPU memory, copy the entire payload to @@ -1774,40 +3194,76 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, if (iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, hmem_bounce_buf, - *buf, payload_bytes_total, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + *buf, payload_bytes_total, OPX_HMEM_DEV_REG_SEND_THRESHOLD); *buf = hmem_bounce_buf; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi - .kind[FI_OPX_KIND_TAG] - .send.mp_eager); + .kind[FI_OPX_KIND_TAG] + .send.mp_eager); } #endif void *buf_ptr = *buf; - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; + + const uint16_t lrh_dws = (hfi1_type & OPX_HFI1_JKR) ? 
(FI_OPX_MP_EGR_CHUNK_DWS - 2) >> 1 : htons(FI_OPX_MP_EGR_CHUNK_DWS - 1); #ifndef NDEBUG unsigned credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(opx_ep, &pio_state, tmp, buf_ptr, bth_rx, lrh_dlid, - htons(FI_OPX_MP_EGR_CHUNK_DWS - 1), - pbc_dlid, - FI_OPX_MP_EGR_CHUNK_DWS, - payload_bytes_total, - psn, - data, - tag, - caps); + fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(opx_ep, &pio_state, local_temp, buf_ptr, bth_rx, lrh_dlid, + lrh_dws, + pbc_dlid, + FI_OPX_MP_EGR_CHUNK_DWS, + payload_bytes_total, + psn, + data, + tag, + tx_op_flags, + caps, + hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf_ptr + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL); + if (hfi1_type & OPX_HFI1_JKR) { + /* write header and payload */ + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, tag, + 7 /* qws of payload */, buf_qws); + + buf_qws += OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS; + + uint32_t full_block_credits_needed = FI_OPX_MP_EGR_CHUNK_CREDITS - 3; // the last block needs to include the ICRC #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, - buf_qws, - FI_OPX_MP_EGR_CHUNK_CREDITS - 1, - total_credits_available - 1); + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + full_block_credits_needed, + total_credits_available - 2); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + buf_qws = buf_qws + (full_block_credits_needed << 3); + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, + buf_qws, + 7 ); // 7 QW data + 1 QW ICRC + } else { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + FI_OPX_MP_EGR_CHUNK_CREDITS - 1, + total_credits_available - 1); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + } + #ifndef NDEBUG assert(credits_consumed == FI_OPX_MP_EGR_CHUNK_CREDITS); #endif @@ -1819,9 +3275,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, tmp, buf_ptr, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS, reliability); - + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf_ptr, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-FIRST-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, HFI -- MULTI-PACKET EAGER FIRST (end)\n"); @@ -1839,7 +3294,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, const uint64_t lrh_dlid, const union fi_opx_addr addr, int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); @@ -1848,6 +3304,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-NTH-HFI"); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, FI_OPX_MP_EGR_CHUNK_CREDITS); if (OFI_UNLIKELY(total_credits_available < 0)) { OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-HFI"); @@ -1859,23 +3316,24 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, int32_t 
psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; #ifndef NDEBUG unsigned credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, tmp, buf, bth_rx, lrh_dlid, + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, htons(FI_OPX_MP_EGR_CHUNK_DWS - 1), pbc_dlid, FI_OPX_MP_EGR_CHUNK_DWS, FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, payload_offset, psn, - mp_egr_uid); + mp_egr_uid, + hfi1_type); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL); @@ -1883,7 +3341,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, buf_qws, FI_OPX_MP_EGR_CHUNK_CREDITS - 1, total_credits_available - 1); @@ -1898,8 +3356,108 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, tmp, buf, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS, reliability); + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-HFI"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, HFI -- MULTI-PACKET EAGER NTH (end)\n"); + + return FI_SUCCESS; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_mp_egr_nth_16B (struct fi_opx_ep *opx_ep, + const void *buf, + const uint32_t payload_offset, + const uint32_t mp_egr_uid, + const uint64_t pbc_dlid, + const uint64_t bth_rx, + const uint64_t lrh_dlid, + const union fi_opx_addr addr, + int lock_required, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER NTH (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-NTH-HFI"); + + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, FI_OPX_MP_EGR_CHUNK_CREDITS); + if (OFI_UNLIKELY(total_credits_available < 0)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-HFI"); + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int32_t psn; + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + return -FI_EAGAIN; + } + + uint64_t local_temp[16] = {0}; +#ifndef NDEBUG + unsigned credits_consumed = +#endif + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + (FI_OPX_MP_EGR_CHUNK_DWS - 2) >> 1, + pbc_dlid, + FI_OPX_MP_EGR_CHUNK_DWS, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, + 
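
The nth-packet header writers above receive two different length encodings for the same chunk, one per wire format; a small sketch of how they relate, assuming pbc_dws counts PBC + header + payload + ICRC in 4-byte words:

#include <arpa/inet.h>
#include <stdint.h>

/* 9B LRH: big-endian dword count; drop the 2 PBC dws, add back 1 ICRC dw,
 * so htons(pbc_dws - 2 + 1) == htons(pbc_dws - 1). */
static uint16_t sketch_lrh_len_9b(uint64_t pbc_dws)
{
	return htons((uint16_t)(pbc_dws - 1));
}

/* 16B LRH: little-endian qword count; drop the 2 PBC dws and halve. The
 * ICRC/tail qword is already included in pbc_dws on this path. */
static uint16_t sketch_lrh_len_16b(uint64_t pbc_dws)
{
	return (uint16_t)((pbc_dws - 2) >> 1);
}
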
payload_offset, + psn, + mp_egr_uid, + hfi1_type); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL); + + /* header and payload */ +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, + (((uint64_t) mp_egr_uid) << 32) | payload_offset, + 7 /* qws of payload */, buf_qws); + buf_qws += OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS; + + uint16_t full_block_credits_needed = FI_OPX_MP_EGR_CHUNK_CREDITS - 3; +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + full_block_credits_needed, + total_credits_available - 2); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + buf_qws = buf_qws + (full_block_credits_needed << 3); + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, + buf_qws, + 7 ); // 7 QW data + 1 QW ICRC + +#ifndef NDEBUG + assert(credits_consumed == FI_OPX_MP_EGR_CHUNK_CREDITS); +#endif + + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1919,7 +3477,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, const uint64_t lrh_dlid, const union fi_opx_addr addr, int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); @@ -1951,9 +3510,10 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, const uint64_t pbc_dws = 16 + /* pbc + packet header */ (payload_qws_total << 1); - const uint16_t lrh_dws = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); if (OFI_UNLIKELY(total_credits_available < 0)) { OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-LAST"); @@ -1965,13 +3525,13 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, int32_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-MP-EAGER-NTH-LAST"); return -FI_EAGAIN; } - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; #ifndef NDEBUG unsigned credits_consumed; @@ -1981,16 +3541,16 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(opx_ep, &pio_state, tmp, buf, bth_rx, + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, lrh_dws, pbc_dlid, pbc_dws, len, payload_offset, - psn, mp_egr_uid); + psn, mp_egr_uid, 
hfi1_type); } else { #ifndef NDEBUG credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, tmp, buf, bth_rx, lrh_dlid, - lrh_dws, pbc_dlid, pbc_dws, xfer_bytes_tail, payload_offset, psn, mp_egr_uid); + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + lrh_dws, pbc_dlid, pbc_dws, xfer_bytes_tail, payload_offset, psn, mp_egr_uid, hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); @@ -1998,7 +3558,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, buf_qws, full_block_credits_needed, total_credits_available - 1); @@ -2008,7 +3568,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_payload_tail(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, buf_qws + (full_block_credits_needed << 3), payload_qws_tail); } @@ -2024,8 +3584,170 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - xfer_bytes_tail, tmp, buf, - payload_qws_total, reliability); + xfer_bytes_tail, local_temp, buf, + payload_qws_total, reliability, hfi1_type); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-LAST"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, HFI -- MULTI-PACKET EAGER LAST (end)\n"); + + return FI_SUCCESS; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_mp_egr_last_16B (struct fi_opx_ep *opx_ep, + const void *buf, + const uint32_t payload_offset, + const ssize_t len, + const uint32_t mp_egr_uid, + const uint64_t pbc_dlid, + const uint64_t bth_rx, + const uint64_t lrh_dlid, + const union fi_opx_addr addr, + int lock_required, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER LAST (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-NTH-LAST"); + + size_t xfer_bytes_tail; + if (len <= FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL) { + xfer_bytes_tail = len; + } else if (!(len & 0x07ul)) { + /* Length is a multiple of 8 bytes and must be at least 24. + We can store 16 bytes of that in tail */ + xfer_bytes_tail = FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL; + } else { + /* Length is not a multiple of 8 bytes, and it's greater than 16. 
+ We can store 8 + n bytes in tail (where n == len % 8) */ + xfer_bytes_tail = 8 + (len & 0x07ul); + } + + const size_t payload_qws_total = (len - xfer_bytes_tail) >> 3; + /* 16B (RcvPktCtrl=9) has 1 QW of KDETH and 1 QW of tail in PIO (non-SOP) */ + const size_t kdeth9_qws_total = 1; + const size_t tail_qws_total = 1; + + /* Full 64 byte/8 qword blocks -- 1 credit per block */ + uint16_t full_block_credits_needed = (uint16_t)((kdeth9_qws_total + payload_qws_total + tail_qws_total) >> 3); + /* Remaining tail qwords (< 8) after full blocks */ + size_t tail_partial_block_qws = (kdeth9_qws_total + payload_qws_total + tail_qws_total) & 0x07ul; + + const uint64_t pbc_dws = + /* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth */ + /* PIO is everything else */ + (kdeth9_qws_total << 1) + /* kdeth9 remaining 2 dws */ + //--------------------- header split point KDETH 9 DWS + (payload_qws_total << 1) + /* one packet payload */ + (tail_qws_total << 1) ; /* tail 1 qws/2 dws */ + + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + + uint16_t total_credits_needed = + 1 + /* PIO SOP -- 1 credit */ + full_block_credits_needed + /* PIO full blocks -- kdeth9/payload/tail */ + (tail_partial_block_qws > 0); /* PIO partial block -- 1 credit */ + + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < 0)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-LAST"); + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int32_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-MP-EAGER-NTH-LAST"); + return -FI_EAGAIN; + } + + uint64_t local_temp[16] = {0}; + +#ifndef NDEBUG + unsigned credits_consumed; +#endif + + if (OFI_UNLIKELY(len <= FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL)) { +#ifndef NDEBUG + credits_consumed = +#endif + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(opx_ep, &pio_state, local_temp, buf, bth_rx, + lrh_dlid, lrh_qws, pbc_dlid, pbc_dws, len, payload_offset, + psn, mp_egr_uid, hfi1_type); + } else { +#ifndef NDEBUG + credits_consumed = +#endif + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + lrh_qws, pbc_dlid, pbc_dws, xfer_bytes_tail, payload_offset, psn, mp_egr_uid, hfi1_type); + uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); + + /* write 7 qwords of payload data or the partial tail qws/block minus hdr/kdeth minus tail (not in buffer) */ + const size_t payload_after_hdr_qws = full_block_credits_needed ? 
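
To make the sizing rules above concrete, here is the 16B last-chunk arithmetic worked for a hypothetical len of 100 bytes (the 16-byte tail capacity comes from the comment above):

#include <assert.h>
#include <stddef.h>

/* len = 100 is not a multiple of 8, so the tail carries 8 + (len % 8) bytes
 * and the remainder moves as full qwords alongside 1 kdeth qw and 1 tail qw. */
static void sketch_16b_last_chunk_sizing(void)
{
	const size_t len = 100;
	const size_t xfer_bytes_tail   = 8 + (len & 0x07ul);                 /* 12 */
	const size_t payload_qws_total = (len - xfer_bytes_tail) >> 3;       /* 11 */
	const size_t kdeth9_qws = 1, tail_qws = 1;
	const size_t pio_qws    = kdeth9_qws + payload_qws_total + tail_qws; /* 13 */
	const size_t full_blocks = pio_qws >> 3;                             /*  1 */
	const size_t partial_qws = pio_qws & 0x07ul;                         /*  5 */
	const size_t pbc_dws = 2 + 4 + 3 + 3 + 4 +                           /* SOP block */
			       (kdeth9_qws << 1) +
			       (payload_qws_total << 1) +
			       (tail_qws << 1);                               /* 42 */

	assert(((pbc_dws - 2) >> 1) == 20);               /* lrh_qws */
	assert(1 + full_blocks + (partial_qws > 0) == 3); /* total_credits_needed */
}
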
+ OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS : + tail_partial_block_qws - kdeth9_qws_total - tail_qws_total ; + + /* header and payload */ +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, + (((uint64_t) mp_egr_uid) << 32) | payload_offset, + payload_after_hdr_qws, buf_qws); + + buf_qws += payload_after_hdr_qws /* qws of payload, not the kdeth qword */; + + /* adjust full or partial for what we just consumed */ + if (full_block_credits_needed) full_block_credits_needed--; + /* we wrote 7 qw, counts as partial tail*/ + else tail_partial_block_qws = 0; + + if (OFI_LIKELY(full_block_credits_needed)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + full_block_credits_needed, + total_credits_available - 2); + } + + if (OFI_LIKELY(tail_partial_block_qws)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, + buf_qws + (full_block_credits_needed << 3), + tail_partial_block_qws - 1);// (tail_partial_block_qws-1) data + 1 QW ICRC + } + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + +#ifndef NDEBUG + assert(credits_consumed == total_credits_needed); +#endif + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, + xfer_bytes_tail, local_temp, buf, + payload_qws_total, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-LAST"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -2034,66 +3756,136 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, return FI_SUCCESS; } + static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, const uint8_t dest_hfi_unit, const uint64_t dest_rx, const uint64_t lrh_dlid, struct fi_opx_completion_counter *cc, const uint64_t bytes_to_sync, - const uint32_t dest_extended_rx) + const uint32_t dest_extended_rx, + enum opx_hfi1_type hfi1_type) { - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (0 << 4); - const uint16_t lrh_dws = htons(pbc_dws - 1); const uint64_t bth_rx = dest_rx << 56; uint64_t pos; ssize_t rc; /* DAOS support - rank_inst field has been depricated and will be phased out. * The value is always zero. 
*/ - union fi_opx_hfi1_packet_hdr * tx_hdr = opx_shm_tx_next( + union opx_hfi1_packet_hdr * hdr = opx_shm_tx_next( &opx_ep->tx->shm, dest_hfi_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, dest_extended_rx, 0, &rc); /* Potential infinite loop, unable to return result to application */ - while(OFI_UNLIKELY(tx_hdr == NULL)) { //TODO: Verify that all callers of this function can tolderate a NULL rc - fi_opx_shm_poll_many(&opx_ep->ep_fid, FI_OPX_LOCK_NOT_REQUIRED); - tx_hdr = opx_shm_tx_next( + while(OFI_UNLIKELY(hdr == NULL)) { //TODO: Verify that all callers of this function can tolerate a NULL rc + fi_opx_shm_poll_many(&opx_ep->ep_fid, FI_OPX_LOCK_NOT_REQUIRED, OPX_HFI1_TYPE); + hdr = opx_shm_tx_next( &opx_ep->tx->shm, dest_hfi_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, dest_extended_rx, 0, &rc); } - tx_hdr->qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.cts.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); - tx_hdr->qw[5] = (uintptr_t)cc; - tx_hdr->qw[6] = bytes_to_sync; - - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)tx_hdr, pos); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const uint64_t pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (0 << 4); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); + hdr->qw_9B[5] = (uintptr_t)cc; + hdr->qw_9B[6] = bytes_to_sync; + } else { + const uint64_t pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2; /* ICRC/tail */ + const uint16_t lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); + hdr->qw_16B[6] = (uintptr_t)cc; + hdr->qw_16B[7] = bytes_to_sync; + } + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); } ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context, const uint32_t data, int lock_required, + void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t 
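
The fence header construction above hard-codes two dword budgets; restated as standalone constants for quick checking (the OPX_SKETCH_ names are illustrative, not provider identifiers):

/* Fence packet PBC dword budgets as itemized above. 9B excludes the ICRC
 * from pbc_dws, so its LRH length is htons(16 - 2 + 1); 16B includes the
 * ICRC/tail, so its LRH length is (20 - 2) >> 1 == 9 qws. */
enum {
	OPX_SKETCH_PBC     = 2,
	OPX_SKETCH_LRH_9B  = 2,
	OPX_SKETCH_LRH_16B = 4,
	OPX_SKETCH_BTH     = 3,
	OPX_SKETCH_KDETH   = 9,
	OPX_SKETCH_TAIL    = 2
};

static const int sketch_fence_pbc_dws_9b =
	OPX_SKETCH_PBC + OPX_SKETCH_LRH_9B + OPX_SKETCH_BTH + OPX_SKETCH_KDETH;  /* 16 */
static const int sketch_fence_pbc_dws_16b =
	OPX_SKETCH_PBC + OPX_SKETCH_LRH_16B + OPX_SKETCH_BTH + OPX_SKETCH_KDETH +
	OPX_SKETCH_TAIL;                                                         /* 20 */
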
*origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device); + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *context, + fi_addr_t dest_addr, uint64_t tag, void *user_context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface hmem_iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type); + +ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device); + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type); +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_rzv_select(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void *context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface hmem_iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, + lock_required, override_flags, tx_op_flags, dest_rx, + caps, reliability, do_cq_completion, hmem_iface, + hmem_device, OPX_HFI1_WFR); + } else if (hfi1_type & OPX_HFI1_JKR) { + return fi_opx_hfi1_tx_send_rzv_16B(ep, buf, len, desc, dest_addr, tag, context, data, + lock_required, override_flags, tx_op_flags, dest_rx, + caps, reliability, do_cq_completion, hmem_iface, + hmem_device, OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, + lock_required, override_flags, tx_op_flags, dest_rx, + caps, reliability, do_cq_completion, hmem_iface, + hmem_device, OPX_HFI1_JKR_9B); + } + abort(); + return (ssize_t)-1L; +} #endif /* _FI_PROV_OPX_HFI1_TRANSPORT_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h index 1d87a9244d4..e3b2fd8503d 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h @@ -42,209 +42,108 @@ // RHF changes // Common to both JKR/WFR -#define OPX_RHF_RCV_TYPE_EXPECTED_RCV(_rhf) ((_rhf & 0x00007000ul) == 0x00000000ul) -#define OPX_RHF_RCV_TYPE_EAGER_RCV(_rhf) ((_rhf & 0x00001000ul) == 0x00001000ul) -#define OPX_RHF_RCV_TYPE_OTHER(_rhf) ((_rhf & 0x00006000ul) != 0x00000000ul) +#define OPX_RHF_RCV_TYPE_EXPECTED_RCV(_rhf, _noop) ((_rhf & 0x00007000ul) == 0x00000000ul) +#define 
OPX_RHF_RCV_TYPE_EAGER_RCV(_rhf, _noop) ((_rhf & 0x00001000ul) == 0x00001000ul) +#define OPX_RHF_RCV_TYPE_OTHER(_rhf, _noop) ((_rhf & 0x00006000ul) != 0x00000000ul) -#define OPX_PBC_CR(cr) ((cr & FI_OPX_HFI1_PBC_CR_MASK) << FI_OPX_HFI1_PBC_CR_SHIFT) -#define OPX_PBC_LEN(len) (len) -#define OPX_PBC_VL(vl) ((vl & FI_OPX_HFI1_PBC_VL_MASK) << FI_OPX_HFI1_PBC_VL_SHIFT) +#define OPX_PBC_CR(cr, _noop) ((cr & FI_OPX_HFI1_PBC_CR_MASK) << FI_OPX_HFI1_PBC_CR_SHIFT) +#define OPX_PBC_LEN(len, _noop) (len) +#define OPX_PBC_VL(vl, _noop) ((vl & FI_OPX_HFI1_PBC_VL_MASK) << FI_OPX_HFI1_PBC_VL_SHIFT) /* Note: double check JKR sc bits */ -#define OPX_PBC_SC(sc) (((sc >> FI_OPX_HFI1_PBC_SC4_SHIFT) & FI_OPX_HFI1_PBC_SC4_MASK) << FI_OPX_HFI1_PBC_DCINFO_SHIFT) +#define OPX_PBC_SC(sc, _noop) (((sc >> FI_OPX_HFI1_PBC_SC4_SHIFT) & FI_OPX_HFI1_PBC_SC4_MASK) << FI_OPX_HFI1_PBC_DCINFO_SHIFT) /* PBC most significant bits shift (32 bits) defines */ -#define OPX_PBC_MSB_SHIFT 32 +#define OPX_MSB_SHIFT 32 -#if (defined(OPX_WFR) && !defined(OPX_JKR)) /***************************************************************/ -/* WFR Build specific definitions */ +/* Both JKR and WFR are now supported at runtime (no longer */ +/* selected with build-time constants). */ +/* */ +/* Runtime support relies on a local variable "hfi1_type", */ +/* passed down as a constant through macros, inlined */ +/* functions, and function tables wherever possible, so the */ +/* per-HFI path can be selected and optimized at compile time. */ /***************************************************************/ - #define OPX_PBC_DLID OPX_PBC_WFR_DLID - #define OPX_PBC_SCTXT OPX_PBC_WFR_SCTXT - #define OPX_PBC_L2COMPRESSED OPX_PBC_WFR_L2COMPRESSED - #define OPX_PBC_PORTIDX OPX_PBC_WFR_PORTIDX - #define OPX_PBC_L2TYPE OPX_PBC_WFR_L2TYPE - #define OPX_PBC_RUNTIME OPX_PBC_WFR_RUNTIME - #define OPX_PBC_LRH_DLID_TO_PBC_DLID OPX_PBC_WFR_LRH_DLID_TO_PBC_DLID +#define OPX_PBC_DLID(dlid, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_DLID(dlid) : OPX_PBC_JKR_DLID(dlid)) -#elif (defined(OPX_JKR) && !defined(OPX_WFR)) /***************************************************************/ -/* JKR Build specific definitions */ -/***************************************************************/ - - #define OPX_PBC_DLID OPX_PBC_JKR_DLID - #define OPX_PBC_SCTXT OPX_PBC_JKR_SCTXT - #define OPX_PBC_L2COMPRESSED OPX_PBC_JKR_L2COMPRESSED - #define OPX_PBC_PORTIDX OPX_PBC_JKR_PORTIDX - #define OPX_PBC_L2TYPE OPX_PBC_JKR_L2TYPE - #define OPX_PBC_RUNTIME OPX_PBC_JKR_RUNTIME - #define OPX_PBC_LRH_DLID_TO_PBC_DLID OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID +#define OPX_PBC_SCTXT(ctx, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_SCTXT(ctx) : OPX_PBC_JKR_SCTXT(ctx)) -#elif (defined(OPX_JKR) && defined(OPX_WFR)) /***************************************************************/ -/* Both JKR and WFR runtime support (not build-time constants) */ -/***************************************************************/ +#define OPX_PBC_L2COMPRESSED(c, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_L2COMPRESSED(c) : OPX_PBC_JKR_L2COMPRESSED(c)) - #define OPX_PBC_DLID(dlid) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_DLID(dlid) : OPX_PBC_WFR_DLID(dlid)) +#define OPX_PBC_PORTIDX(pidx, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_PORTIDX(pidx) : OPX_PBC_JKR_PORTIDX(pidx)) - #define OPX_PBC_SCTXT(ctx) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_SCTXT(ctx) : OPX_PBC_WFR_SCTXT(ctx)) +#define OPX_PBC_LRH_DLID_TO_PBC_DLID(dlid, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? 
\ +OPX_PBC_WFR_LRH_DLID_TO_PBC_DLID(dlid) : OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(dlid)) - #define OPX_PBC_L2COMPRESSED(c) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_L2COMPRESSED(c) : OPX_PBC_WFR_L2COMPRESSED(c)) - #define OPX_PBC_PORTIDX(pidx) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_PORTIDX(pidx) : OPX_PBC_WFR_PORTIDX(pidx)) +#define OPX_PBC_L2TYPE(type, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_L2TYPE(type) : OPX_PBC_JKR_L2TYPE(type)) - #define OPX_PBC_LRH_DLID_TO_PBC_DLID(dlid) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(dlid) : OPX_PBC_WFR_LRH_DLID_TO_PBC_DLID(dlid)) - - -/* Mixed WFR/JKR header support must be 9B */ -#ifndef NDEBUG - - __OPX_FORCE_INLINE__ - uint32_t opx_pbc_l2type(unsigned type) - { - assert(type == OPX_PBC_JKR_L2TYPE_9B); - return ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? - OPX_PBC_JKR_L2TYPE(type) : OPX_PBC_WFR_L2TYPE(type)); - } - #define OPX_PBC_L2TYPE(type) opx_pbc_l2type(type) -#else - - #define OPX_PBC_L2TYPE(type) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) : \ - OPX_PBC_WFR_L2TYPE(OPX_PBC_JKR_L2TYPE_9B)) /* OPX_PBC_WFR_UNUSED */ -#endif +/* One runtime check for multiple fields - DLID, PORT, L2TYPE */ +#define OPX_PBC_RUNTIME(_dlid, _pidx, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + (OPX_PBC_WFR_DLID(_dlid) | OPX_PBC_WFR_PORTIDX(_pidx)) : \ + (OPX_PBC_JKR_DLID(_dlid) | OPX_PBC_JKR_PORTIDX(_pidx))) - /* One runtime check for mutiple fields - DLID, PORT, L2TYPE */ - #define OPX_PBC_RUNTIME(dlid, pidx) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - (OPX_PBC_JKR_DLID(dlid) | OPX_PBC_JKR_PORTIDX(pidx)) : \ - (OPX_PBC_WFR_DLID(dlid) | OPX_PBC_WFR_PORTIDX(pidx)) ) -#else /* ERROR */ - #warning Should not happen Not WFR and Not JKR - #error "NOT WFR AND NOT JKR" -#endif - #define OPX_BTH_UNUSED 0 // Default unsupported values to 0 -#if (defined(OPX_JKR) && !defined(OPX_WFR)) /***************************************************************/ -/* JKR Build specific definitions */ -/***************************************************************/ - -#define OPX_BTH_CSPEC(_cspec) OPX_BTH_JKR_CSPEC(_cspec) -#define OPX_BTH_RC2(_rc2) OPX_BTH_JKR_RC2(_rc2) -#define OPX_BTH_CSPEC_DEFAULT OPX_BTH_UNUSED // Cspec is not used in 9B header -#define OPX_BTH_RC2_VAL OPX_BTH_JKR_RC2_VAL - -#elif (defined(OPX_WFR) && !defined(OPX_JKR)) /***************************************************************/ -/* WKR Build specific definitions */ -/***************************************************************/ - -#define OPX_BTH_RC2(_rc2) OPX_BTH_UNUSED -#define OPX_BTH_CSPEC(_cspec) OPX_BTH_UNUSED -#define OPX_BTH_CSPEC_DEFAULT OPX_BTH_UNUSED -#define OPX_BTH_RC2_VAL OPX_BTH_UNUSED - -#elif (defined(OPX_JKR) && defined(OPX_WFR)) /***************************************************************/ -/* Both JKR and WFR runtime support (not build-time constants) */ -/***************************************************************/ - -#define OPX_BTH_RC2(_rc2) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ +#define OPX_BTH_RC2(_rc2, _hfi1_type) ((_hfi1_type & OPX_HFI1_JKR) ? \ OPX_BTH_JKR_RC2(_rc2) : OPX_BTH_UNUSED) -#define OPX_BTH_CSPEC(_cspec) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ +#define OPX_BTH_CSPEC(_cspec, _hfi1_type) ((_hfi1_type & OPX_HFI1_JKR) ? \ OPX_BTH_JKR_CSPEC(_cspec) : OPX_BTH_UNUSED) #define OPX_BTH_CSPEC_DEFAULT OPX_BTH_UNUSED // Cspec is not used in 9B header -#define OPX_BTH_RC2_VAL ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ +#define OPX_BTH_RC2_VAL(_hfi1_type) ((_hfi1_type & OPX_HFI1_JKR) ? 
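
The point of threading _hfi1_type through every macro is that callers which pass a literal type let the compiler fold the selection away entirely; a minimal sketch of the pattern, with illustrative names and shift values:

#include <stdint.h>

enum sketch_hfi1_type { SKETCH_WFR = 1 << 0, SKETCH_JKR = 1 << 1 };

/* When 'type' is a compile-time constant (as in the specialized send paths),
 * the ternary below disappears after inlining; when it is a runtime value,
 * it costs a single bitmask test. The shifts are stand-ins, not the real
 * WFR/JKR DLID encodings. */
static inline uint64_t sketch_pbc_dlid(uint32_t dlid, enum sketch_hfi1_type type)
{
	return (type & SKETCH_WFR) ? ((uint64_t)dlid << 16)
				   : ((uint64_t)dlid << 20);
}
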
\ OPX_BTH_JKR_RC2_VAL : OPX_BTH_UNUSED) -#endif +#define OPX_RHF_SEQ_NOT_MATCH(_seq, _rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_NOT_MATCH(_seq, _rhf) : OPX_JKR_RHF_SEQ_NOT_MATCH(_seq, _rhf)) -#if (defined(OPX_JKR) && !defined(OPX_WFR)) -/***************************************************************/ -/* JKR Build specific definitions */ -/***************************************************************/ -#define OPX_RHF_SEQ_NOT_MATCH OPX_JKR_RHF_SEQ_NOT_MATCH -#define OPX_RHF_SEQ_INCREMENT OPX_JKR_RHF_SEQ_INCREMENT -#define OPX_IS_ERRORED_RHF OPX_JKR_IS_ERRORED_RHF -#define OPX_RHF_SEQ_MATCH OPX_JKR_RHF_SEQ_MATCH -#define OPX_RHF_SEQ_INIT_VAL OPX_JKR_RHF_SEQ_INIT_VAL -#define OPX_RHF_IS_USE_EGR_BUF OPX_JKR_RHF_IS_USE_EGR_BUF -#define OPX_RHF_EGR_INDEX OPX_JKR_RHF_EGR_INDEX -#define OPX_RHF_EGR_OFFSET OPX_JKR_RHF_EGR_OFFSET -#define OPX_RHF_HDRQ_OFFSET OPX_JKR_RHF_HDRQ_OFFSET - -#define OPX_RHE_DEBUG OPX_JKR_RHE_DEBUG -#define OPX_RHF_CHECK_HEADER OPX_JKR_RHF_CHECK_HEADER - -#elif (defined(OPX_WFR) && !defined(OPX_JKR)) -/***************************************************************/ -/* WKR Build specific definitions */ -/***************************************************************/ -#define OPX_RHF_SEQ_NOT_MATCH OPX_WFR_RHF_SEQ_NOT_MATCH -#define OPX_RHF_SEQ_INCREMENT OPX_WFR_RHF_SEQ_INCREMENT -#define OPX_IS_ERRORED_RHF OPX_WFR_IS_ERRORED_RHF -#define OPX_RHF_SEQ_MATCH OPX_WFR_RHF_SEQ_MATCH -#define OPX_RHF_SEQ_INIT_VAL OPX_WFR_RHF_SEQ_INIT_VAL -#define OPX_RHF_IS_USE_EGR_BUF OPX_WFR_RHF_IS_USE_EGR_BUF -#define OPX_RHF_EGR_INDEX OPX_WFR_RHF_EGR_INDEX -#define OPX_RHF_EGR_OFFSET OPX_WFR_RHF_EGR_OFFSET -#define OPX_RHF_HDRQ_OFFSET OPX_WFR_RHF_HDRQ_OFFSET - -#define OPX_RHE_DEBUG OPX_WFR_RHE_DEBUG -#define OPX_RHF_CHECK_HEADER OPX_WFR_RHF_CHECK_HEADER - -#elif (defined(OPX_JKR) && defined(OPX_WFR)) -/***************************************************************/ -/* Both JKR and WFR runtime support (not build-time constants) */ -/* Constant macro magic will be used later for this */ -/***************************************************************/ -#define OPX_RHF_SEQ_NOT_MATCH(_seq, _rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_SEQ_NOT_MATCH(_seq, _rhf) : OPX_WFR_RHF_SEQ_NOT_MATCH(_seq, _rhf)) +#define OPX_RHF_SEQ_INCREMENT(_seq, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_INCREMENT(_seq) : OPX_JKR_RHF_SEQ_INCREMENT(_seq)) -#define OPX_RHF_SEQ_INCREMENT(_seq) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_SEQ_INCREMENT(_seq) : OPX_WFR_RHF_SEQ_INCREMENT(_seq)) +#define OPX_IS_ERRORED_RHF(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_IS_ERRORED_RHF(_rhf, _hfi1_type) : OPX_JKR_IS_ERRORED_RHF(_rhf, _hfi1_type)) -#define OPX_IS_ERRORED_RHF(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_IS_ERRORED_RHF(_rhf) : OPX_WFR_IS_ERRORED_RHF(_rhf)) +#define OPX_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) : OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type)) -#define OPX_RHF_SEQ_MATCH(_seq, _rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf) : OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf)) +/* Init-time, let it use the variable - not optimized */ +#define OPX_RHF_SEQ_INIT_VAL(_hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_INIT_VAL : OPX_JKR_RHF_SEQ_INIT_VAL) -#define OPX_RHF_SEQ_INIT_VAL ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? 
\ - OPX_JKR_RHF_SEQ_INIT_VAL : OPX_WFR_RHF_SEQ_INIT_VAL) +#define OPX_RHF_IS_USE_EGR_BUF(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_IS_USE_EGR_BUF(_rhf) : OPX_JKR_RHF_IS_USE_EGR_BUF(_rhf)) -#define OPX_RHF_IS_USE_EGR_BUF(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_IS_USE_EGR_BUF(_rhf) : OPX_WFR_RHF_IS_USE_EGR_BUF(_rhf)) +#define OPX_RHF_EGR_INDEX(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_EGR_INDEX(_rhf) : OPX_JKR_RHF_EGR_INDEX(_rhf)) -#define OPX_RHF_EGR_INDEX(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_EGR_INDEX(_rhf) : OPX_WFR_RHF_EGR_INDEX(_rhf)) +#define OPX_RHF_EGR_OFFSET(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_EGR_OFFSET(_rhf) : OPX_JKR_RHF_EGR_OFFSET(_rhf)) -#define OPX_RHF_EGR_OFFSET(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_EGR_OFFSET(_rhf) : OPX_WFR_RHF_EGR_OFFSET(_rhf)) +#define OPX_RHF_HDRQ_OFFSET(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_HDRQ_OFFSET(_rhf) : OPX_JKR_RHF_HDRQ_OFFSET(_rhf)) -#define OPX_RHF_HDRQ_OFFSET(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_HDRQ_OFFSET(_rhf) : OPX_WFR_RHF_HDRQ_OFFSET(_rhf)) +#define OPX_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) \ + ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) : \ + OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type)) -#define OPX_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) \ - ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) : \ - OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr)) +#define OPX_RHF_CHECK_HEADER(_rhf_rcvd, _pktlen, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _pktlen, _hfi1_type) : OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _pktlen, _hfi1_type)) -#define OPX_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) : OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) -#endif +#define OPX_HEADER_SIZE (8 * 8) // doesn't include PBC. For 9B it includes the unused_pad qw. #endif + + + diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h index df8e6ea13af..086795afc23 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h @@ -52,8 +52,6 @@ __OPX_FORCE_INLINE__ uint32_t opx_pbc_wfr_l2type(unsigned _type) { - /* Just verify WFR isn't attempting non-9B */ - assert(_type == _OPX_PBC_JKR_L2TYPE_9B_); return OPX_PBC_WFR_UNUSED; } #define OPX_PBC_WFR_L2TYPE(_type) opx_pbc_wfr_l2type(_type) @@ -64,7 +62,7 @@ #define OPX_PBC_WFR_RUNTIME(_dlid, _pidx) OPX_PBC_WFR_UNUSED /* Unused WFR field - always initialized with PBC to 0. 
- #define OPX_PBC_STATICRCC(srcc) (((unsigned long long)(dlid & OPX_PBC_WFR_STATICRCC_MASK) << OPX_PBC_WFR_STATICRCC_SHIFT) << OPX_PBC_MSB_SHIFT) + #define OPX_PBC_STATICRCC(srcc) (((unsigned long long)(dlid & OPX_PBC_WFR_STATICRCC_MASK) << OPX_PBC_WFR_STATICRCC_SHIFT) << OPX_MSB_SHIFT) */ /* WFR @@ -83,8 +81,8 @@ #define OPX_WFR_RHF_SEQ_NOT_MATCH(_seq, _rhf) (_seq != (_rhf & 0xF0000000ul)) #define OPX_WFR_RHF_SEQ_INCREMENT(_seq) ((_seq < 0xD0000000ul) * _seq + 0x10000000ul) -#define OPX_WFR_IS_ERRORED_RHF(_rhf) (_rhf & 0xBFE0000000000000ul) -#define OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf) (_seq == (_rhf & 0xF0000000ul)) +#define OPX_WFR_IS_ERRORED_RHF(_rhf, _hfi1_type) (_rhf & 0xBFE0000000000000ul) +#define OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) (_seq == (_rhf & 0xF0000000ul)) #define OPX_WFR_RHF_SEQ_INIT_VAL (0x10000000ul) #define OPX_WFR_RHF_IS_USE_EGR_BUF(_rhf) ((_rhf & 0x00008000ul) == 0x00008000ul) #define OPX_WFR_RHF_EGRBFR_INDEX_MASK (0x7FF) @@ -112,10 +110,11 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -#define OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) \ - opx_wfr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) +#define OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) \ + opx_wfr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) // Common to both JKR/WFR @@ -124,22 +123,24 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, #define OPX_WFR_RHF_RCV_TYPE_OTHER(_rhf) ((_rhf & 0x00006000ul) != 0x00000000ul) /* Common (jkr) handler to WFR/JKR 9B (for now) */ -int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr); +int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -__OPX_FORCE_INLINE__ int opx_wfr_rhf_check_header(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr) +__OPX_FORCE_INLINE__ int opx_wfr_rhf_check_header(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { /* RHF error */ - if (OFI_UNLIKELY(OPX_WFR_IS_ERRORED_RHF(rhf_rcvd))) return 1; /* error */ + if (OFI_UNLIKELY(OPX_WFR_IS_ERRORED_RHF(rhf_rcvd, OPX_HFI1_WFR))) return 1; /* error */ /* Bad packet header */ if (OFI_UNLIKELY((!OPX_WFR_RHF_IS_USE_EGR_BUF(rhf_rcvd)) && - (ntohs(hdr->stl.lrh.pktlen) > 0x15) && + (ntohs(hdr->lrh_9B.pktlen) > 0x15) && !(OPX_WFR_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)))) - return opx_jkr_rhf_error_handler(rhf_rcvd, hdr); /* error */ + return opx_jkr_rhf_error_handler(rhf_rcvd, hdr, hfi1_type); /* error */ else return 0; /* no error*/ } -#define OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) opx_wfr_rhf_check_header(_rhf_rcvd, _hdr) +#define OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr, _hfi1_type) opx_wfr_rhf_check_header(_rhf_rcvd, _hdr, _hfi1_type) #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hmem.h b/prov/opx/include/rdma/opx/fi_opx_hmem.h index 677dc509a08..8252a907b78 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hmem.h +++ b/prov/opx/include/rdma/opx/fi_opx_hmem.h @@ -65,6 +65,8 @@ enum fi_hmem_iface 
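
OPX_WFR_RHF_SEQ_INCREMENT above is a branchless wrap: the sequence occupies bits 28..31 and cycles 0x1 through 0xD, and multiplying by the wrap predicate zeroes the old value at the limit so the counter restarts at the init value. A small self-check:

#include <assert.h>
#include <stdint.h>

static uint64_t sketch_wfr_rhf_seq_increment(uint64_t seq)
{
	/* (seq < 0xD0000000) is 1 below the limit and 0 at it */
	return (seq < 0xD0000000ul) * seq + 0x10000000ul;
}

static void sketch_wfr_rhf_seq_checks(void)
{
	assert(sketch_wfr_rhf_seq_increment(0x10000000ul) == 0x20000000ul);
	assert(sketch_wfr_rhf_seq_increment(0xC0000000ul) == 0xD0000000ul);
	assert(sketch_wfr_rhf_seq_increment(0xD0000000ul) == 0x10000000ul); /* wrap */
}
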
fi_opx_hmem_get_iface(const void *ptr, const struct fi_opx_mr *desc, uint64_t *device) { + assert(ptr != NULL); + #ifdef OPX_HMEM if (desc) { switch (desc->attr.iface) { @@ -221,12 +223,12 @@ int opx_copy_from_hmem(enum fi_hmem_iface iface, uint64_t device, uint64_t hmem_ } else { /* Perform standard rocr_memcopy*/ OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "AMD-ROCR-MEMCOPY-FROM-HMEM"); - ret = rocr_copy_to_dev(device, dest, src, len); + ret = rocr_copy_from_dev(device, dest, src, len); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "AMD-ROCR-MEMCOPY-FROM-HMEM"); } break; #endif - + default: OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "OFI-COPY-FROM-HMEM"); ret = ofi_copy_from_hmem(iface, device, dest, src, len); diff --git a/prov/opx/include/rdma/opx/fi_opx_internal.h b/prov/opx/include/rdma/opx/fi_opx_internal.h index ebd0001c040..138de08bbbb 100644 --- a/prov/opx/include/rdma/opx/fi_opx_internal.h +++ b/prov/opx/include/rdma/opx/fi_opx_internal.h @@ -41,11 +41,12 @@ #include #include +#include "rdma/opx/opx_tracer.h" + #define FI_OPX_CACHE_LINE_SIZE (64) -#define FI_OPX_CQ_CONTEXT_EXT (0x8000000000000000ull) +#define FI_OPX_CQ_CONTEXT_HMEM (0x8000000000000000ull) #define FI_OPX_CQ_CONTEXT_MULTIRECV (0x4000000000000000ull) -#define FI_OPX_CQ_CONTEXT_HMEM (0x2000000000000000ull) #define OPX_HMEM_SIZE_QWS (3) @@ -60,108 +61,42 @@ union fi_opx_mp_egr_id { }; } __attribute__((__packed__)); -union fi_opx_context { - struct fi_context2 context; - struct { - //struct slist_entry entry; /* fi_cq_entry::op_context */ - union fi_opx_context * next; /* fi_cq_entry::op_context */ - uint64_t flags; /* fi_cq_msg_entry::flags */ - size_t len; /* fi_cq_msg_entry::len */ - void *buf; /* fi_cq_data_entry::buf (unused for tagged cq's and non-multi-receive message cq's) */ - - union { - uint64_t data; /* fi_cq_data_entry::data; only used _after_ a message is matched */ - fi_addr_t src_addr; /* only used _before_ a message is matched ('FI_DIRECTED_RECEIVE') */ - }; - - union { - uint64_t tag; /* fi_cq_tagged_entry::tag */ - union fi_opx_context *multi_recv_next; /* only for multi-receives; which is not tagged */ - }; - union { - uint64_t ignore; /* only for tagged receive */ - void *claim; /* only for peek/claim */ - void *multi_recv_context; /* only for individual FI_MULTI_RECV's */ - union fi_opx_mp_egr_id mp_egr_id; - }; - - volatile uint64_t byte_counter; - }; -}; - -struct fi_opx_context_slist { - union fi_opx_context * head; - union fi_opx_context * tail; -}; - -static inline void fi_opx_context_slist_init (struct fi_opx_context_slist* list) -{ - list->head = list->tail = NULL; -} - -static inline int fi_opx_context_slist_empty (struct fi_opx_context_slist* list) -{ - return !list->head; -} - -static inline void fi_opx_context_slist_insert_head (union fi_opx_context *item, - struct fi_opx_context_slist* list) -{ - assert(item->next == NULL); - if (fi_opx_context_slist_empty(list)) - list->tail = item; - else - item->next = list->head; - - list->head = item; -} - -static inline void fi_opx_context_slist_insert_tail (union fi_opx_context *item, - struct fi_opx_context_slist* list) -{ - assert(item->next == NULL); - if (fi_opx_context_slist_empty(list)) - list->head = item; - else - list->tail->next = item; - - list->tail = item; -} - -static inline void fi_opx_context_slist_remove_item (union fi_opx_context *item, - union fi_opx_context *prev, struct fi_opx_context_slist *list) -{ - if (prev) { - prev->next = item->next; - } else { - list->head = item->next; - } +struct opx_context { + /**** CACHELINE 0 ****/ + struct 
opx_context *next; /* fi_cq_entry::op_context */ + uint64_t flags; /* fi_cq_msg_entry::flags */ + size_t len; /* fi_cq_msg_entry::len */ + void *buf; /* fi_cq_data_entry::buf (unused for tagged cq's and non-multi-receive message cq's) */ - if (item->next == NULL) { - list->tail = prev; - } + union { + uint64_t data; /* fi_cq_data_entry::data; only used _after_ a message is matched */ + fi_addr_t src_addr; /* only used _before_ a message is matched ('FI_DIRECTED_RECEIVE') */ + }; - item->next = NULL; -} + uint64_t tag; /* fi_cq_tagged_entry::tag */ + union { + uint64_t ignore; /* only for tagged receive */ + void *claim; /* only for peek/claim */ + void *multi_recv_context; /* only for individual FI_MULTI_RECV's */ + union fi_opx_mp_egr_id mp_egr_id; + }; -struct fi_opx_context_ext { - union fi_opx_context opx_context; - struct fi_cq_err_entry err_entry; + volatile uint64_t byte_counter; - // offset 144 bytes + /**** CACHELINE 1 & 2 ****/ + uint64_t hmem_info_qws[OPX_HMEM_SIZE_QWS]; struct { - struct fi_context2 *op_context; size_t iov_count; struct iovec *iov; } msg; - // offset 168 bytes - uint64_t hmem_info_qws[OPX_HMEM_SIZE_QWS]; - - // 184 bytes - uint64_t unused; -} __attribute__((__aligned__(32))); + struct fi_cq_err_entry err_entry; // 88 bytes +} __attribute__((__packed__)) __attribute__((__aligned__(64))); +static_assert(offsetof(struct opx_context, hmem_info_qws) == FI_OPX_CACHE_LINE_SIZE, + "struct opx_context.hmem_info_qws offset should start at Cacheline 1!"); +static_assert(sizeof(struct opx_context) == (FI_OPX_CACHE_LINE_SIZE * 3), + "sizeof(struct opx_context) should be equal to 3 cachelines!"); struct opx_sdma_queue { struct slist list; @@ -204,7 +139,12 @@ static inline int fi_opx_threading_lock_required(const enum fi_threading threadi static inline void fi_opx_lock_if_required (ofi_spin_t *lock, const int required) { - if (required) ofi_spin_lock(lock); + if (required) { + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_BEGIN, "LOCK"); + ofi_spin_lock(lock); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_END_SUCCESS, "LOCK"); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_BEGIN, "LOCK-HELD"); + } } static inline void fi_opx_lock (ofi_spin_t *lock) @@ -214,7 +154,12 @@ static inline void fi_opx_lock (ofi_spin_t *lock) static inline void fi_opx_unlock_if_required (ofi_spin_t *lock, const int required) { - if (required) ofi_spin_unlock(lock); + if (required) { + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_END_SUCCESS, "LOCK-HELD"); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_BEGIN, "UNLOCK"); + ofi_spin_unlock(lock); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_END_SUCCESS, "UNLOCK"); + } } static inline void fi_opx_unlock (ofi_spin_t *lock) diff --git a/prov/opx/include/rdma/opx/fi_opx_match.h b/prov/opx/include/rdma/opx/fi_opx_match.h index 2b0176fedce..3bc62703896 100644 --- a/prov/opx/include/rdma/opx/fi_opx_match.h +++ b/prov/opx/include/rdma/opx/fi_opx_match.h @@ -223,7 +223,6 @@ struct fi_opx_hfi1_ue_packet *fi_opx_match_find_uepkt_by_tag(struct fi_opx_match struct fi_opx_debug_counters *debug_counters) { struct fi_opx_hfi1_ue_packet *uepkt = ue_hash->tag_ht[hash_index].head; - assert(uepkt); FI_OPX_DEBUG_COUNTERS_INC(debug_counters->match.ue_hash_tag_searches); @@ -241,7 +240,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_match_find_uepkt_by_tag(struct fi_opx_match __OPX_FORCE_INLINE__ struct fi_opx_hfi1_ue_packet *fi_opx_match_find_uepkt(struct fi_opx_match_ue_hash *ue_hash, - const union fi_opx_context *context, + const struct opx_context *context, struct 
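
The static_asserts on struct opx_context above are the load-bearing part of the layout change: hot fields are pinned to cacheline boundaries and any accidental growth fails the build. A standalone sketch of the same pattern, with placeholder fields rather than the provider's layout:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_CACHE_LINE 64

struct sketch_ctx {
	void	*next;
	uint64_t flags;
	uint8_t  pad[SKETCH_CACHE_LINE - sizeof(void *) - sizeof(uint64_t)];
	uint64_t hot_line[SKETCH_CACHE_LINE / sizeof(uint64_t)]; /* cacheline 1 */
} __attribute__((__aligned__(SKETCH_CACHE_LINE)));

static_assert(offsetof(struct sketch_ctx, hot_line) == SKETCH_CACHE_LINE,
	      "hot fields must start on their own cacheline");
static_assert(sizeof(struct sketch_ctx) == 2 * SKETCH_CACHE_LINE,
	      "struct must stay exactly two cachelines");
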
fi_opx_debug_counters *debug_counters) { if (!ue_hash->ue.head) { diff --git a/prov/opx/include/rdma/opx/fi_opx_reliability.h b/prov/opx/include/rdma/opx/fi_opx_reliability.h index 065e9dc28ce..61046b9c12b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_reliability.h +++ b/prov/opx/include/rdma/opx/fi_opx_reliability.h @@ -83,7 +83,7 @@ struct fi_opx_completion_counter { struct fi_opx_cntr *cntr; struct fi_opx_cq *cq; union { - union fi_opx_context *context; + struct opx_context *context; void *container; }; void (*hit_zero)(struct fi_opx_completion_counter*); @@ -118,6 +118,12 @@ union fi_opx_reliability_deferred_work { struct fi_opx_reliability_tx_pio_replay_params pio_replay; }; +#define OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN (1) +#define OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX (65535) +#define OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT (128) +#define OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN (1) +#define OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX (65535) +#define OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT (4) struct fi_opx_reliability_service { struct fi_opx_atomic_fifo fifo; /* 27 qws = 216 bytes */ @@ -134,7 +140,10 @@ struct fi_opx_reliability_service { /* == CACHE LINE == */ RbtHandle flow; /* 1 qw = 8 bytes */ uint64_t ping_start_key; - uint64_t unused; + uint16_t max_uncongested_pings; + uint16_t max_congested_pings; + uint8_t congested_flag; + uint8_t unused_padding2[3]; struct { uint64_t unused_cacheline_1; @@ -144,9 +153,12 @@ struct fi_opx_reliability_service { volatile uint64_t * pio_scb_first; /* == CACHE LINE == */ - struct fi_opx_hfi1_txe_scb ping_model; - struct fi_opx_hfi1_txe_scb ack_model; - struct fi_opx_hfi1_txe_scb nack_model; + struct fi_opx_hfi1_txe_scb_9B ping_model_9B; + struct fi_opx_hfi1_txe_scb_9B ack_model_9B; + struct fi_opx_hfi1_txe_scb_9B nack_model_9B; + struct fi_opx_hfi1_txe_scb_16B ping_model_16B; + struct fi_opx_hfi1_txe_scb_16B ack_model_16B; + struct fi_opx_hfi1_txe_scb_16B nack_model_16B; } hfi1; } tx __attribute__((__packed__));; @@ -272,10 +284,16 @@ struct fi_opx_reliability_tx_replay { /* == CACHE LINE == */ /* --- MUST BE 64 BYTE ALIGNED --- */ - struct fi_opx_hfi1_txe_scb scb; + union opx_hfi1_txe_scb_union scb; + uint8_t data[]; } __attribute__((__aligned__(64))); +#define OPX_REPLAY_HDR(_replay) OPX_REPLAY_HDR_TYPE(_replay, OPX_HFI1_TYPE) + +#define OPX_REPLAY_HDR_TYPE(_replay,_hfi1_type) ((_hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? \ + (&((_replay)->scb.scb_9B.hdr)) : (&((_replay)->scb.scb_16B.hdr)) ) + OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_reliability_tx_replay, sdma_we) == FI_OPX_CACHE_LINE_SIZE, "Reliability Replay sdma_we should start on first cacheline!"); OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_reliability_tx_replay, scb) & (FI_OPX_CACHE_LINE_SIZE - 1)) == 0, @@ -360,6 +378,13 @@ RbtIterator fi_opx_rbt_begin(RbtHandle h) { return i != &rbt->sentinel ? 
i : NULL; } +__OPX_FORCE_INLINE__ +void fi_opx_rbt_key(RbtIterator it, uint64_t *key) { + NodeType *i = it; + + *key = (uint64_t) i->key; +} + __OPX_FORCE_INLINE__ void fi_opx_rbt_key_value(RbtHandle h, RbtIterator it, void **key, void **val) { NodeType *i = it; @@ -376,6 +401,8 @@ void fi_opx_rbt_key_value(RbtHandle h, RbtIterator it, void **key, void **val) { uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * service, uuid_t unique_job_key, struct fi_opx_hfi1_context * hfi1, const enum ofi_reliability_kind reliability_kind); +void fi_opx_reliability_model_init_16B (struct fi_opx_reliability_service * service, + struct fi_opx_hfi1_context * hfi1); void fi_opx_reliability_service_fini (struct fi_opx_reliability_service * service); void fi_reliability_service_ping_remote (struct fid_ep *ep, struct fi_opx_reliability_service * service); @@ -400,7 +427,7 @@ struct fi_opx_reliability_rx_uepkt { /* == CACHE LINE == */ uint64_t unused_1; - union fi_opx_hfi1_packet_hdr hdr; /* 56 bytes */ + union opx_hfi1_packet_hdr hdr; /* 56 bytes */ /* == CACHE LINE == */ @@ -417,7 +444,7 @@ union fi_opx_reliability_tx_psn { uint64_t bytes_outstanding:24; } psn; } __attribute__((__packed__)); - + // TODO - make these tunable. #define FI_OPX_RELIABILITY_TX_REPLAY_BLOCKS (2048) #define FI_OPX_RELIABILITY_TX_REPLAY_IOV_BLOCKS (8192) @@ -531,7 +558,7 @@ struct fi_opx_reliability_client_state { // 88 bytes struct fi_opx_reliability_service * service; void (*process_fn)(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_rs); // 104 bytes @@ -553,7 +580,7 @@ void fi_opx_reliability_client_init (struct fi_opx_reliability_client_state * st const uint8_t rx, const uint8_t tx, void (*process_fn)(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_reliability_rx)); @@ -568,31 +595,34 @@ unsigned fi_opx_reliability_client_active (struct fi_opx_reliability_client_stat static inline void fi_reliability_service_process_command (struct fi_opx_reliability_client_state *state, - struct fi_opx_reliability_tx_replay * replay) + struct fi_opx_reliability_tx_replay * replay, + uint32_t slid, uint32_t dlid, + uint8_t tx, uint8_t rx, + const enum opx_hfi1_type hfi1_type) { union fi_opx_reliability_service_flow_key key = { - .slid = replay->scb.hdr.stl.lrh.slid, - .tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(&replay->scb.hdr), - .dlid = replay->scb.hdr.stl.lrh.dlid, - .rx = replay->scb.hdr.stl.bth.rx + .slid = slid, + .tx = tx, + .dlid = dlid, + .rx = rx }; void * itr = NULL; #ifdef OPX_RELIABILITY_DEBUG - fprintf(stderr, "(tx) packet %016lx %08u posted.\n", key.value, FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr)); + fprintf(stderr, "(tx) packet %016lx %08u posted.\n", key.value, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR_TYPE(replay, hfi1_type))); #endif #ifndef NDEBUG itr = fi_opx_rbt_find(state->tx_flow_rbtree, (void*)key.value); if (itr == NULL) { - fprintf(stderr, "(%d) %s:%s():%d [%016lX] [slid=%04hX tx=%08X dlid=%04hX rx=%0hhX] Error trying to register replay for flow with no handshake!\n", + fprintf(stderr, "(%d) %s:%s():%d [%016lX] [slid=%08X tx=%08X dlid=%08X rx=%0hhX] Error trying to register replay for flow with no handshake!\n", getpid(), __FILE__, __func__, __LINE__, key.value, - replay->scb.hdr.stl.lrh.slid, - FI_OPX_HFI1_PACKET_ORIGIN_TX(&replay->scb.hdr), - 
replay->scb.hdr.stl.lrh.dlid, - replay->scb.hdr.stl.bth.rx); + slid, + FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, hfi1_type)), + dlid, + OPX_REPLAY_HDR_TYPE(replay, hfi1_type)->bth.rx); assert(itr); } #endif @@ -642,7 +672,7 @@ void fi_reliability_service_process_command (struct fi_opx_reliability_client_st // Debugging tool that deliberately drops packets. static inline uint16_t fi_opx_reliability_rx_drop_packet (struct fi_opx_reliability_client_state * state, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { /* * Two variations of when to drop packets. The first drops a percentage of the @@ -651,12 +681,12 @@ uint16_t fi_opx_reliability_rx_drop_packet (struct fi_opx_reliability_client_sta * use either of these or code up something different depending on what you're * trying to debug. */ -#if 0 +#if 1 // drops a percentage of the packets based on drop_mask. const uint16_t tmp = state->drop_count & state->drop_mask; if (tmp == 0) - FI_WARN(fi_opx_global.prov,FI_LOG_EP_DATA, + FI_WARN(fi_opx_global.prov,FI_LOG_EP_DATA, "DEBUG: discarding packet %hu\n", state->drop_count); state->drop_count = tmp + 1; @@ -685,7 +715,8 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_init(struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, - const uint64_t opcode); + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_reliability_inject_ud_resynch(struct fid_ep *ep, const uint64_t key, @@ -701,9 +732,16 @@ size_t fi_opx_reliability_replay_get_payload_size(struct fi_opx_reliability_tx_r } /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - return total_bytes - sizeof(union fi_opx_hfi1_packet_hdr); + /* Inlined but called from non-inlined functions with no const hfi1 type, so just use the runtime check */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const uint16_t lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); + const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + return (total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); + } else { + const uint16_t lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; + const size_t total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + return (total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B)); + } } __OPX_FORCE_INLINE__ @@ -735,7 +773,7 @@ void fi_opx_reliability_create_rx_flow(struct fi_opx_reliability_client_state * __OPX_FORCE_INLINE__ void fi_opx_reliability_handle_ud_init(struct fid_ep *ep, struct fi_opx_reliability_client_state *state, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { union fi_opx_reliability_service_flow_key key = { .value = hdr->service.key @@ -754,12 +792,13 @@ void fi_opx_reliability_handle_ud_init(struct fid_ep *ep, #endif } - fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.slid, origin_rx, FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT_ACK); + fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.slid, origin_rx, FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT_ACK, + OPX_HFI1_TYPE); } __OPX_FORCE_INLINE__ void fi_opx_reliability_handle_ud_init_ack(struct fi_opx_reliability_client_state *state, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union 
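
fi_opx_reliability_replay_get_payload_size above turns on the two LRH length units: 9B pktlen counts 4-byte words and 16B pktlen counts 8-byte words, each including the header and the trailing ICRC. A compact restatement, with the header size passed in since the stl header structs are not reproduced here:

#include <stdint.h>

static uint64_t sketch_payload_bytes_9b(uint16_t pktlen_words, uint64_t hdr_bytes)
{
	return (uint64_t)(pktlen_words - 1) * 4 - hdr_bytes; /* -1 drops the icrc word */
}

static uint64_t sketch_payload_bytes_16b(uint16_t pktlen_qwords, uint64_t hdr_bytes)
{
	return (uint64_t)(pktlen_qwords - 1) * 8 - hdr_bytes; /* -1 drops the icrc qword */
}
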
opx_hfi1_packet_hdr *const hdr) { /* Find the flow for this communication in flow_rbtree */ union fi_opx_reliability_service_flow_key key = { @@ -838,31 +877,35 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * state, uint64_t slid, uint64_t origin_tx, uint32_t psn, - struct fid_ep *ep, const union fi_opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload); + struct fid_ep *ep, const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, + const uint16_t pktlen, const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, const uint64_t psn_start, const uint64_t psn_count, - const uint64_t opcode); + const uint64_t opcode, const enum opx_hfi1_type hfi1_type); void fi_opx_hfi1_rx_reliability_send_pre_acks(struct fid_ep *ep, const uint64_t dlid, const uint64_t reliability_rx, const uint64_t psn_start, const uint64_t psn_count, - const union fi_opx_hfi1_packet_hdr *const hdr, - const uint8_t origin_rx); + const union opx_hfi1_packet_hdr *const hdr, + const uint8_t origin_rx, + uint32_t slid, + const enum opx_hfi1_type hfi1_type); void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, uint32_t origin_reliability_rx, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr); void fi_opx_hfi1_rx_reliability_ack_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr); void opx_reliability_handshake_init(struct fid_ep *ep, union fi_opx_reliability_service_flow_key key, - const uint64_t target_reliability_rx); + const uint64_t target_reliability_rx, + const enum opx_hfi1_type hfi1_type); __OPX_FORCE_INLINE__ int32_t fi_opx_reliability_tx_max_outstanding () { @@ -904,7 +947,7 @@ bool opx_reliability_ready(struct fid_ep *ep, { /* Not using reliability, or it's Intranode */ - if (fi_opx_hfi_is_intranode(dlid)) + if (opx_lid_is_intranode(dlid)) return true; union fi_opx_reliability_service_flow_key key = { @@ -917,7 +960,7 @@ bool opx_reliability_ready(struct fid_ep *ep, void * itr = fi_opx_rbt_find(state->tx_flow_rbtree, (void*)key.value); if (OFI_UNLIKELY(!itr)) { /* Reliability handshake is incomplete, initiate it */ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, OPX_HFI1_TYPE); return false; } @@ -949,7 +992,7 @@ int32_t fi_opx_reliability_tx_available_psns (struct fid_ep *ep, /* We've never sent to this receiver, so initiate a reliability handshake with them. Once they create the receive flow on their end, and we receive their ack, we'll create the flow on our end and be able to send. */ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, OPX_HFI1_TYPE); OPX_TRACER_TRACE_SDMA(OPX_TRACER_END_EAGAIN_SDMA_PSNS, "GET_PSNS"); return -1; } @@ -1012,7 +1055,7 @@ int32_t fi_opx_reliability_tx_next_psn (struct fid_ep *ep, /* We've never sent to this receiver, so initiate a reliability handshake with them. Once they create the receive flow on their end, and we receive their ack, we'll create the flow on our end and be able to send. 
*/ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, OPX_HFI1_TYPE); return -1; } else { *psn_ptr = (union fi_opx_reliability_tx_psn *)fi_opx_rbt_value_ptr(state->tx_flow_rbtree, itr); @@ -1088,10 +1131,10 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, const uint64_t target_reliability_rx, union fi_opx_reliability_tx_psn **psn_ptr, struct fi_opx_reliability_tx_replay **replay, - const enum ofi_reliability_kind reliability - ) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { - + union fi_opx_reliability_service_flow_key key = { .slid = (uint32_t) state->lid_be, .tx = (uint32_t) state->tx, @@ -1104,10 +1147,10 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, /* We've never sent to this receiver, so initiate a reliability handshake with them. Once they create the receive flow on their end, and we receive their ack, we'll create the flow on our end and be able to send. */ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, hfi1_type); return -1; } - + *psn_ptr = (union fi_opx_reliability_tx_psn *)fi_opx_rbt_value_ptr(state->tx_flow_rbtree, itr); union fi_opx_reliability_tx_psn psn_value = **psn_ptr; @@ -1134,7 +1177,7 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, fi_opx_reliability_inc_throttle_maxo(ep); return -1; } - + *replay = fi_opx_reliability_client_replay_allocate(state, false); if (*replay == NULL) { return -1; } @@ -1159,13 +1202,34 @@ void fi_opx_reliability_client_replay_deallocate(struct fi_opx_reliability_clien static inline void fi_opx_reliability_client_replay_register_no_update (struct fi_opx_reliability_client_state * state, - const uint16_t dlid, const uint8_t rs, const uint8_t rx, union fi_opx_reliability_tx_psn *psn_ptr, + const uint8_t rs, const uint8_t rx, union fi_opx_reliability_tx_psn *psn_ptr, struct fi_opx_reliability_tx_replay * replay, - const enum ofi_reliability_kind reliability_kind) + const enum ofi_reliability_kind reliability_kind, + const enum opx_hfi1_type hfi1_type) { - const uint16_t lrh_pktlen_le = ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + uint32_t hdr_dlid; + uint8_t hdr_tx; + uint8_t hdr_rx; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + hdr_dlid = replay->scb.scb_9B.hdr.lrh_9B.dlid; + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)); + hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)->bth.rx; + } else { + lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + hdr_dlid = htons(replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb.scb_16B.hdr.lrh_16B.dlid); + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)); + hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)->bth.rx; + } psn_ptr->psn.bytes_outstanding += total_bytes; + replay->target_reliability_rx = rs; replay->psn_ptr = psn_ptr; @@ -1190,7 +1254,7 @@ void fi_opx_reliability_client_replay_register_no_update (struct fi_opx_reliabil
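fi_opx_reliability_client_replay_register_no_update() above (and its _with_update sibling below) now derive the replay's byte count and addressing fields from whichever link-layer header format the replay carries. The underlying arithmetic, pulled out as a standalone sketch (field names follow the structures in this patch; illustrative, not the provider's exact code):

    // 9B LRH (WFR, or JKR in 9B mode): pktlen is big-endian and counts 4-byte
    // words, covering header + payload + ICRC.
    const uint16_t pktlen_9b = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen);
    const size_t bytes_9b = (pktlen_9b - 1) * 4;   // drop the trailing 4-byte ICRC word

    // 16B LRH (JKR): pktlen is already little-endian and counts 8-byte words.
    const uint16_t pktlen_16b = replay->scb.scb_16B.hdr.lrh_16B.pktlen;
    const size_t bytes_16b = (pktlen_16b - 1) * 8; // drop the trailing ICRC/tail quadword

    // A 16B DLID is split across two fields (upper bits in dlid20); the code
    // above reassembles it before converting:
    const uint32_t dlid_16b = htons(replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 |
                                    replay->scb.scb_16B.hdr.lrh_16B.dlid);

The payload size then falls out as the total bytes minus sizeof the matching struct fi_opx_hfi1_stl_packet_hdr_9B or _16B, exactly as in fi_opx_reliability_replay_get_payload_size() above.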
fi_opx_atomic_fifo_produce(&state->fifo, (uint64_t)replay | TX_CMD); } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD || reliability_kind == OFI_RELIABILITY_KIND_RUNTIME) { /* constant compile-time expression */ - fi_reliability_service_process_command(state, replay); + fi_reliability_service_process_command(state, replay, state->lid_be, hdr_dlid, hdr_tx, hdr_rx, hfi1_type); } else { fprintf(stderr, "%s():%d abort\n", __func__, __LINE__); abort(); } @@ -1206,8 +1270,29 @@ void fi_opx_reliability_client_replay_register_with_update (struct fi_opx_reliab struct fi_opx_completion_counter * counter, uint64_t value, const enum ofi_reliability_kind reliability_kind) { - const uint16_t lrh_pktlen_le = ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + uint32_t hdr_dlid; + uint8_t hdr_tx; + uint8_t hdr_rx; + + /* global note: runtime HFI1 type - may need macro/inlining/const parameter hfi1_type to be branchless */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + hdr_dlid = replay->scb.scb_9B.hdr.lrh_9B.dlid; + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)); + hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)->bth.rx; + } else { + lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + hdr_dlid = htons(replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb.scb_16B.hdr.lrh_16B.dlid); + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)); + hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)->bth.rx; + } + psn_ptr->psn.bytes_outstanding += total_bytes; replay->target_reliability_rx = rs; replay->psn_ptr = psn_ptr; @@ -1238,8 +1323,7 @@ void fi_opx_reliability_client_replay_register_with_update (struct fi_opx_reliab fi_opx_atomic_fifo_produce(&state->fifo, (uint64_t)replay | TX_CMD); } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD || reliability_kind == OFI_RELIABILITY_KIND_RUNTIME) { /* constant compile-time expression */ - - fi_reliability_service_process_command(state, replay); + fi_reliability_service_process_command(state, replay, state->lid_be, hdr_dlid, hdr_tx, hdr_rx, OPX_HFI1_TYPE); } else { fprintf(stderr, "%s():%d abort\n", __func__, __LINE__); abort(); diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index 2a21e70ddc1..c088686fae9 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -43,7 +43,13 @@ extern "C" { #endif -int fi_opx_check_rma(struct fi_opx_ep *opx_ep); +__OPX_FORCE_INLINE__ +int fi_opx_check_rma(struct fi_opx_ep *opx_ep) +{ + return OFI_UNLIKELY(!opx_ep || + (opx_ep->state != FI_OPX_EP_INITITALIZED_ENABLED) || + (opx_ep->av->type == FI_AV_UNSPEC)) ? 
-FI_EINVAL : 0; +} void fi_opx_hit_zero(struct fi_opx_completion_counter *cc); @@ -57,7 +63,6 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const union fi_opx_addr opx_target_addr, const uint64_t *addr_offset, const uint64_t *key, - union fi_opx_context *opx_context, const uint64_t tx_op_flags, const struct fi_opx_cq *opx_cq, const struct fi_opx_cntr *opx_cntr, @@ -67,8 +72,10 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const uint32_t opcode, const int lock_required, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READV_INTERNAL"); union fi_opx_hfi1_deferred_work *work = (union fi_opx_hfi1_deferred_work *) ofi_buf_alloc(opx_ep->tx->work_pending_pool); @@ -86,13 +93,23 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, params->dest_rx = opx_target_addr.hfi1_rx; params->bth_rx = params->dest_rx << 56; params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(opx_target_addr.fi); - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); - params->pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - 16; /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ - params->lrh_dws = htons(params->pbc_dws - 1); + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 16; /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ + params->lrh_dws = htons(params->pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + } else { + params->pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 16 + /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ + 2; /* ICRC/tail */ + params->lrh_dws = (params->pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + } params->is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, opx_target_addr, caps); params->reliability = reliability; params->opcode = opcode; @@ -142,6 +159,7 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, int rc = params->work_elem.work_fn(work); if(rc == FI_SUCCESS) { OPX_BUF_FREE(work); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READV_INTERNAL"); return; } assert(rc == -FI_EAGAIN); @@ -149,6 +167,8 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, /* Try again later*/ assert(work->work_elem.slist_entry.next == NULL); slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READV_INTERNAL"); } __OPX_FORCE_INLINE__ @@ -157,14 +177,15 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const size_t niov, const union fi_opx_addr opx_dst_addr, uint64_t addr_offset, const uint64_t key, - union fi_opx_context *opx_context, struct fi_opx_completion_counter *cc, enum fi_datatype dt, enum fi_op op, const uint64_t tx_op_flags, const uint64_t is_hmem, const int lock_required, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITE_INTERNAL"); assert(niov == 1); // TODO, support something ... 
bigger assert(op == FI_NOOP || op < OFI_ATOMIC_OP_LAST); assert(dt == FI_VOID || dt < OFI_DATATYPE_LAST); @@ -179,7 +200,7 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, params->work_elem.complete = false; params->opx_ep = opx_ep; params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(opx_dst_addr.fi); - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); params->slid = opx_dst_addr.uid.lid; params->origin_rs = opx_dst_addr.reliability_rx; params->dt = dt == FI_VOID ? FI_VOID-1 : dt; @@ -209,28 +230,23 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, params->payload_bytes_for_iovec = 0; params->target_hfi_unit = opx_dst_addr.hfi1_unit; - /* Possible SHM connections required for certain applications (i.e., DAOS) - * exceeds the max value of the legacy u8_rx field. Use u32_extended field. - */ - ssize_t rc = fi_opx_shm_dynamic_tx_connect(params->is_intranode, opx_ep, params->u32_extended_rx, opx_dst_addr.hfi1_unit); - assert(rc == FI_SUCCESS); - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); - fi_opx_hfi1_dput_sdma_init(opx_ep, params, iov->len, 0, 0, NULL, is_hmem); FI_OPX_DEBUG_COUNTERS_INC_COND(is_hmem && params->is_intranode, opx_ep->debug_counters.hmem.rma_write_intranode); FI_OPX_DEBUG_COUNTERS_INC_COND(is_hmem && !params->is_intranode, opx_ep->debug_counters.hmem.rma_write_hfi); - rc = params->work_elem.work_fn(work); + ssize_t rc = params->work_elem.work_fn(work); if (rc == FI_SUCCESS) { assert(params->work_elem.complete); OPX_BUF_FREE(work); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE_INTERNAL"); return; } assert(rc == -FI_EAGAIN); if (params->work_elem.work_type == OPX_WORK_TYPE_LAST) { slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending_completion); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE_INTERNAL"); return; } @@ -252,45 +268,75 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, /* Try again later*/ assert(work->work_elem.slist_entry.next == NULL); slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE_INTERNAL"); } +__OPX_FORCE_INLINE__ +ssize_t opx_rma_get_context(struct fi_opx_ep *opx_ep, const void *user_context, + const void *cq, const uint64_t flags, + struct opx_context **context) +{ + if (!cq || !user_context) { + *context = NULL; + return FI_SUCCESS; + } + + struct opx_context *ctx = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(ctx == NULL)) { + *context = NULL; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + ctx->next = NULL; + ctx->flags = (uint64_t) flags; + ctx->err_entry.err = 0; + ctx->err_entry.op_context = (void *) user_context; + + *context = ctx; + return FI_SUCCESS; +} ssize_t fi_opx_inject_write_generic(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum 
ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_writev_generic(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability); + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_read_generic(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_readmsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability); + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); #ifdef __cplusplus } diff --git a/prov/opx/include/rdma/opx/fi_opx_rma_ops.h b/prov/opx/include/rdma/opx/fi_opx_rma_ops.h index fd0b118b241..3c0ec8f916f 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma_ops.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma_ops.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -38,61 +38,61 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_RMA_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_RMA_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) \ - static inline ssize_t fi_opx_writemsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ +#define FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + static inline ssize_t fi_opx_writemsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) \ { \ - return fi_opx_writemsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY); \ + return fi_opx_writemsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_writev_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_writev_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, \ fi_addr_t dest_addr, uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_writev_generic(ep, iov, desc, count, dest_addr, addr_offset, key, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dst_addr, \ uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_write_generic(ep, buf, len, desc, dst_addr, addr_offset, key, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_inject_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_inject_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dst_addr, \ uint64_t addr_offset, uint64_t key) \ { \ return fi_opx_inject_write_generic(ep, buf, len, dst_addr, addr_offset, key, LOCK, \ - AV, CAPS, RELIABILITY); \ + AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_readmsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_readmsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) \ { \ - return fi_opx_readmsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY); \ + return fi_opx_readmsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_readv_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_readv_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, \ fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_writev_generic(ep, iov, desc, count, src_addr, addr_offset, key, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_read_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t 
fi_opx_read_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, \ uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } -#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY +#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE /* diff --git a/prov/opx/include/rdma/opx/fi_opx_tagged.h b/prov/opx/include/rdma/opx/fi_opx_tagged.h index 11bdfb5391a..ccf378a050e 100644 --- a/prov/opx/include/rdma/opx/fi_opx_tagged.h +++ b/prov/opx/include/rdma/opx/fi_opx_tagged.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,12 +38,12 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_TAGGED_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY) \ - FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) +#define FI_OPX_TAGGED_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) -#define FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) \ +#define FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tsend_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_tsend_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, fi_addr_t dest_addr, \ uint64_t tag, void *context) \ @@ -56,48 +56,52 @@ 0, /* override_flags */ \ 0, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_trecv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_trecv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, void *buf, size_t len, \ void *desc, fi_addr_t src_addr, uint64_t tag, \ uint64_t ignore, void *context) \ { \ return fi_opx_recv_generic(ep, buf, len, desc, \ src_addr, tag, ignore, context, \ - LOCK, AV, FI_TAGGED, RELIABILITY); \ + LOCK, AV, FI_TAGGED, RELIABILITY, HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tinject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_tinject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ fi_addr_t dest_addr, uint64_t tag) \ - { \ + { \ return fi_opx_ep_tx_inject(ep, buf, len, \ dest_addr, tag, 0, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + 0, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ - 
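The extra indirection level that these FI_OPX_*_SPECIALIZED_FUNC macros go through exists because operands of the token-pasting operator ## are not macro-expanded; routing the arguments through a second macro forces HFI1_TYPE (and the other parameters) to expand before they are pasted into the function name. A minimal, hypothetical illustration of the same idiom (KIND, EXAMPLE and EXAMPLE_ are stand-ins, not names from this patch):

    #define KIND OPX_HFI1_WFR
    #define EXAMPLE(HFI1_TYPE)  EXAMPLE_(HFI1_TYPE)          // arguments expand here first...
    #define EXAMPLE_(HFI1_TYPE) fi_opx_tsend_ ## HFI1_TYPE   // ...then get pasted here

    // EXAMPLE(KIND)  -> fi_opx_tsend_OPX_HFI1_WFR
    // EXAMPLE_(KIND) -> fi_opx_tsend_KIND   (pasted before KIND can expand)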
__OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tsenddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + __OPX_FORCE_INLINE__ ssize_t \ + fi_opx_tsenddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, uint64_t data, fi_addr_t dest_addr, \ uint64_t tag, void *context) \ { \ return fi_opx_ep_tx_send(ep, buf, len, desc, \ dest_addr, tag, context, data, \ - LOCK, /* lock_required */ \ - AV, /* av_type */ \ - 1, /* is_contiguous */ \ - 0, /* override_flags */ \ - 0, /* flags */ \ + LOCK, /* lock_required */ \ + AV, /* av_type */ \ + 1, /* is_contiguous */ \ + 0, /* override_flags */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ - __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tinjectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + __OPX_FORCE_INLINE__ ssize_t \ + fi_opx_tinjectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ uint64_t data, fi_addr_t dest_addr, \ uint64_t tag) \ @@ -106,14 +110,16 @@ dest_addr, tag, data, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } -#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY +#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE #endif /* _FI_PROV_OPX_TAGGED_H_ */ diff --git a/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h b/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h index 2b57bc16115..c83eb5b75da 100644 --- a/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h +++ b/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h @@ -38,6 +38,7 @@ #include #include "fi_opx_hfi1.h" +#include "ofi_mem.h" /* Implementation PRE-CN5000 */ #ifdef OPX_PRE_CN5000 @@ -116,7 +117,7 @@ int opx_get_port(struct hfi1_user_info_dep *uinfo) #define OPX_HFI1_MMAP_MAGIC 0xdabbad00 -#define opx_offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +#define opx_offset_in_page(p) ((unsigned long)(p) & (page_sizes[OFI_PAGE_SIZE]-1)) #define OPX_HFI1_MMAP_TOKEN_SET(field, val) \ (((val) & OPX_HFI1_MMAP_##field##_MASK) << OPX_HFI1_MMAP_##field##_SHIFT) diff --git a/prov/opx/include/rdma/opx/opx_hfi1_sim.h b/prov/opx/include/rdma/opx/opx_hfi1_sim.h index e9be731cb52..0f2906b6ef9 100644 --- a/prov/opx/include/rdma/opx/opx_hfi1_sim.h +++ b/prov/opx/include/rdma/opx/opx_hfi1_sim.h @@ -49,15 +49,15 @@ void opx_sim_store(uint64_t offset, uint64_t *value, const char* func, const int line) { long ret, loffset = (long) offset; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "%s:%u FI_OPX_HFI1_BAR_STORE: offset %#16.16lX\n", func,line,offset); ret = lseek(fi_opx_global.hfi_local_info.sim_fd, offset, SEEK_SET); if (ret != loffset) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "%s:%u FI_OPX_HFI1_BAR_STORE: offset %#16.16lX\n", func,line,offset); perror("FI_OPX_HFI1_BAR_STORE: 
Unable to lseek BAR: "); sleep(5); abort(); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "FI_OPX_HFI1_BAR_STORE: value %#16.16lX\n", *value); + "%s:%u FI_OPX_HFI1_BAR_STORE: %#16.16lX value [%#16.16lX]\n", func,line,offset, *value); if (write(fi_opx_global.hfi_local_info.sim_fd, value, sizeof(*value)) < 0) { perror("FI_OPX_HFI1_BAR_STORE: Unable to write BAR: "); sleep(5); abort(); @@ -101,10 +101,7 @@ assert(unit < 2); const char* filename = sim_barfiles[unit]; - #if (!defined(OPX_WFR) && !defined(OPX_JKR)) - fprintf(stderr, "Simulator MUST be built with OPX_WFR or OPX_JKR\n"); - abort(); - #endif + if (getenv("HFI_FNAME")) { filename = getenv("HFI_FNAME"); } @@ -134,7 +131,7 @@ #define OPX_HFI1_INIT_PIO_SOP(context, input) ({ \ volatile uint64_t * __pio_sop; \ do { \ - if(OPX_HFI1_TYPE == OPX_HFI1_WFR) { \ + if(OPX_HFI1_TYPE & OPX_HFI1_WFR) { \ __pio_sop = (uint64_t *) \ (OPX_TXE_PIO_SEND + \ (context * (64*1024L)) + \ @@ -152,7 +149,7 @@ #define OPX_HFI1_INIT_PIO(context, input) ({ \ volatile uint64_t * __pio; \ do { \ - if(OPX_HFI1_TYPE == OPX_HFI1_WFR) { \ + if(OPX_HFI1_TYPE & OPX_HFI1_WFR) { \ __pio = (uint64_t *)(OPX_TXE_PIO_SEND + \ (context * (64*1024L))); \ } else { \ @@ -167,7 +164,7 @@ #define OPX_HFI1_INIT_UREGS(context, input) ({ \ volatile uint64_t * __uregs; \ do { \ - if(OPX_HFI1_TYPE == OPX_HFI1_WFR) { \ + if(OPX_HFI1_TYPE & OPX_HFI1_WFR) { \ __uregs = (uint64_t *)(OPX_WFR_RXE_PER_CONTEXT_OFFSET + \ ((context) * OPX_WFR_RXE_UCTX_STRIDE)); \ } else { \ diff --git a/prov/opx/include/rdma/opx/opx_tracer.h b/prov/opx/include/rdma/opx/opx_tracer.h index b07fa80a2e3..a3ce127c918 100644 --- a/prov/opx/include/rdma/opx/opx_tracer.h +++ b/prov/opx/include/rdma/opx/opx_tracer.h @@ -118,7 +118,7 @@ int opx_tracer_enabled() } __OPX_FORCE_INLINE__ -void opx_tracer_trace(enum opx_tracer_status status, +void opx_tracer_trace(enum opx_tracer_status status, const char *func, int line, const char *msg) { struct timespec ts; @@ -132,20 +132,20 @@ void opx_tracer_trace(enum opx_tracer_status status, timestamp, opx_tracer.pid, func, line, OPX_TRACER_STATUS_STR[status], msg); } -#if defined(OPX_TRACER) || defined(OPX_TRACER_SDMA) || defined(OPX_TRACER_RELI) +#if defined(OPX_TRACER) || defined(OPX_TRACER_SDMA) || defined(OPX_TRACER_RELI) || defined(OPX_TRACER_LOCK_IF_REQUIRED) #define OPX_TRACER_INIT() opx_tracer_init() -#define OPX_TRACER_TRACE(status, fmt, ...) \ - do { \ - if (opx_tracer_enabled()) { \ - int saved_errno = errno; \ - char msg[1024]; \ +#define OPX_TRACER_TRACE(status, fmt, ...) \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ - opx_tracer_trace(status, \ - __func__, __LINE__, msg); \ - errno = saved_errno; \ - } \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ } while (0) #define OPX_TRACER_EXIT() opx_tracer_exit() @@ -159,15 +159,15 @@ void opx_tracer_trace(enum opx_tracer_status status, #if defined(OPX_TRACER_SDMA) #define OPX_TRACER_TRACE_SDMA(status, fmt, ...) 
\ - do { \ - if (opx_tracer_enabled()) { \ - int saved_errno = errno; \ - char msg[1024]; \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ - opx_tracer_trace(status, \ - __func__, __LINE__, msg); \ - errno = saved_errno; \ - } \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ } while (0) #else @@ -177,19 +177,37 @@ void opx_tracer_trace(enum opx_tracer_status status, #if defined(OPX_TRACER_RELI) #define OPX_TRACER_TRACE_RELI(status, fmt, ...) \ - do { \ - if (opx_tracer_enabled()) { \ - int saved_errno = errno; \ - char msg[1024]; \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ - opx_tracer_trace(status, \ - __func__, __LINE__, msg); \ - errno = saved_errno; \ - } \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ } while (0) #else #define OPX_TRACER_TRACE_RELI(status, ...) #endif +#if defined(OPX_TRACER_LOCK_IF_REQUIRED) + +#define OPX_TRACER_TRACE_LOCK_IF_REQUIRED(status, fmt, ...) \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ + snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ + } while (0) + +#else +#define OPX_TRACER_TRACE_LOCK_IF_REQUIRED(status, ...) +#endif + #endif diff --git a/prov/opx/src/fi_opx_atomic.c b/prov/opx/src/fi_opx_atomic.c index a69512a34a9..32069d853e5 100644 --- a/prov/opx/src/fi_opx_atomic.c +++ b/prov/opx/src/fi_opx_atomic.c @@ -113,7 +113,6 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, const uint64_t key, const struct fi_opx_hmem_iov *fetch_iov, const struct fi_opx_hmem_iov *compare_iov, - union fi_opx_context *opx_context, const uint64_t tx_op_flags, const struct fi_opx_cq *opx_cq, const struct fi_opx_cntr *opx_cntr, @@ -122,7 +121,8 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, const int lock_required, const uint64_t caps, const enum ofi_reliability_kind reliability, const uint64_t is_hmem, - const uint64_t is_intranode) + const uint64_t is_intranode, + const enum opx_hfi1_type hfi1_type) { if (tx_op_flags & FI_INJECT) { assert((tx_op_flags & (FI_COMPLETION | FI_TRANSMIT_COMPLETE)) != @@ -145,7 +145,7 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, params->work_elem.complete = false; params->opx_ep = opx_ep; params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(opx_dst_addr.fi); - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); params->slid = opx_dst_addr.uid.lid; params->origin_rs = opx_dst_addr.reliability_rx; params->dt = dt == FI_VOID ? 
FI_VOID-1 : dt; @@ -193,7 +193,7 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, rma_request->hmem_device = fetch_iov->device; params->rma_request_vaddr = (uintptr_t) rma_request; - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); fi_opx_hfi1_dput_sdma_init(opx_ep, params, buf_iov->len, 0, 0, NULL, is_hmem); @@ -251,12 +251,13 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, const union fi_opx_addr opx_dst_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, - void *context, struct fi_opx_completion_counter *cc, + struct fi_opx_completion_counter *cc, const unsigned is_fetch, const void *fetch_vaddr, const unsigned is_compare, const void *compare_vaddr, const uint64_t tx_op_flags, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert((is_fetch == 0) || (is_fetch == 1)); assert((is_compare == 0) || (is_compare == 1)); @@ -272,10 +273,10 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, cc->cntr = opx_ep->read_cntr; fi_opx_readv_internal(opx_ep, &fetch_iov, 1, opx_dst_addr, &addr, &key, - (union fi_opx_context *)context, opx_ep->tx->op_flags, + opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, FI_OPX_HFI_DPUT_OPCODE_GET, - lock_required, caps, reliability); + lock_required, caps, reliability, hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC READ (end)\n"); return count; @@ -306,10 +307,10 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, fi_opx_atomic_op_internal(opx_ep, FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH, &buf_iov, opx_dst_addr, addr, key, &fetch_iov, - NULL, (union fi_opx_context *)context, - opx_ep->tx->op_flags, opx_ep->rx->cq, + NULL, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, - lock_required, caps, reliability, is_hmem, is_intranode); + lock_required, caps, reliability, is_hmem, + is_intranode, hfi1_type); } else { struct fi_opx_hmem_iov compare_iov; @@ -323,10 +324,10 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, fi_opx_atomic_op_internal(opx_ep, FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH, &buf_iov, opx_dst_addr, addr, key, &fetch_iov, - &compare_iov, (union fi_opx_context *)context, - opx_ep->tx->op_flags, opx_ep->rx->cq, + &compare_iov, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, - lock_required, caps, reliability, is_hmem, is_intranode); + lock_required, caps, reliability, is_hmem, + is_intranode, hfi1_type); } FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC %s (end)\n", @@ -345,10 +346,9 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, uint64_t is_hmem = fi_opx_hmem_iov_init(buf, buf_len, NULL, &buf_iov); fi_opx_write_internal(opx_ep, &buf_iov, 1, opx_dst_addr, addr, key, - (union fi_opx_context *)NULL, cc, - datatype, op, opx_ep->tx->op_flags, - is_hmem, lock_required, caps, - reliability); + cc, datatype, op, opx_ep->tx->op_flags, + is_hmem, lock_required, caps, reliability, + hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC WRITE (end)\n"); @@ -358,8 +358,9 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, ssize_t fi_opx_atomic_generic(struct fid_ep *ep, const 
void *buf, size_t count, fi_addr_t dst_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, - void *context, const int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability) + void *user_context, const int lock_required, const enum fi_av_type av_type, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; @@ -379,28 +380,36 @@ ssize_t fi_opx_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, opx_addr.hfi1_rx, opx_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_ATOMIC | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = sizeofdt(datatype) * count; cc->initial_byte_count = cc->byte_counter; - cc->cq = (((opx_ep->tx->op_flags & FI_COMPLETION) == FI_COMPLETION) || - ((opx_ep->tx->op_flags & FI_DELIVERY_COMPLETE) == FI_DELIVERY_COMPLETE)) ? - opx_ep->rx->cq : - NULL; + cc->cq = cq; cc->context = context; cc->hit_zero = fi_opx_hit_zero; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_WRITE; - size_t xfer __attribute__((unused)); xfer = fi_opx_atomic_internal(opx_ep, buf, count, opx_addr, addr, key, datatype, op, - context, cc, 0, NULL, 0, NULL, opx_ep->tx->op_flags, - lock_required, av_type, caps, reliability); + cc, 0, NULL, 0, NULL, opx_ep->tx->op_flags, + lock_required, av_type, caps, reliability, hfi1_type); assert(xfer == count); return 0; @@ -413,7 +422,8 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -442,11 +452,25 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_ATOMIC | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -455,11 +479,8 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, } cc->initial_byte_count = cc->byte_counter; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_WRITE; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; const size_t dtsize = sizeofdt(datatype); @@ -481,8 +502,8 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, const size_t count_transfered = fi_opx_atomic_internal(opx_ep, (void *)msg_iov_vaddr, count_requested, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, - op, NULL, cc, 0, NULL, 0, NULL, flags, lock_required, - av_type, caps, reliability); + op, cc, 0, NULL, 0, NULL, flags, lock_required, + av_type, caps, reliability, hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -518,7 +539,8 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -547,7 +569,7 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -564,7 +586,21 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, uint64_t rst_iov_dtcount = resultv[rst_iov_index].count; uintptr_t rst_iov_vaddr = (uintptr_t)resultv[rst_iov_index].addr; + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_ATOMIC| FI_READ, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = 0; ssize_t index = 0; @@ -572,12 +608,8 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, cc->byte_counter += sizeofdt(datatype) * msg->msg_iov[index].count; } cc->initial_byte_count = cc->byte_counter; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_READ; - - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; if (op != FI_ATOMIC_READ) { /* likely */ @@ -593,9 +625,10 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, const size_t count_transfered = fi_opx_atomic_internal(opx_ep, (void *)msg_iov_vaddr, count_requested, opx_dst_addr, rma_iov_addr, - rma_iov_key, datatype, op, NULL, cc, 1, + rma_iov_key, datatype, op, cc, 1, (const void *)rst_iov_vaddr, 0, NULL, flags, - lock_required, av_type, caps, reliability); + lock_required, av_type, caps, reliability, + hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -636,8 +669,8 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, while (rma_iov_dtcount != 0 && rst_iov_dtcount != 0) { const size_t count_transfered = fi_opx_atomic_internal( opx_ep, NULL, count_requested, opx_dst_addr, rma_iov_addr, - rma_iov_key, datatype, op, NULL, cc, 1, (const void *)rst_iov_vaddr, - 0, NULL, flags, lock_required, av_type, caps, reliability); + rma_iov_key, datatype, op, cc, 1, (const void *)rst_iov_vaddr, + 0, NULL, flags, lock_required, av_type, caps, reliability, hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -678,7 +711,8 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -707,7 +741,7 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -734,7 +768,21 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, uint64_t cmp_iov_dtcount = comparev[cmp_iov_index].count; uintptr_t cmp_iov_vaddr = (uintptr_t)comparev[cmp_iov_index].addr; + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_ATOMIC | FI_READ, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = 0; ssize_t index; @@ -742,11 +790,8 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, cc->byte_counter += sizeofdt(datatype)* msg->msg_iov[index].count; } cc->initial_byte_count = cc->byte_counter; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_READ; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; while (msg_iov_dtcount != 0 && rma_iov_dtcount != 0 && rst_iov_dtcount != 0 && @@ -757,9 +802,9 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, const size_t count_transfered = fi_opx_atomic_internal(opx_ep, (void *)msg_iov_vaddr, count_requested, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, - op, NULL, cc, 1, (const void *)rst_iov_vaddr, 1, + op, cc, 1, (const void *)rst_iov_vaddr, 1, (const void *)cmp_iov_vaddr, flags, lock_required, - av_type, caps, reliability); + av_type, caps, reliability, hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -813,9 +858,10 @@ __OPX_FORCE_INLINE__ ssize_t fi_opx_fetch_compare_atomic_generic( struct fid_ep *ep, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, - uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, int lock_required, + uint64_t key, enum fi_datatype datatype, enum fi_op op, void *user_context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; @@ -835,43 +881,54 @@ ssize_t fi_opx_fetch_compare_atomic_generic( opx_addr.hfi1_rx, opx_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_ATOMIC | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = sizeofdt(datatype) * count; cc->initial_byte_count = cc->byte_counter; - cc->cq = (((opx_ep->tx->op_flags & FI_COMPLETION) == FI_COMPLETION) || - ((opx_ep->tx->op_flags & FI_DELIVERY_COMPLETE) == FI_DELIVERY_COMPLETE)) ? 
- opx_ep->rx->cq : - NULL; + cc->cq = cq; cc->context = context; cc->hit_zero = fi_opx_hit_zero; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_WRITE; - size_t xfer __attribute__((unused)); xfer = fi_opx_atomic_internal(opx_ep, buf, count, opx_addr, addr, key, datatype, op, - context, cc, 1, result, compare!=NULL, compare, opx_ep->tx->op_flags, - lock_required, av_type, caps, reliability); + cc, 1, result, compare!=NULL, compare, opx_ep->tx->op_flags, + lock_required, av_type, caps, reliability, hfi1_type); assert(xfer == count); return 0; } + ssize_t fi_opx_fetch_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { return fi_opx_fetch_compare_atomic_generic(ep, buf, count, desc, NULL, NULL, result, result_desc, dest_addr, addr, key, datatype, op, context, lock_required, av_type, caps, - reliability); + reliability, + hfi1_type); } ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, void *desc, @@ -880,12 +937,13 @@ ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { return fi_opx_fetch_compare_atomic_generic(ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, key, datatype, op, context, lock_required, av_type, - caps, reliability); + caps, reliability, hfi1_type); } ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, @@ -893,7 +951,8 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t enum fi_datatype datatype, enum fi_op op, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -915,11 +974,16 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = sizeofdt(datatype) * count; cc->initial_byte_count = cc->byte_counter; @@ -935,9 +999,9 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t const uint64_t is_hmem = (const uint64_t) fi_opx_hmem_iov_init(buf, count * sizeofdt(datatype), NULL, &iov); - fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr, key, NULL, cc, + fi_opx_write_internal(opx_ep, &iov, 1, 
opx_dst_addr, addr, key, cc, datatype, op, opx_ep->tx->op_flags | FI_INJECT, - is_hmem, lock_required, caps, reliability); + is_hmem, lock_required, caps, reliability, hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC INJECT WRITE (end)\n"); @@ -960,10 +1024,27 @@ ssize_t fi_opx_atomic(struct fid_ep *ep, const void *buf, size_t count, void *de ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, - context, FI_OPX_LOCK_NOT_REQUIRED, - opx_ep->av_type, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -987,16 +1068,47 @@ ssize_t fi_opx_fetch_atomic(struct fid_ep *ep, const void *buf, size_t count, vo fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_fetch_atomic_generic( - ep, buf, count, desc, result, result_desc, dest_addr, addr, key, - datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } else { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, 
OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); + } } else { - rc = fi_opx_fetch_atomic_generic( - ep, buf, count, desc, result, result_desc, dest_addr, addr, key, - datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1022,16 +1134,47 @@ ssize_t fi_opx_compare_atomic(struct fid_ep *ep, const void *buf, size_t count, fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_compare_atomic_generic( - ep, buf, count, desc, compare, compare_desc, result, result_desc, - dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } else { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); + } } else { - rc = fi_opx_compare_atomic_generic( - ep, buf, count, desc, compare, compare_desc, result, result_desc, - dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1055,17 +1198,55 @@ ssize_t fi_opx_inject_atomic(struct fid_ep *ep, const void *buf, size_t count, f 
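fi_opx_inject_atomic() below gets the same treatment as the other non-inlined entry points in this file: a single runtime test of OPX_HFI1_TYPE fans out into calls whose hfi1_type argument is a compile-time constant, so the force-inlined internals can fold away the per-type branches. The shape of the pattern, reduced to a sketch (do_op is a hypothetical stand-in for the *_generic call, with its long argument list elided):

    if (OPX_HFI1_TYPE & OPX_HFI1_WFR) {
        rc = do_op(..., OPX_HFI1_WFR);      // WFR: 9B headers
    } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) {
        rc = do_op(..., OPX_HFI1_JKR_9B);   // JKR running 9B-compatible headers
    } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) {
        rc = do_op(..., OPX_HFI1_JKR);      // JKR: native 16B headers
    } else {
        rc = -FI_EPERM;                     // unknown HFI1 type
        FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n");
        abort();
    }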
fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, - datatype, op, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, - datatype, op, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1093,16 +1274,53 @@ ssize_t fi_opx_atomic_writemsg(struct fid_ep *ep, const struct fi_msg_atomic *ms fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, - 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if 
(OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, - 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1126,20 +1344,63 @@ ssize_t fi_opx_atomic_readwritemsg(struct fid_ep *ep, const struct fi_msg_atomic fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, - flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, - 0x0018000000000000ull, - OPX_RELIABILITY); + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, - flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, - 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EINVAL; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EINVAL\n"); + abort(); } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1162,18 +1423,59 @@ ssize_t fi_opx_atomic_compwritemsg(struct fid_ep *ep, const struct fi_msg_atomic fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, - resultv, result_count, flags, - FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if 
(OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, - resultv, result_count, flags, - FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1333,24 +1635,58 @@ int fi_opx_finalize_atomic_ops(struct fid_ep *ep) return 0; } -FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY) +FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR) +FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B) +FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR) ssize_t fi_opx_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY)( - ep, buf, count, desc, dest_addr, addr, key, datatype, op, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, count, desc, dest_addr, addr, key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, count, desc, dest_addr, addr, key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, count, 
desc, dest_addr, addr, key, datatype, op, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_inject_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, OPX_RELIABILITY)( - ep, buf, count, dest_addr, addr, key, datatype, op); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, count, dest_addr, addr, key, datatype, op); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, count, dest_addr, addr, key, datatype, op); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, count, dest_addr, addr, key, datatype, op); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_fetch_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, @@ -1358,11 +1694,31 @@ ssize_t fi_opx_fetch_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, si fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, - OPX_RELIABILITY)(ep, buf, count, desc, result, - result_desc, dest_addr, addr, - key, datatype, op, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, buf, count, desc, result, + result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, buf, count, desc, result, + result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, buf, count, desc, result, + result_desc, dest_addr, addr, + key, datatype, op, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_compare_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, @@ -1371,8 +1727,26 @@ ssize_t fi_opx_compare_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, OPX_RELIABILITY)( - ep, buf, count, desc, compare, 
compare_desc, result, result_desc, dest_addr, addr, - key, datatype, op, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, + key, datatype, op, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/fi_opx_cntr.c b/prov/opx/src/fi_opx_cntr.c index 04bde0cb991..fa628fc6209 100644 --- a/prov/opx/src/fi_opx_cntr.c +++ b/prov/opx/src/fi_opx_cntr.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -79,12 +79,12 @@ static uint64_t fi_opx_cntr_read(struct fid_cntr *cntr) if (OFI_UNLIKELY(opx_cntr->lock_required)) { for (i=0; iprogress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); fi_opx_unlock(&opx_cntr->progress.ep[i]->lock); } } else { for (i=0; iprogress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); } } } @@ -153,14 +153,16 @@ fi_opx_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int timeout) fi_opx_lock(&opx_cntr->progress.ep[i]->lock); fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, - FI_OPX_HDRQ_MASK_RUNTIME); + FI_OPX_HDRQ_MASK_RUNTIME, + OPX_HFI1_TYPE); fi_opx_unlock(&opx_cntr->progress.ep[i]->lock); } } else { for (i=0; iprogress.ep[i]->ep_fid, 0, OPX_RELIABILITY, - FI_OPX_HDRQ_MASK_RUNTIME); + FI_OPX_HDRQ_MASK_RUNTIME, + OPX_HFI1_TYPE); } } } diff --git a/prov/opx/src/fi_opx_cq.c b/prov/opx/src/fi_opx_cq.c index 364b3b64d48..7b831d2ffd9 100644 --- a/prov/opx/src/fi_opx_cq.c +++ b/prov/opx/src/fi_opx_cq.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -47,18 +47,19 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { char *s = str; size_t len = 2047; int n = 0; - union fi_opx_context * context = NULL;; + struct opx_context *context = NULL;; struct fi_opx_cq *opx_cq = (struct fi_opx_cq *)cq; if (!func) func = "undef"; - n = snprintf(s, len, "%s():%d [%p] completed(%p,%p)", func, line, opx_cq, opx_cq->completed.head, opx_cq->completed.tail); + n = snprintf(s, len, "%s():%d [%p] completed(%p,%p)", func, line, + opx_cq, opx_cq->completed.head, opx_cq->completed.tail); s += n; len -= n; if (opx_cq->completed.head != NULL) { - context = opx_cq->completed.head; + context = (struct opx_context *) opx_cq->completed.head; n = snprintf(s, len, " = { %p", context); s += n; len -= n; context = context->next; @@ -73,7 +74,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { n = 0; len = 2047; s = str; *s = 0; n = snprintf(s, len, "%s():%d [%p] pending(%p,%p)", func, line, opx_cq, opx_cq->pending.head, opx_cq->pending.tail); s += n; len -= n; if (opx_cq->pending.head != NULL) { - context = opx_cq->pending.head; + context = (struct opx_context *) opx_cq->pending.head; n = snprintf(s, len, " = { %p(%lu,0x%016lx)", context, context->byte_counter, context->byte_counter); s += n; len -= n; context = context->next; @@ -89,7 +90,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { n = 0; len = 2047; s = str; *s = 0; n = snprintf(s, len, "%s():%d [%p] err(%p,%p)", func, line, opx_cq, opx_cq->err.head, opx_cq->err.tail); s += n; len -= n; if (opx_cq->err.head != NULL) { - context = opx_cq->err.head; + context = (struct opx_context *) opx_cq->err.head; n = snprintf(s, len, " = { %p(%lu)", context, context->byte_counter); s += n; len -= n; context = context->next; @@ -142,7 +143,7 @@ static int fi_opx_close_cq(fid_t fid) free(opx_cq); opx_cq = NULL; - //opx_cq (the object passed in as fid) is now unusable + //opx_cq (the object passed in as fid) is now unusable FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_CQ, "cq closed\n"); return 0; @@ -185,15 +186,13 @@ static struct fi_ops fi_opx_fi_ops = { .ops_open = fi_opx_ops_open_cq }; -int fi_opx_cq_enqueue_err (struct fi_opx_cq * opx_cq, - struct fi_opx_context_ext * ext, +int fi_opx_cq_enqueue_err (struct fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required) { - assert(ext->opx_context.flags & FI_OPX_CQ_CONTEXT_EXT); /* DEBUG */ assert(!lock_required); - ext->opx_context.next = NULL; - fi_opx_context_slist_insert_tail((union fi_opx_context *)ext, &opx_cq->err); + slist_insert_tail((struct slist_entry *) context, &opx_cq->err); return 0; } @@ -203,7 +202,8 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t rcvhdrcnt, const uint64_t caps, - const enum fi_progress progress) + const enum fi_progress progress, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_CQ, "(called)\n"); @@ -222,19 +222,51 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, abort(); } - const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); - - switch(rcvhdrcnt) { - case 2048: - return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps) : - fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps); - case 8192: - return lock_required ? 
fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps) : - fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps); - default: - FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); - return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps) : - fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps); + const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); + + if (hfi1_type & OPX_HFI1_WFR) { + + switch(rcvhdrcnt) { + case 2048: + return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 0) : + fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 0); + case 8192: + return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 0) : + fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 0); + default: + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 0) : + fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 0); + } + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + switch(rcvhdrcnt) { + case 2048: + return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 1) : + fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 1); + case 8192: + return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 1) : + fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 1); + default: + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 1) : + fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 1); + } + } else if (hfi1_type & OPX_HFI1_JKR) { + switch(rcvhdrcnt) { + case 2048: + return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 2) : + fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 2); + case 8192: + return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 2) : + fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 2); + default: + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 2) : + fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 2); + } + } else { + FI_WARN(fi_opx_global.prov, FI_LOG_CQ, "Invalid HFI type %d\n", hfi1_type); + return NULL; } } @@ -274,9 +306,9 @@ int fi_opx_cq_open(struct fid_domain *dom, opx_cq->format = attr->format ? 
attr->format : FI_CQ_FORMAT_CONTEXT; - fi_opx_context_slist_init(&opx_cq->pending); - fi_opx_context_slist_init(&opx_cq->completed); - fi_opx_context_slist_init(&opx_cq->err); + slist_init(&opx_cq->pending); + slist_init(&opx_cq->completed); + slist_init(&opx_cq->err); opx_cq->ep_bind_count = 0; opx_cq->progress.ep_count = 0; @@ -370,7 +402,8 @@ void fi_opx_cq_finalize_ops(struct fid_ep *ep) fi_opx_select_reliability(opx_ep), opx_ep->hfi->info.rxe.hdrq.elemcnt, opx_cq->ep_comm_caps, - opx_cq->domain->data_progress); + opx_cq->domain->data_progress, + OPX_HFI1_TYPE); } if (opx_ep->tx->cq && (opx_ep->tx->cq != opx_ep->rx->cq)) { @@ -381,7 +414,8 @@ void fi_opx_cq_finalize_ops(struct fid_ep *ep) fi_opx_select_reliability(opx_ep), opx_ep->hfi->info.rxe.hdrq.elemcnt, opx_cq->ep_comm_caps, - opx_cq->domain->data_progress); + opx_cq->domain->data_progress, + OPX_HFI1_TYPE); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_CQ, "(end)\n"); diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking.c b/prov/opx/src/fi_opx_cq_ops_table_locking.c index 4efedeff078..ffadda5d946 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -32,301 +32,202 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" - /* HDRQ_MASK = 2k value (2047 * 0x20) */ -/* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -static struct fi_ops_cq fi_opx_cq_locking_2048_ops_table[] = { - - // Format: FI_CQ_FORMAT_UNSPEC - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - // Format: FI_CQ_FORMAT_CONTEXT - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, 
FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - // Format: FI_CQ_FORMAT_MSG - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) +/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- 
OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) - //Format: FI_CQ_FORMAT_DATA +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), +/* JKR 16B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, 
OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), +static struct fi_ops_cq fi_opx_cq_locking_2048_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), 
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, 
FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), - // Format: FI_CQ_FORMAT_TAGGED + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_locking_2048_ops = (op_matrix_t *)&fi_opx_cq_locking_2048_ops_table; struct fi_ops_cq * fi_opx_cq_select_locking_2048_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_locking_2048_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_locking_2048_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c b/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c index 88b94f433a0..41cf35f3aba 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -32,257 +32,202 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" - /* HDRQ_MASK = 8k value (8191 * 0x20) */ -/* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, 
OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - - -static struct fi_ops_cq fi_opx_cq_locking_8192_ops_table[] = { - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) +/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), +/* CAPS = 
FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) +/* JKR 16B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), +static struct fi_ops_cq fi_opx_cq_locking_8192_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_locking_8192_ops = (op_matrix_t *)&fi_opx_cq_locking_8192_ops_table; struct fi_ops_cq * 
fi_opx_cq_select_locking_8192_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_locking_8192_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_locking_8192_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c b/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c index 94bcd635c3f..7377ef549d0 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,235 +33,201 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" /* HDRQ_MASK = runtime value (not 2047 or 8191, won't be optimal) */ + +/* WFR 9B headers */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + +/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) +/* JKR 16B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- 
OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) +static struct fi_ops_cq fi_opx_cq_locking_runtime_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), -/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), -/* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), -static struct fi_ops_cq fi_opx_cq_locking_runtime_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_locking_runtime_ops = (op_matrix_t *)&fi_opx_cq_locking_runtime_ops_table; struct fi_ops_cq * fi_opx_cq_select_locking_runtime_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_locking_runtime_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_locking_runtime_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking.c index 6fcd46a0057..5caaaefe9a9 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,302 +33,201 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" /* HDRQ_MASK = 2k value (2047 * 0x20) */ -/* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - - - -static struct fi_ops_cq fi_opx_cq_non_locking_2048_ops_table[] = { - - // 
Format: FI_CQ_FORMAT_UNSPEC
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
- // Format: FI_CQ_FORMAT_CONTEXT
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
- // Format: FI_CQ_FORMAT_MSG
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
-
-
-
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+/* JKR 9B */
+/* CAPS = FI_OPX_COMMS_NONE (runtime) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+/* CAPS = FI_OPX_COMMS_LOCAL (only local) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
- //Format: FI_CQ_FORMAT_DATA
+/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+/* JKR 16B */
+/* CAPS = FI_OPX_COMMS_NONE (runtime) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+/* CAPS = FI_OPX_COMMS_LOCAL (only local) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+static struct fi_ops_cq fi_opx_cq_non_locking_2048_ops_table[] = {
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
- // Format: FI_CQ_FORMAT_TAGGED
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE),
 };
-
 static op_matrix_t *fi_opx_cq_non_locking_2048_ops = (op_matrix_t *)&fi_opx_cq_non_locking_2048_ops_table;
 struct fi_ops_cq * fi_opx_cq_select_non_locking_2048_ops(const enum fi_cq_format format,
- const enum ofi_reliability_kind reliability,
- const uint64_t comm_caps)
+ const enum ofi_reliability_kind reliability,
+ const uint64_t comm_caps,
+ const uint32_t hfi1_type)
 {
- return &(*fi_opx_cq_non_locking_2048_ops)[format][reliability][comm_caps];
+ return &(*fi_opx_cq_non_locking_2048_ops)[format][0][comm_caps][hfi1_type];
 }
diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c
index 8b04675c2c7..ee4e94483ff 100644
--- a/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c
+++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022 by Cornelis Networks.
+ * Copyright (C) 2022-2024 by Cornelis Networks.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
@@ -33,234 +33,201 @@
 #include "rdma/opx/fi_opx_cq_ops_table.h"
 /* HDRQ_MASK = 8k value (8191 * 0x20) */
+
+/* WFR 9B headers */
 /* CAPS = FI_OPX_COMMS_NONE (runtime) */
-/* ---- OFI_RELIABILITY_KIND_NONE */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
-
-/* ----- OFI_RELIABILITY_KIND_OFFLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR)
+/* CAPS = FI_OPX_COMMS_LOCAL (only local) */
 /* ----- OFI_RELIABILITY_KIND_ONLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR)
-/* ---- OFI_RELIABILITY_KIND_RUNTIME */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE)
+/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR)
+/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR)
+
+/* JKR 9B */
+/* CAPS = FI_OPX_COMMS_NONE (runtime) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B)
 /* CAPS = FI_OPX_COMMS_LOCAL (only local) */
-/* ---- OFI_RELIABILITY_KIND_NONE */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
-
-/* ----- OFI_RELIABILITY_KIND_OFFLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B)
+/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */
 /* ----- OFI_RELIABILITY_KIND_ONLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B)
-/* ---- OFI_RELIABILITY_KIND_RUNTIME */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL)
+/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B)
+/* JKR 16B */
+/* CAPS = FI_OPX_COMMS_NONE (runtime) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR)
+
+/* CAPS = FI_OPX_COMMS_LOCAL (only local) */
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR)
 /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */
-/* ---- OFI_RELIABILITY_KIND_NONE */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
-
-/* ----- OFI_RELIABILITY_KIND_OFFLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
+/* ----- OFI_RELIABILITY_KIND_ONLOAD */
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR)
+/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */
 /* ----- OFI_RELIABILITY_KIND_ONLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+ FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR)
+
-/* ---- OFI_RELIABILITY_KIND_RUNTIME */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE)
+static struct fi_ops_cq fi_opx_cq_non_locking_8192_ops_table[] = {
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
-/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */
-/* ---- OFI_RELIABILITY_KIND_NONE */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
-
-/* ----- OFI_RELIABILITY_KIND_OFFLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
-/* ----- OFI_RELIABILITY_KIND_ONLOAD */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
-/* ---- OFI_RELIABILITY_KIND_RUNTIME */
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE)
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
+
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
+
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
+
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR),
+
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B),
+ FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR),
-static struct fi_ops_cq fi_opx_cq_non_locking_8192_ops_table[] = {
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE),
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE),
-
- FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD,
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_non_locking_8192_ops = (op_matrix_t *)&fi_opx_cq_non_locking_8192_ops_table; struct fi_ops_cq * fi_opx_cq_select_non_locking_8192_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_non_locking_8192_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_non_locking_8192_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c index 681ee867953..8efa55b2e3b 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -32,262 +32,281 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" - /* HDRQ_MASK = runtime value (not 2047 or 8191, won't be optimal) */ + +/* WFR 9B headers */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - 
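(Aside: each FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING invocation above and below expands to a full, compile-time-specialized completion-poll function; this patch adds the HFI1 hardware type as one more specialization parameter. A minimal sketch of the pattern, using hypothetical names rather than the provider's actual macros:

/* specialization sketch -- hypothetical names, illustration only */
#include <stddef.h>
#include <sys/types.h>

enum hfi_type  { HFI_WFR, HFI_JKR_9B, HFI_JKR };
enum cq_format { FMT_CONTEXT, FMT_TAGGED };

/* Generic worker; the real provider inlines a large poll loop here. */
static inline ssize_t poll_completions(void *cq, void *buf, size_t count,
				       enum cq_format fmt, enum hfi_type hfi)
{
	(void)cq; (void)buf;
	/* fmt and hfi arrive as literal constants from each expansion,
	 * so any branch on them is folded away at compile time. */
	return (fmt == FMT_TAGGED && hfi == HFI_WFR) ? (ssize_t)count : 0;
}

/* Token-paste the template parameters into a unique function name. */
#define CQ_FUNC_NAME(op, fmt, hfi) opx_##op##_##fmt##_##hfi
#define CQ_SPECIALIZED_FUNC(fmt, hfi)                                 \
static ssize_t __attribute__((unused))                                \
CQ_FUNC_NAME(cq_read, fmt, hfi)(void *cq, void *buf, size_t count)    \
{                                                                     \
	return poll_completions(cq, buf, count, fmt, hfi);            \
}

/* One expansion per (format, hardware) combination, as in the tables. */
CQ_SPECIALIZED_FUNC(FMT_TAGGED, HFI_WFR)
CQ_SPECIALIZED_FUNC(FMT_TAGGED, HFI_JKR_9B)
CQ_SPECIALIZED_FUNC(FMT_TAGGED, HFI_JKR)

The payoff is that format and hardware checks are resolved by the compiler inside each specialization, keeping the poll path branch-free; the cost is the combinatorial set of expansions seen throughout this patch.)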
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) +/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) +/* JKR 16B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + 
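(The entry order in these tables matters: as the changed fi_opx_cq_select_non_locking_8192_ops earlier in the patch shows, the flat table is cast to op_matrix_t and indexed as a multi-dimensional array, with the reliability dimension pinned to a single ONLOAD slot. A small self-contained model of that lookup, with illustrative dimension sizes rather than the provider's real op_matrix_t:

/* matrix sketch -- illustrative sizes; OPX's op_matrix_t differs */
#include <stdio.h>

#define N_FORMATS     5	/* UNSPEC, CONTEXT, MSG, DATA, TAGGED */
#define N_RELIABILITY 1	/* only ONLOAD survives this patch */
#define N_COMM_CAPS   4	/* NONE, LOCAL, REMOTE, LOCAL_REMOTE */
#define N_HFI_TYPES   3	/* WFR, JKR_9B, JKR (16B) */

struct cq_ops { int id; };	/* stand-in for struct fi_ops_cq */

typedef struct cq_ops op_matrix_t[N_FORMATS][N_RELIABILITY]
				 [N_COMM_CAPS][N_HFI_TYPES];

/* Flat table in row-major order of the typedef: format outermost,
 * HFI1 type innermost -- hence the WFR / JKR_9B / JKR triplets. */
static struct cq_ops table[N_FORMATS * N_RELIABILITY *
			   N_COMM_CAPS * N_HFI_TYPES];

static op_matrix_t *matrix = (op_matrix_t *)&table;

static struct cq_ops *select_ops(int format, int comm_caps, int hfi1_type)
{
	/* reliability index pinned to 0, as in the patched selectors */
	return &(*matrix)[format][0][comm_caps][hfi1_type];
}

int main(void)
{
	struct cq_ops *ops = select_ops(4 /* TAGGED */, 3, 2 /* JKR */);
	printf("entry %td of %d\n", ops - table,
	       N_FORMATS * N_COMM_CAPS * N_HFI_TYPES);
	return 0;
}

Row-major layout is why each comm_caps group lists WFR, JKR_9B, and JKR consecutively: the HFI1 type is the innermost index.)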
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) +static struct fi_ops_cq fi_opx_cq_non_locking_runtime_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), -/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), -/* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), -static struct fi_ops_cq fi_opx_cq_non_locking_runtime_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, 
OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + 
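(The non-inlined FABRIC_DIRECT wrappers later in this patch choose among these specializations at run time by testing a global HFI1-type bitmask, as sketched below with hypothetical globals and stub functions:

/* runtime dispatch sketch -- hypothetical globals and stubs */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#define HFI_WFR    (1u << 0)	/* first-generation adapter, 9B headers */
#define HFI_JKR_9B (1u << 1)	/* newer adapter running 9B headers */
#define HFI_JKR    (1u << 2)	/* newer adapter running 16B headers */

static uint32_t global_hfi_type;	/* set once during initialization */

static ssize_t cq_read_wfr(void *cq, void *buf, size_t n)
{ (void)cq; (void)buf; (void)n; return 0; }
static ssize_t cq_read_jkr_9b(void *cq, void *buf, size_t n)
{ (void)cq; (void)buf; (void)n; return 0; }
static ssize_t cq_read_jkr(void *cq, void *buf, size_t n)
{ (void)cq; (void)buf; (void)n; return 0; }

/* Non-inlined entry point: one predictable branch per call, so no
 * cached function pointer is needed. */
static ssize_t cq_read(void *cq, void *buf, size_t n)
{
	if (global_hfi_type & HFI_WFR)
		return cq_read_wfr(cq, buf, n);
	else if (global_hfi_type & HFI_JKR_9B)
		return cq_read_jkr_9b(cq, buf, n);
	else if (global_hfi_type & HFI_JKR)
		return cq_read_jkr(cq, buf, n);
	/* should never get here */
	fprintf(stderr, "unknown HFI1 type 0x%x\n", global_hfi_type);
	abort();
}

int main(void)
{
	global_hfi_type = HFI_JKR_9B;
	return (int)cq_read(NULL, NULL, 0);
}

Because these entry points are not themselves performance-specialized, one branch per call is acceptable, which matches the patch's own comment that non-inlined functions "should just use the runtime HFI1 type check".)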
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), }; ssize_t fi_opx_cq_read_FABRIC_DIRECT(struct fid_cq *cq, void *buf, size_t count) { - return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, - OPX_CQ_FORMAT, - OPX_LOCK, - OPX_RELIABILITY, - OPX_MASK, - OPX_CQ_CAPS) + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_WFR) + (cq, buf, count); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR_9B) (cq, buf, count); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR) + (cq, buf, count); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_cq_readfrom_FABRIC_DIRECT(struct fid_cq *cq, void *buf, size_t count, fi_addr_t *src_addr) { - return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, - OPX_CQ_FORMAT, - OPX_LOCK, - OPX_RELIABILITY, - OPX_MASK, - OPX_CQ_CAPS) + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, + 
OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_WFR) + (cq, buf, count, src_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR_9B) + (cq, buf, count, src_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR) (cq, buf, count, src_addr); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } static op_matrix_t *fi_opx_cq_non_locking_runtime_ops = (op_matrix_t *)&fi_opx_cq_non_locking_runtime_ops_table; struct fi_ops_cq * fi_opx_cq_select_non_locking_runtime_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_non_locking_runtime_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_non_locking_runtime_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_domain.c b/prov/opx/src/fi_opx_domain.c index 6a0f7cbd291..cffe97b40b6 100644 --- a/prov/opx/src/fi_opx_domain.c +++ b/prov/opx/src/fi_opx_domain.c @@ -492,7 +492,7 @@ int fi_opx_domain(struct fid_fabric *fabric, strncpy(opx_domain->unique_job_key_str, env_var_uuid, OPX_JOB_KEY_STR_SIZE-1); opx_domain->unique_job_key_str[OPX_JOB_KEY_STR_SIZE-1] = '\0'; - sscanf(opx_domain->unique_job_key_str, "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx", + int elements_read = sscanf(opx_domain->unique_job_key_str, "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx", &opx_domain->unique_job_key[0], &opx_domain->unique_job_key[1], &opx_domain->unique_job_key[2], @@ -509,6 +509,11 @@ int fi_opx_domain(struct fid_fabric *fabric, &opx_domain->unique_job_key[13], &opx_domain->unique_job_key[14], &opx_domain->unique_job_key[15]); + if (elements_read == EOF) { + FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, "Error: sscanf encountered an input failure (EOF), unable to parse the unique job key string.\n"); + errno = FI_EINVAL; + goto err; + } FI_INFO(fi_opx_global.prov, FI_LOG_DOMAIN, "Domain unique job key set to %s\n", opx_domain->unique_job_key_str); //TODO: Print out a summary of all domain settings with FI_INFO diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 08c5895ac60..eebd26186a5 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -57,8 +57,8 @@ #include "rdma/opx/fi_opx_fabric.h" #define FI_OPX_EP_RX_UEPKT_BLOCKSIZE (256) -#define FI_OPX_EP_RX_CTX_EXT_BLOCKSIZE (2048) #define FI_OPX_VER_CHECK_BUF_LEN (512) +#define OPX_EP_RX_CTX_BLOCKSIZE (2048) #define OPX_MODINFO_PATH "/sbin/modinfo" #define OPX_MODINFO_DRV_VERS OPX_MODINFO_PATH " hfi1 -F version" #define OPX_MODINFO_SRC_VERS OPX_MODINFO_PATH " hfi1 -F srcversion" @@ -212,65 +212,69 @@ static struct fi_ops_ep fi_opx_stx_ep_ops = { void fi_opx_ep_tx_model_init (struct fi_opx_hfi1_context * hfi, const uint8_t reliability_rx, - struct fi_opx_hfi1_txe_scb * inject, - struct fi_opx_hfi1_txe_scb * send, - struct fi_opx_hfi1_txe_scb * rendezvous) { + struct fi_opx_hfi1_txe_scb_9B * inject_9B, + struct fi_opx_hfi1_txe_scb_9B * send_9B, + struct 
fi_opx_hfi1_txe_scb_9B * rendezvous_9B) { /* * fi_send*() model - eager */ - + /* Setup the 9B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = (OPX_HFI1_TYPE & OPX_HFI1_WFR) ? OPX_HFI1_WFR : OPX_HFI1_JKR_9B; /* PBC data */ - memset(send, 0, sizeof(*send)); - memset(inject, 0, sizeof(*inject)); - memset(rendezvous, 0, sizeof(*rendezvous)); - send->qw0 = OPX_PBC_LEN(0) /* length_dws */ | - OPX_PBC_VL(hfi->vl) | - OPX_PBC_SC(hfi->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0) | - OPX_PBC_PORTIDX(hfi->hfi_port) | - OPX_PBC_SCTXT(hfi->send_ctxt); + memset(send_9B, 0, sizeof(*send_9B)); + memset(inject_9B, 0, sizeof(*inject_9B)); + memset(rendezvous_9B, 0, sizeof(*rendezvous_9B)); + send_9B->qw0 = OPX_PBC_LEN(0,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type) | + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type); /* LRH header */ - send->hdr.stl.lrh.flags = + send_9B->hdr.lrh_9B.flags = htons(FI_OPX_HFI1_LRH_BTH | ((hfi->sl & FI_OPX_HFI1_LRH_SL_MASK) << FI_OPX_HFI1_LRH_SL_SHIFT) | ((hfi->sc & FI_OPX_HFI1_LRH_SC_MASK) << FI_OPX_HFI1_LRH_SC_SHIFT)); - send->hdr.stl.lrh.dlid = 0; /* set at runtime */ - send->hdr.stl.lrh.pktlen = 0; /* set at runtime */ - send->hdr.stl.lrh.slid = htons(hfi->lid); + send_9B->hdr.lrh_9B.dlid = 0; /* set at runtime */ + send_9B->hdr.lrh_9B.pktlen = 0; /* set at runtime */ + send_9B->hdr.lrh_9B.slid = htons((uint16_t)hfi->lid); /* BTH header */ - send->hdr.stl.bth.opcode = 0; - send->hdr.stl.bth.bth_1 = 0; - send->hdr.stl.bth.pkey = htons(hfi->pkey); - send->hdr.stl.bth.ecn = (uint8_t)(OPX_BTH_RC2(OPX_BTH_RC2_VAL) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT)); - send->hdr.stl.bth.qp = hfi->bthqp; - send->hdr.stl.bth.unused = 0; - send->hdr.stl.bth.rx = 0; /* set at runtime */ + send_9B->hdr.bth.opcode = 0; + send_9B->hdr.bth.bth_1 = 0; + send_9B->hdr.bth.pkey = htons(hfi->pkey); + send_9B->hdr.bth.ecn = (uint8_t)(OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)),hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT,hfi1_type)); + send_9B->hdr.bth.qp = hfi->bthqp; + send_9B->hdr.bth.unused = 0; + send_9B->hdr.bth.rx = 0; /* set at runtime */ - send->hdr.reliability.psn = 0; - send->hdr.reliability.origin_tx = hfi->send_ctxt; + send_9B->hdr.reliability.psn = 0; + send_9B->hdr.reliability.origin_tx = hfi->send_ctxt; /* KDETH header */ - send->hdr.stl.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ - send->hdr.stl.kdeth.jkey = hfi->jkey; - send->hdr.stl.kdeth.hcrc = 0; - send->hdr.stl.kdeth.unused = 0; + send_9B->hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + send_9B->hdr.kdeth.jkey = hfi->jkey; + send_9B->hdr.kdeth.hcrc = 0; + send_9B->hdr.kdeth.unused = 0; /* OFI header */ - send->hdr.match.ofi_data = 0; /* set at runtime */ - send->hdr.match.ofi_tag = 0; /* set at runtime */ + send_9B->hdr.match.ofi_data = 0; /* set at runtime */ + send_9B->hdr.match.ofi_tag = 0; /* set at runtime */ /* * fi_send*() model - rendezvous */ - *rendezvous = *send; - rendezvous->hdr.rendezvous.origin_rs = reliability_rx; + *rendezvous_9B = *send_9B; + rendezvous_9B->hdr.rendezvous.origin_rs = reliability_rx; + + /* clone from send model, then adjust */ + *inject_9B = *send_9B; /* * fi_inject() model @@ -281,24 +285,127 @@ void fi_opx_ep_tx_model_init (struct 
fi_opx_hfi1_context * hfi, 3 + /* bth */ 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - inject->qw0 = OPX_PBC_LEN(inject_pbc_dws) /* length_dws */ | - OPX_PBC_VL(hfi->vl) | - OPX_PBC_SC(hfi->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0)| - OPX_PBC_PORTIDX(hfi->hfi_port) | - OPX_PBC_SCTXT(hfi->send_ctxt); + inject_9B->qw0 = OPX_PBC_LEN(inject_pbc_dws,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type)| + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type); + + inject_9B->hdr.lrh_9B.pktlen = htons(inject_pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + + /* specified at runtime */ + inject_9B->hdr.inject.message_length = 0; + inject_9B->hdr.inject.app_data_u64[0] = 0; + inject_9B->hdr.inject.app_data_u64[1] = 0; +} + +void fi_opx_ep_tx_model_init_16B (struct fi_opx_hfi1_context * hfi, + const uint8_t reliability_rx, + struct fi_opx_hfi1_txe_scb_16B * inject_16B, + struct fi_opx_hfi1_txe_scb_16B * send_16B, + struct fi_opx_hfi1_txe_scb_16B * rendezvous_16B) { + + /* + * fi_send*() model - eager + */ + /* Setup the 16B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = OPX_HFI1_JKR; + + /* PBC data */ + memset(send_16B, 0, sizeof(*send_16B)); + memset(inject_16B, 0, sizeof(*inject_16B)); + memset(rendezvous_16B, 0, sizeof(*rendezvous_16B)); + send_16B->qw0 = OPX_PBC_LEN(0,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type) | + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; + + /* LRH header */ + send_16B->hdr.lrh_16B.qw[0] = 0UL; + send_16B->hdr.lrh_16B.qw[1] = 0UL; + + send_16B->hdr.lrh_16B.sc = hfi->sc; + send_16B->hdr.lrh_16B.entropy = 0; + send_16B->hdr.lrh_16B.lt = 0; // need to add env variable to change + send_16B->hdr.lrh_16B.l2 = OPX_PBC_JKR_L2TYPE_16B; + send_16B->hdr.lrh_16B.l4 = 9; + send_16B->hdr.lrh_16B.rc = OPX_RC_IN_ORDER_0; + send_16B->hdr.lrh_16B.cspec = OPX_BTH_CSPEC_DEFAULT; /*NOT BTH CSPEC*/ + send_16B->hdr.lrh_16B.pkey = hfi->pkey; + + send_16B->hdr.lrh_16B.slid = hfi->lid & 0xFFFFF; + send_16B->hdr.lrh_16B.slid20 = (hfi->lid) >> 20; + + /* BTH header */ + send_16B->hdr.bth.opcode = 0; + send_16B->hdr.bth.bth_1 = 0; + send_16B->hdr.bth.pkey = htons(hfi->pkey); + send_16B->hdr.bth.ecn = (uint8_t)(OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)),hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT,hfi1_type)); + send_16B->hdr.bth.qp = hfi->bthqp; + send_16B->hdr.bth.unused = 0; + send_16B->hdr.bth.rx = 0; /* set at runtime */ + + send_16B->hdr.reliability.psn = 0; + send_16B->hdr.reliability.origin_tx = hfi->send_ctxt; + + /* KDETH header */ + send_16B->hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + send_16B->hdr.kdeth.jkey = hfi->jkey; + send_16B->hdr.kdeth.hcrc = 0; + send_16B->hdr.kdeth.unused = 0; + + /* OFI header */ + send_16B->hdr.match.ofi_data = 0; /* set at runtime */ + send_16B->hdr.match.ofi_tag = 0; /* set at runtime */ + + + /* + * fi_send*() model - rendezvous + */ + *rendezvous_16B = *send_16B; + rendezvous_16B->hdr.rendezvous.origin_rs = reliability_rx; + + + /* + * fi_inject() model + */ /* clone from send 
model, then adjust */ - inject->hdr = send->hdr; + *inject_16B = *send_16B; + + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 ; /* ICRC/tail */ + + inject_16B->qw0 = OPX_PBC_LEN(pbc_dws,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type)| + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; - /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - inject->hdr.stl.lrh.pktlen = htons(inject_pbc_dws-1); + /* (LRH QW) does not include pbc (8 bytes) */ + const uint32_t packetLength = (pbc_dws - 2) * 4; + const uint32_t lrh_qws = (packetLength >> 3) + + ((packetLength & 0x07u) != 0); + + + inject_16B->hdr.lrh_16B.pktlen = lrh_qws; /* specified at runtime */ - inject->hdr.inject.message_length = 0; - inject->hdr.inject.app_data_u64[0] = 0; - inject->hdr.inject.app_data_u64[1] = 0; + inject_16B->hdr.inject.message_length = 0; + inject_16B->hdr.inject.app_data_u64[0] = 0; } int fi_opx_stx_init (struct fi_opx_domain *opx_domain, struct fi_tx_attr *attr, @@ -354,6 +461,12 @@ int fi_opx_stx_init (struct fi_opx_domain *opx_domain, struct fi_tx_attr *attr, &opx_stx->tx.send, &opx_stx->tx.rzv); + fi_opx_ep_tx_model_init_16B(opx_stx->hfi, + opx_stx->reliability_rx, + &opx_stx->tx.inject_16B, + &opx_stx->tx.send_16B, + &opx_stx->tx.rzv_16B); + fi_opx_ref_inc(&opx_domain->ref_cnt, "domain"); fi_opx_ref_init(&opx_stx->ref_cnt, "shared transmit context"); @@ -432,6 +545,31 @@ static void fi_opx_unbind_cq_ep(struct fi_opx_cq *cq, struct fi_opx_ep *ep) } +__OPX_FORCE_INLINE__ +int opx_ep_free_match_queue_list_contexts(struct slist *list) +{ + int count = 0; + + while (!slist_empty(list)) { + struct opx_context *context = (struct opx_context *) slist_remove_head(list); + OPX_BUF_FREE(context); + ++count; + } + + return count; +} + +__OPX_FORCE_INLINE__ +void opx_ep_free_match_queued_contexts(struct fi_opx_ep *opx_ep) +{ + int tag_count = opx_ep_free_match_queue_list_contexts(&opx_ep->rx->queue[0].mq); + int msg_count = opx_ep_free_match_queue_list_contexts(&opx_ep->rx->queue[1].mq); + + FI_LOG(fi_opx_global.prov, FI_LOG_DEBUG, FI_LOG_FABRIC, + "Freed %d contexts from tag match queue, %d contexts from msg match queue\n", + tag_count, msg_count); +} + static int fi_opx_close_ep(fid_t fid) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "close ep\n"); @@ -480,7 +618,7 @@ static int fi_opx_close_ep(fid_t fid) fi_reliability_service_ping_remote(&opx_ep->ep_fid, service); service->usec_next = fi_opx_timer_next_event_usec(timer, timestamp, service->usec_max); } - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); compare = fi_opx_timer_now(timestamp, timer); } } @@ -537,6 +675,7 @@ static int fi_opx_close_ep(fid_t fid) } } if (opx_ep->rx && (opx_ep->rx->cq && (fid->fclass == FI_CLASS_EP || fid->fclass == FI_CLASS_RX_CTX))) { + opx_ep_free_match_queued_contexts(opx_ep); ret = fi_opx_ref_dec(&opx_ep->rx->cq->ref_cnt, "completion queue"); if (ret) { errno = -ret; @@ -593,9 +732,9 @@ static int fi_opx_close_ep(fid_t fid) if (opx_ep->rx->match_ue_tag_hash) { fi_opx_match_ue_hash_free(&opx_ep->rx->match_ue_tag_hash); } - if (opx_ep->rx->ctx_ext_pool) { - 
ofi_bufpool_destroy(opx_ep->rx->ctx_ext_pool); - opx_ep->rx->ctx_ext_pool = NULL; + if (opx_ep->rx->ctx_pool) { + ofi_bufpool_destroy(opx_ep->rx->ctx_pool); + opx_ep->rx->ctx_pool = NULL; } free(opx_ep->rx->mem); } @@ -837,14 +976,21 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, /* initialize the models */ fi_opx_ep_tx_model_init(hfi, opx_ep->reliability->rx, - &opx_ep->tx->inject, - &opx_ep->tx->send, - &opx_ep->tx->rzv); + &opx_ep->tx->inject_9B, + &opx_ep->tx->send_9B, + &opx_ep->tx->rzv_9B); - opx_ep->tx->inject.hdr.reliability.unused = 0; - opx_ep->tx->rzv.hdr.reliability.unused = 0; + fi_opx_ep_tx_model_init_16B(hfi, + opx_ep->reliability->rx, + &opx_ep->tx->inject_16B, + &opx_ep->tx->send_16B, + &opx_ep->tx->rzv_16B); + + opx_ep->tx->inject_9B.hdr.reliability.unused = 0; + opx_ep->tx->rzv_9B.hdr.reliability.unused = 0; - opx_ep->tx->rzv.hdr.rendezvous.origin_rx = hfi->info.rxe.id; + opx_ep->tx->rzv_9B.hdr.rendezvous.origin_rx = hfi->info.rxe.id; + opx_ep->tx->rzv_16B.hdr.rendezvous.origin_rx = hfi->info.rxe.id; // these 3 lines should move to ep init ? opx_ep->threading = (uint32_t) opx_domain->threading; @@ -964,7 +1110,8 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, if (fi_param_get_int(fi_opx_global.prov, "sdma_disable", &sdma_disable) == FI_SUCCESS) { opx_ep->tx->use_sdma = !sdma_disable; OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, - "sdma_disable parm specified as %0X; opx_ep->tx->use_sdma set to %0hhX\n", sdma_disable, opx_ep->tx->use_sdma); + "sdma_disable parm specified as %0X; opx_ep->tx->use_sdma set to %0hhX\n", + sdma_disable, opx_ep->tx->use_sdma); } else { OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "sdma_disable parm not specified; using SDMA\n"); opx_ep->tx->use_sdma = 1; @@ -975,17 +1122,40 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, rc = fi_param_get_int(fi_opx_global.prov, "sdma_min_payload_bytes", &l_sdma_min_payload_bytes); if (rc != FI_SUCCESS) { opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n", - opx_ep->tx->sdma_min_payload_bytes); - } else if (l_sdma_min_payload_bytes < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES || l_sdma_min_payload_bytes > INT_MAX) { + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_SDMA_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n", + opx_ep->tx->sdma_min_payload_bytes); + } else if (l_sdma_min_payload_bytes < FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN || + l_sdma_min_payload_bytes > FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX) { opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT; FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Error: FI_OPX_SDMA_MIN_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n", - FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES, INT_MAX, opx_ep->tx->sdma_min_payload_bytes); + FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX, + FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT); } else { opx_ep->tx->sdma_min_payload_bytes = l_sdma_min_payload_bytes; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES was specified. Set to %d\n", - opx_ep->tx->sdma_min_payload_bytes); + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_SDMA_MIN_PAYLOAD_BYTES was specified. 
Set to %d\n", + opx_ep->tx->sdma_min_payload_bytes); + } + + int l_tid_min_payload_bytes; + rc = fi_param_get_int(fi_opx_global.prov, "tid_min_payload_bytes", &l_tid_min_payload_bytes); + if (rc != FI_SUCCESS) { + opx_ep->tx->tid_min_payload_bytes = OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_TID_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n", + opx_ep->tx->tid_min_payload_bytes); + } else if (l_tid_min_payload_bytes < OPX_TID_MIN_PAYLOAD_BYTES_MIN) { + opx_ep->tx->tid_min_payload_bytes = OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Error: FI_OPX_TID_MIN_PAYLOAD_BYTES was set but is less than minimum allowed (%lu). Using default setting of %d\n", + OPX_TID_MIN_PAYLOAD_BYTES_MIN, OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT); + } else { + opx_ep->tx->tid_min_payload_bytes = l_tid_min_payload_bytes; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_TID_MIN_PAYLOAD_BYTES was specified. Set to %d\n", + opx_ep->tx->tid_min_payload_bytes); } slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_SHM]); @@ -1042,10 +1212,10 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) goto err; } - opx_ep->rx->ctx_ext_pool = NULL; - if (ofi_bufpool_create(&opx_ep->rx->ctx_ext_pool, - sizeof(struct fi_opx_context_ext), - 8, UINT_MAX, FI_OPX_EP_RX_CTX_EXT_BLOCKSIZE, 0)) { + opx_ep->rx->ctx_pool = NULL; + if (ofi_bufpool_create(&opx_ep->rx->ctx_pool, + sizeof(struct opx_context), + 64, UINT_MAX, OPX_EP_RX_CTX_BLOCKSIZE, 0)) { goto err; } struct fi_opx_domain * opx_domain = opx_ep->domain; @@ -1069,7 +1239,7 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) opx_ep->rx->egrq.head_register = hfi1->info.rxe.egrq.head_register; opx_ep->rx->self.raw64b = 0; - opx_ep->rx->self.uid.lid = htons(hfi1->lid); + opx_ep->rx->self.uid.lid = htons(hfi1->lid); // lid needs to be changed to uint32 opx_ep->rx->self.hfi1_rx = hfi1->info.rxe.id; opx_ep->rx->self.hfi1_unit = (uint8_t)hfi1->hfi_unit; opx_ep->rx->self.uid.endpoint_id = hfi1->send_ctxt; @@ -1081,80 +1251,153 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) /* Initialize hash table used to lookup info on any HFI units on the node */ fi_opx_global.hfi_local_info.hfi_unit = (uint8_t)hfi1->hfi_unit; fi_opx_global.hfi_local_info.lid = htons(hfi1->lid); - fi_opx_global.hfi_local_info.type = opx_ep->hfi->hfi_hfi1_type; - if(fi_opx_global.hfi_local_info.type != OPX_HFI1_TYPE) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Runtime HFI type (%u) doesn't match build type (%u)\n", - fi_opx_global.hfi_local_info.type, OPX_HFI1_TYPE); - abort(); - } + fi_opx_init_hfi_lookup(); + /* * initialize tx for acks, etc */ { /* rendezvous CTS packet model */ - memset(&opx_ep->rx->tx.cts, 0, sizeof(opx_ep->rx->tx.cts)); + /* Setup the 9B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = (OPX_HFI1_TYPE & OPX_HFI1_WFR) ? 
OPX_HFI1_WFR : OPX_HFI1_JKR_9B; + + memset(&opx_ep->rx->tx.cts_9B, 0, sizeof(opx_ep->rx->tx.cts_9B)); /* PBC data */ - opx_ep->rx->tx.cts.qw0 = OPX_PBC_LEN(0) /* length_dws */ | - OPX_PBC_VL(hfi1->vl) | - OPX_PBC_SC(hfi1->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0) | - OPX_PBC_PORTIDX(hfi1->hfi_port) | - OPX_PBC_SCTXT(hfi1->send_ctxt); + opx_ep->rx->tx.cts_9B.qw0 = OPX_PBC_LEN(0, hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type); /* LRH header */ - opx_ep->rx->tx.cts.hdr.stl.lrh.flags = + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.flags = htons(FI_OPX_HFI1_LRH_BTH | ((hfi1->sl & FI_OPX_HFI1_LRH_SL_MASK) << FI_OPX_HFI1_LRH_SL_SHIFT) | ((hfi1->sc & FI_OPX_HFI1_LRH_SC_MASK) << FI_OPX_HFI1_LRH_SC_SHIFT)); - opx_ep->rx->tx.cts.hdr.stl.lrh.dlid = 0; /* set at runtime */ - opx_ep->rx->tx.cts.hdr.stl.lrh.pktlen = 0; /* set at runtime */ - opx_ep->rx->tx.cts.hdr.stl.lrh.slid = htons(hfi1->lid); + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.dlid = 0; /* set at runtime */ + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.pktlen = 0; /* set at runtime */ + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.slid = htons(hfi1->lid); /* BTH header */ - opx_ep->rx->tx.cts.hdr.stl.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_CTS; - opx_ep->rx->tx.cts.hdr.stl.bth.bth_1 = 0; - opx_ep->rx->tx.cts.hdr.stl.bth.pkey = htons(hfi1->pkey); - opx_ep->rx->tx.cts.hdr.stl.bth.ecn = (uint8_t) (OPX_BTH_RC2(OPX_BTH_RC2_VAL) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT)); - opx_ep->rx->tx.cts.hdr.stl.bth.qp = hfi1->bthqp; - opx_ep->rx->tx.cts.hdr.stl.bth.unused = 0; - opx_ep->rx->tx.cts.hdr.stl.bth.rx = 0; /* set at runtime */ + opx_ep->rx->tx.cts_9B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_CTS; + opx_ep->rx->tx.cts_9B.hdr.bth.bth_1 = 0; + opx_ep->rx->tx.cts_9B.hdr.bth.pkey = htons(hfi1->pkey); + opx_ep->rx->tx.cts_9B.hdr.bth.ecn = (uint8_t) (OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + opx_ep->rx->tx.cts_9B.hdr.bth.qp = hfi1->bthqp; + opx_ep->rx->tx.cts_9B.hdr.bth.unused = 0; + opx_ep->rx->tx.cts_9B.hdr.bth.rx = 0; /* set at runtime */ - opx_ep->rx->tx.cts.hdr.reliability.psn = 0; - opx_ep->rx->tx.cts.hdr.reliability.origin_tx = hfi1->send_ctxt; + opx_ep->rx->tx.cts_9B.hdr.reliability.psn = 0; + opx_ep->rx->tx.cts_9B.hdr.reliability.origin_tx = hfi1->send_ctxt; /* KDETH header */ - opx_ep->rx->tx.cts.hdr.stl.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ - opx_ep->rx->tx.cts.hdr.stl.kdeth.jkey = hfi1->jkey; - opx_ep->rx->tx.cts.hdr.stl.kdeth.hcrc = 0; - opx_ep->rx->tx.cts.hdr.stl.kdeth.unused = 0; + opx_ep->rx->tx.cts_9B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + opx_ep->rx->tx.cts_9B.hdr.kdeth.jkey = hfi1->jkey; + opx_ep->rx->tx.cts_9B.hdr.kdeth.hcrc = 0; + opx_ep->rx->tx.cts_9B.hdr.kdeth.unused = 0; /* OFI header */ - opx_ep->rx->tx.cts.hdr.cts.origin_rx = hfi1->info.rxe.id; - opx_ep->rx->tx.cts.hdr.cts.target.opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; + opx_ep->rx->tx.cts_9B.hdr.cts.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.cts_9B.hdr.cts.target.opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; } { /* rendezvous DPUT packet model */ /* tagged model */ - memset(&opx_ep->rx->tx.dput, 0, - sizeof(opx_ep->rx->tx.dput)); + memset(&opx_ep->rx->tx.dput_9B, 0, + 
sizeof(opx_ep->rx->tx.dput_9B)); + + opx_ep->rx->tx.dput_9B = opx_ep->rx->tx.cts_9B; + opx_ep->rx->tx.dput_9B.hdr.reliability.origin_tx = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.origin_tx = hfi1->send_ctxt; + opx_ep->rx->tx.dput_9B.hdr.dput.target.dt = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.op = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.last_bytes = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.bytes = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.dput_9B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_DATA; + } + + { /* rendezvous CTS packet model for 16B*/ + /* Setup the 16B models whether or not they'll be used */ + + uint64_t hfi1_type = OPX_HFI1_JKR; + + memset(&opx_ep->rx->tx.cts_16B, 0, sizeof(opx_ep->rx->tx.cts_16B)); + /* PBC data */ + opx_ep->rx->tx.cts_16B.qw0 = OPX_PBC_LEN(0, hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; - opx_ep->rx->tx.dput = opx_ep->rx->tx.cts; - opx_ep->rx->tx.dput.hdr.reliability.origin_tx = 0; - opx_ep->rx->tx.dput.hdr.dput.target.origin_tx = hfi1->send_ctxt; - opx_ep->rx->tx.dput.hdr.dput.target.dt = 0; - opx_ep->rx->tx.dput.hdr.dput.target.op = 0; - opx_ep->rx->tx.dput.hdr.dput.target.last_bytes = 0; - opx_ep->rx->tx.dput.hdr.dput.target.bytes = 0; - opx_ep->rx->tx.dput.hdr.dput.origin_rx = hfi1->info.rxe.id; - opx_ep->rx->tx.dput.hdr.stl.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_DATA; + /* LRH header */ + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.qw[0] = 0; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.qw[1] = 0; + + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.sc = hfi1->sc; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.entropy = 0; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.lt = 0; // need to add env variable to change + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.l2 = OPX_PBC_JKR_L2TYPE_16B; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.l4 = 9; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.rc = OPX_RC_IN_ORDER_0; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.cspec = OPX_BTH_CSPEC_DEFAULT; /*NOT BTH CSPEC*/ + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.pkey = hfi1->pkey; + + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.slid = hfi1->lid & 0xFFFFF; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.slid20 = (hfi1->lid) >> 20; + + /* BTH header */ + opx_ep->rx->tx.cts_16B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_CTS; + opx_ep->rx->tx.cts_16B.hdr.bth.bth_1 = 0; + opx_ep->rx->tx.cts_16B.hdr.bth.pkey = htons(hfi1->pkey); + opx_ep->rx->tx.cts_16B.hdr.bth.ecn = (uint8_t) (OPX_BTH_RC2(OPX_BTH_RC2_VAL(hfi1_type), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + opx_ep->rx->tx.cts_16B.hdr.bth.qp = hfi1->bthqp; + opx_ep->rx->tx.cts_16B.hdr.bth.unused = 0; + opx_ep->rx->tx.cts_16B.hdr.bth.rx = 0; /* set at runtime */ + + opx_ep->rx->tx.cts_16B.hdr.reliability.psn = 0; + opx_ep->rx->tx.cts_16B.hdr.reliability.origin_tx = hfi1->send_ctxt; + + /* KDETH header */ + opx_ep->rx->tx.cts_16B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + opx_ep->rx->tx.cts_16B.hdr.kdeth.jkey = hfi1->jkey; + opx_ep->rx->tx.cts_16B.hdr.kdeth.hcrc = 0; + opx_ep->rx->tx.cts_16B.hdr.kdeth.unused = 0; + + /* OFI header */ + opx_ep->rx->tx.cts_16B.hdr.cts.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.cts_16B.hdr.cts.target.opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; } + { /* rendezvous DPUT packet model */ + + /* tagged 
model */ + memset(&opx_ep->rx->tx.dput_16B, 0, + sizeof(opx_ep->rx->tx.dput_16B)); + + + opx_ep->rx->tx.dput_16B = opx_ep->rx->tx.cts_16B; + opx_ep->rx->tx.dput_16B.hdr.reliability.origin_tx = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.origin_tx = hfi1->send_ctxt; + opx_ep->rx->tx.dput_16B.hdr.dput.target.dt = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.op = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.last_bytes = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.bytes = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.dput_16B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_DATA; + } + + if ((opx_ep->rx->caps & FI_LOCAL_COMM) || ((opx_ep->rx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) == 0)) { char buffer[128]; @@ -1171,9 +1414,13 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) snprintf(buffer,sizeof(buffer),"%s-%02x.%d", opx_domain->unique_job_key_str, hfi_unit, inst); - opx_shm_rx_init(&opx_ep->rx->shm, fi_opx_global.prov, + ssize_t rc = opx_shm_rx_init(&opx_ep->rx->shm, fi_opx_global.prov, (const char *)buffer, rx_index, FI_OPX_SHM_FIFO_SIZE, FI_OPX_SHM_PACKET_SIZE); + if (OFI_UNLIKELY(rc != FI_SUCCESS)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Shared memory initialization failed.\n"); + goto err; + } } /* Now that endpoint is complete enough to have context information from the hfi, @@ -1191,9 +1438,9 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) fi_opx_match_ue_hash_free(&opx_ep->rx->match_ue_tag_hash); - if (opx_ep->rx->ctx_ext_pool) { - ofi_bufpool_destroy(opx_ep->rx->ctx_ext_pool); - opx_ep->rx->ctx_ext_pool = NULL; + if (opx_ep->rx->ctx_pool) { + ofi_bufpool_destroy(opx_ep->rx->ctx_pool); + opx_ep->rx->ctx_pool = NULL; } return -FI_ENOMEM; @@ -1224,28 +1471,23 @@ static int fi_opx_apply_info_and_init_ops(struct fi_opx_ep *opx_ep) { opx_ep->rx->op_flags |= info->rx_attr ? 
info->rx_attr->op_flags : 0; // Init oprations per endpoint - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); int ret; ret = fi_opx_init_cm_ops(&opx_ep->ep_fid.fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_msg_ops(&opx_ep->ep_fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_rma_ops(&opx_ep->ep_fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_tagged_ops(&opx_ep->ep_fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_atomic_ops(&opx_ep->ep_fid, info); if (ret) goto err; @@ -1326,7 +1568,6 @@ static void fi_opx_apply_bind_flags(struct fi_opx_ep *opx_ep) { opx_ep->is_rx_cq_bound = true; } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); } static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) @@ -1372,6 +1613,18 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) return -errno; } fi_opx_ref_inc(&opx_ep->hfi->ref_cnt, "HFI context"); + + fi_opx_global.hfi_local_info.type = opx_ep->hfi->hfi_hfi1_type; + + int mixed_network = 0; + if (fi_param_get_int(fi_opx_global.prov, "mixed_network", &mixed_network) == FI_SUCCESS) { + if ((mixed_network == 1) && (fi_opx_global.hfi_local_info.type == OPX_HFI1_JKR)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Runtime HFI type is 9B JKR\n"); + fi_opx_global.hfi_local_info.type = OPX_HFI1_JKR_9B; + opx_ep->hfi->hfi_hfi1_type = OPX_HFI1_JKR_9B; + } + } + FI_INFO(fi_opx_global.prov, FI_LOG_EP_DATA, "Opened hfi %p, HFI type %#X/%#X, unit %#X, port %#X, ref_cnt %#lX," " rcv ctxt %#X, send ctxt %#X, \n", @@ -1381,12 +1634,13 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) opx_ep->hfi->ctrl->ctxt_info.ctxt, opx_ep->hfi->ctrl->ctxt_info.send_ctxt); - if (OPX_HFI1_TYPE == OPX_HFI1_JKR) { + if (OPX_HFI1_TYPE & OPX_HFI1_JKR || OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "*****HFI type is JKR (CN5000)\n"); } else { OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "*****HFI type is WFR (Omni-path)\n"); } + void *mem = NULL; mem = malloc(sizeof(struct fi_opx_ep_reliability) + FI_OPX_CACHE_LINE_SIZE); if (!mem) { @@ -1414,6 +1668,8 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) fi_opx_reliability_service_init(&opx_ep->reliability->service, opx_domain->unique_job_key, opx_ep->hfi, OFI_RELIABILITY_KIND_ONLOAD); + fi_opx_reliability_model_init_16B(&opx_ep->reliability->service, + opx_ep->hfi); opx_ep->reliability->rx = opx_ep->hfi->info.rxe.id; fi_opx_reliability_client_init(&opx_ep->reliability->state, &opx_ep->reliability->service, @@ -1480,7 +1736,7 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) fprintf(stderr, "%s:%s():%d bad structure alignment !\n", __FILE__, __func__, __LINE__); abort(); } - alignment_check = (uintptr_t)&opx_ep->tx->send; + alignment_check = (uintptr_t)&opx_ep->tx->send_9B; if ((alignment_check & 0x03Full) != 0) { fprintf(stderr, "%s:%s():%d bad structure alignment !\n", __FILE__, __func__, __LINE__); abort(); } @@ -1504,9 +1760,9 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) opx_ep->rx->mp_egr_queue.ue.tail = NULL; /* Context match queues (queue[0] == FI_TAGGED, queue[1] == FI_MSG) */ - fi_opx_context_slist_init(&opx_ep->rx->queue[0].mq); - fi_opx_context_slist_init(&opx_ep->rx->queue[1].mq); - fi_opx_context_slist_init(&opx_ep->rx->mp_egr_queue.mq); + slist_init(&opx_ep->rx->queue[0].mq); + 
slist_init(&opx_ep->rx->queue[1].mq); + slist_init(&opx_ep->rx->mp_egr_queue.mq); opx_ep->tx->cq = NULL; opx_ep->tx->cq_pending_ptr = NULL; @@ -1769,11 +2025,11 @@ static int fi_opx_setopt_ep(fid_t fid, int level, int optname, break; case FI_OPT_CUDA_API_PERMITTED: if (!hmem_ops[FI_HMEM_CUDA].initialized) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_CTRL, - "Cannot set CUDA API permitted when" - "CUDA library or CUDA device is not available\n"); - return -FI_EINVAL; - } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_CTRL, + "Cannot set CUDA API permitted when " + "CUDA library or CUDA device is not available\n"); + return -FI_EINVAL; + } /* our HMEM support does not make calls to CUDA API, * therefore we can accept any option for FI_OPT_CUDA_API_PERMITTED. */ @@ -1785,10 +2041,9 @@ return 0; } - -int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, +int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx *rx, const uint64_t static_flags, - const union fi_opx_context * cancel_context, + const uintptr_t cancel_context, const int lock_required) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "(begin)\n"); @@ -1800,54 +2055,36 @@ int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, * search the match queue for this context */ - union fi_opx_context * prev = NULL; - union fi_opx_context * item = rx->queue[kind].mq.head; + struct opx_context *prev = NULL; + struct opx_context *item = (struct opx_context *) rx->queue[kind].mq.head; while (item) { - const uint64_t is_context_ext = item->flags & FI_OPX_CQ_CONTEXT_EXT; - const uint64_t compare_context = is_context_ext ? - (uint64_t)(((struct fi_opx_context_ext *)item)->msg.op_context) : - (uint64_t)item; + const uintptr_t compare_context = (uintptr_t) item->err_entry.op_context; - if ((uintptr_t)cancel_context == compare_context) { + if (cancel_context == compare_context) { if (prev) prev->next = item->next; else - rx->queue[kind].mq.head = item->next; + rx->queue[kind].mq.head = (struct slist_entry *) item->next; if (!item->next) - rx->queue[kind].mq.tail = prev; - - struct fi_opx_context_ext * ext = NULL; - if (cancel_context->flags & FI_OPX_CQ_CONTEXT_EXT) { - ext = (struct fi_opx_context_ext *)cancel_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - return -FI_ENOMEM; - } - - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - } - - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; - ext->err_entry.op_context = (void *)cancel_context; - ext->err_entry.flags = cancel_context->flags; - ext->err_entry.len = 0; - ext->err_entry.buf = 0; - ext->err_entry.data = 0; - ext->err_entry.tag = cancel_context->tag; - ext->err_entry.olen = 0; - ext->err_entry.err = FI_ECANCELED; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + rx->queue[kind].mq.tail = (struct slist_entry *) prev; + + item->byte_counter = 0; + item->next = NULL; + item->err_entry.flags = item->flags; + item->err_entry.len = 0; + item->err_entry.buf = 0; + item->err_entry.data = 0; + item->err_entry.tag = item->tag; + item->err_entry.olen = 0; + item->err_entry.err = FI_ECANCELED; + item->err_entry.prov_errno = 0; + item->err_entry.err_data = NULL; + item->err_entry.err_data_size = 0; if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, 
rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) item, rx->cq_err_ptr); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "(end) canceled\n"); return FI_ECANCELED; @@ -1875,7 +2112,7 @@ ssize_t fi_opx_cancel(fid_t fid, void *context) if (opx_ep->rx->caps & FI_MSG) { fi_opx_ep_rx_cancel(opx_ep->rx, FI_MSG, - (const union fi_opx_context *) context, + (const uintptr_t) context, FI_OPX_LOCK_NOT_REQUIRED); } @@ -1883,7 +2120,7 @@ if (opx_ep->rx->caps & FI_TAGGED) { fi_opx_ep_rx_cancel(opx_ep->rx, FI_TAGGED, - (const union fi_opx_context *) context, + (const uintptr_t) context, FI_OPX_LOCK_NOT_REQUIRED); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1923,7 +2160,7 @@ int fi_opx_alloc_default_rx_attr(struct fi_rx_attr **rx_attr) goto err; attr->caps = FI_OPX_DEFAULT_RX_CAPS; - attr->mode = FI_CONTEXT2 | FI_ASYNC_IOV; + attr->mode = FI_ASYNC_IOV; attr->op_flags = 0; attr->msg_order = FI_OPX_DEFAULT_MSG_ORDER; attr->size = SIZE_MAX; //FI_OPX_RX_SIZE; @@ -1953,7 +2190,7 @@ int fi_opx_alloc_default_tx_attr(struct fi_tx_attr **tx_attr) goto err; attr->caps = FI_OPX_DEFAULT_TX_CAPS; - attr->mode = FI_CONTEXT2 | FI_ASYNC_IOV; + attr->mode = FI_ASYNC_IOV; attr->op_flags = FI_TRANSMIT_COMPLETE; attr->msg_order = FI_OPX_DEFAULT_MSG_ORDER; attr->inject_size = FI_OPX_HFI1_PACKET_IMM; @@ -2208,19 +2445,13 @@ int fi_opx_endpoint_rx_tx (struct fid_domain *dom, struct fi_info *info, goto err; } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - ret = fi_opx_fid_check(&dom->fid, FI_CLASS_DOMAIN, "domain"); if (ret) return ret; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - ret = fi_opx_check_info(info); if (ret) return ret; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - void *mem = NULL; mem = malloc(sizeof(struct fi_opx_ep) + FI_OPX_CACHE_LINE_SIZE); if (!mem) { @@ -2246,8 +2477,6 @@ opx_ep->fr = fr; #endif - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - opx_ep->ep_fid.fid.fclass = FI_CLASS_EP; opx_ep->ep_fid.fid.context = context; opx_ep->ep_fid.fid.ops = &fi_opx_fi_ops; @@ -2324,6 +2553,30 @@ } #endif +#if defined(OPX_HMEM) && HAVE_CUDA + int use_gdrcopy; + int gdrcopy_enabled = cuda_is_gdrcopy_enabled(); + + if (fi_param_get_bool(NULL, "hmem_cuda_use_gdrcopy", &use_gdrcopy) != FI_SUCCESS) { + FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "FI_HMEM_CUDA_USE_GDRCOPY either not specified or invalid. Using default value of 1\n"); + use_gdrcopy = 1; /* Set to the libfabric default of FI_HMEM_CUDA_USE_GDRCOPY=1 */ + } + + if (gdrcopy_enabled == 1) { + if (use_gdrcopy == 1) { + FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "GDRCopy has been requested and is available. If you wish to explicitly disable GDRCopy, set FI_HMEM_CUDA_USE_GDRCOPY=0\n"); + } + } else if (use_gdrcopy == 1) { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "GDRCopy has been requested but is not available on this system, set FI_HMEM_CUDA_USE_GDRCOPY=0 and try again.\n"); + fprintf(stderr, "%s:%s():%d GDRCopy cannot be used, set FI_HMEM_CUDA_USE_GDRCOPY=0 and try again. Returning FI_EOPNOTSUPP. \n", __FILE__, __func__, __LINE__); + errno = FI_EOPNOTSUPP; + goto err; + } else { + /* gdrcopy_enabled = 0 and use_gdrcopy = 0 */ + FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "If GDRCopy is installed on this system, change FI_HMEM_CUDA_USE_GDRCOPY=0 to FI_HMEM_CUDA_USE_GDRCOPY=1 to enable GDRCopy. 
\n"); + } +#endif + *ep = &opx_ep->ep_fid; FI_OPX_DEBUG_COUNTERS_INIT(opx_ep->debug_counters); @@ -2392,21 +2645,18 @@ int fi_opx_ep_tx_check (struct fi_opx_ep_tx * tx, enum fi_av_type av_type) /* rx_op_flags is only checked for FI_PEEK | FI_CLAIM | FI_MULTI_RECV; * rx_op_flags is only used if FI_PEEK | FI_CLAIM; - * is_context_ext is only used if FI_PEEK | iovec; - * - * The "normal" data movement functions, such as fi_[t]recv(), can safely - * specify '0' for rx_op_flags, and is_context_ext, in order to reduce code path. * * See `fi_opx_ep_rx_process_context()` */ __attribute__((noinline)) void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const uint64_t static_flags, - union fi_opx_context * context, - const uint64_t rx_op_flags, const uint64_t is_context_ext, + struct opx_context *context, + const uint64_t rx_op_flags, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) { + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fid_ep * ep = &opx_ep->ep_fid; @@ -2425,7 +2675,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "rx_op_flags & FI_PEEK searching unexpected queue\n"); __attribute__((__unused__)) bool from_hash_queue = false; - struct fi_opx_hfi1_ue_packet * uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind); + struct fi_opx_hfi1_ue_packet * uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind, hfi1_type); #ifndef FI_OPX_MATCH_HASH_DISABLE if (!uepkt && kind == FI_OPX_KIND_TAG) { @@ -2460,8 +2710,8 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, #endif } - fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, lock_required); + return; } @@ -2469,43 +2719,23 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, * did not find a match for this "peek"; notify the application * via completion queue error entry */ - - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - assert((ext->opx_context.flags & FI_OPX_CQ_CONTEXT_EXT) != 0); - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - abort(); - } - ext->opx_context.flags = rx_op_flags | FI_OPX_CQ_CONTEXT_EXT; - } - - ext->err_entry.op_context = context; - ext->err_entry.flags = rx_op_flags; - ext->err_entry.len = 0; - ext->err_entry.buf = 0; - ext->err_entry.data = 0; - ext->err_entry.tag = 0; - ext->err_entry.olen = 0; - ext->err_entry.err = FI_ENOMSG; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; - ext->opx_context.byte_counter = 0; - + context->err_entry.flags = rx_op_flags; + context->err_entry.len = 0; + context->err_entry.buf = 0; + context->err_entry.data = 0; + context->err_entry.tag = 0; + context->err_entry.olen = 0; + context->err_entry.err = FI_ENOMSG; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; + context->byte_counter = 0; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "no match found on unexpected queue posting error\n"); - fi_opx_cq_enqueue_err(opx_ep->rx->cq, ext, lock_required); + 
fi_opx_cq_enqueue_err(opx_ep->rx->cq, context, lock_required); } else if (rx_op_flags & FI_CLAIM) { - assert((!(rx_op_flags & FI_OPX_CQ_CONTEXT_EXT) && !(rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM)) || - ((rx_op_flags & FI_OPX_CQ_CONTEXT_EXT) && (rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM))); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "rx_op_flags & FI_CLAIM complete receive operation\n"); /* only FI_CLAIM was specified @@ -2519,20 +2749,20 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, struct fi_opx_hfi1_ue_packet * claimed_pkt = context->claim; const unsigned is_intranode = - fi_opx_hfi_is_intranode(claimed_pkt->hdr.stl.lrh.slid); + opx_lrh_is_intranode(&(claimed_pkt->hdr), hfi1_type); - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, &claimed_pkt->hdr, (union fi_opx_hfi1_packet_payload *)&claimed_pkt->payload, claimed_pkt->hdr.match.ofi_tag, context, - claimed_pkt->hdr.stl.bth.opcode, - rx_op_flags & FI_OPX_CQ_CONTEXT_EXT, + claimed_pkt->hdr.bth.opcode, OPX_MULTI_RECV_FALSE, is_intranode, rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM, lock_required, - reliability); + reliability, + hfi1_type); /* ... and prepend the claimed uepkt to the ue free list. claimed_pkt->next should have been set to NULL at the time we @@ -2544,8 +2774,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, } else if ((static_flags & FI_MSG) && (rx_op_flags & FI_MULTI_RECV)) { /* TODO: HMEM not supported for multi-receive */ - assert(!(rx_op_flags & FI_OPX_CQ_CONTEXT_EXT) && - !(rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM)); + assert(!(rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM)); context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, context->src_addr); @@ -2561,7 +2790,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const union fi_opx_addr src_addr = { .fi = context->src_addr }; while (uepkt != NULL) { - unsigned is_intranode = fi_opx_hfi_is_intranode(uepkt->hdr.stl.lrh.slid); + unsigned is_intranode = opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type); if (fi_opx_ep_is_matching_packet(uepkt->tag, uepkt->origin_uid_fi, FI_OPX_MATCH_IGNORE_ALL, @@ -2591,18 +2820,18 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, /* the 'context->len' field will be updated to the * new multi-receive buffer free space as part of * the receive completion */ - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, &uepkt->hdr, (union fi_opx_hfi1_packet_payload *)&uepkt->payload, uepkt->hdr.match.ofi_tag, context, - uepkt->hdr.stl.bth.opcode, - OPX_CONTEXT_EXTENDED_FALSE, + uepkt->hdr.bth.opcode, OPX_MULTI_RECV_TRUE, OPX_HMEM_FALSE, is_intranode, lock_required, - reliability); + reliability, + hfi1_type); /* remove this item from the ue list and prepend * the (now) completed uepkt to the ue free list. 
*/ @@ -2621,7 +2850,8 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, // to ensure that any pending ops are completed (eg rendezvous multi-receive) if(context->byte_counter == 0) { assert(context->next == NULL); - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, + opx_ep->rx->cq_completed_ptr); } return; @@ -2639,7 +2869,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, * no unexpected headers were matched; add this match * information to the appropriate match queue */ - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->queue[kind].mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->queue[kind].mq); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "(end)\n"); @@ -2648,14 +2878,16 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, void fi_opx_ep_rx_process_header_tag (struct fid_ep * ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) { + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + uint32_t slid) { fi_opx_ep_rx_process_header(ep, hdr, (const union fi_opx_hfi1_packet_payload * const )payload, @@ -2665,18 +2897,22 @@ void fi_opx_ep_rx_process_header_tag (struct fid_ep * ep, origin_rs, is_intranode, lock_required, - reliability); + reliability, + hfi1_type, + slid); } void fi_opx_ep_rx_process_header_msg (struct fid_ep * ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) { + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + uint32_t slid) { fi_opx_ep_rx_process_header(ep, hdr, (const union fi_opx_hfi1_packet_payload * const )payload, @@ -2686,93 +2922,80 @@ void fi_opx_ep_rx_process_header_msg (struct fid_ep * ep, origin_rs, is_intranode, lock_required, - reliability); + reliability, + hfi1_type, + slid); } void fi_opx_ep_rx_reliability_process_packet (struct fid_ep * ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_rs) { OPX_LOG_PKT(FI_LOG_DEBUG, FI_LOG_EP_DATA, "================ received a packet from the reliability service\n"); - const uint8_t opcode = hdr->stl.bth.opcode; - - struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - const enum ofi_reliability_kind reliability_kind = opx_ep->reliability->state.kind; + const uint8_t opcode = hdr->bth.opcode; /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes = total_bytes - sizeof(union fi_opx_hfi1_packet_hdr); - - if (OFI_LIKELY(opcode >= FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { - - if (reliability_kind == OFI_RELIABILITY_KIND_OFFLOAD) { - - fi_opx_ep_rx_process_header(ep, hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - 
FI_TAGGED, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD) { - - fi_opx_ep_rx_process_header(ep, hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - FI_TAGGED, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_ONLOAD); - } + uint16_t lrh_pktlen_le; + size_t total_bytes; + size_t payload_bytes; + uint32_t slid; + + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes = total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + slid = hdr->lrh_9B.slid; } else { - - if (reliability_kind == OFI_RELIABILITY_KIND_OFFLOAD) { - - fi_opx_ep_rx_process_header(ep, hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - FI_MSG, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD) { - - fi_opx_ep_rx_process_header(ep, hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - FI_MSG, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_ONLOAD); - } + lrh_pktlen_le = hdr->lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + payload_bytes = total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + slid = htons(((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))); + } + + if (OFI_LIKELY(opcode & FI_OPX_HFI_BTH_OPCODE_TAG_BIT)) { + fi_opx_ep_rx_process_header(ep, hdr, + (const union fi_opx_hfi1_packet_payload * const) payload, + payload_bytes, + FI_TAGGED, + opcode, + origin_rs, + OPX_INTRANODE_FALSE, + FI_OPX_LOCK_NOT_REQUIRED, + OFI_RELIABILITY_KIND_ONLOAD, + OPX_HFI1_TYPE, + slid); + } else { + fi_opx_ep_rx_process_header(ep, hdr, + (const union fi_opx_hfi1_packet_payload * const) payload, + payload_bytes, + FI_MSG, + opcode, + origin_rs, + OPX_INTRANODE_FALSE, + FI_OPX_LOCK_NOT_REQUIRED, + OFI_RELIABILITY_KIND_ONLOAD, + OPX_HFI1_TYPE, + slid); } } __OPX_FORCE_INLINE__ struct fi_opx_hfi1_ue_packet *fi_opx_ep_rx_append_ue (struct fi_opx_ep_rx * const rx, struct fi_opx_hfi1_ue_packet_slist * ue, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, - const uint32_t rank_inst) + const uint32_t rank_inst, + const uint64_t slid) { struct fi_opx_hfi1_ue_packet *uepkt = ofi_buf_alloc(rx->ue_packet_pool); - memcpy((void *)&uepkt->hdr, (const void *)hdr, sizeof(union fi_opx_hfi1_packet_hdr)); + memcpy((void *)&(uepkt->hdr), (const void *)hdr, OPX_HEADER_SIZE); if (payload != NULL) { @@ -2780,7 +3003,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_rx_append_ue (struct fi_opx_ep_rx * cons } uepkt->tag = hdr->match.ofi_tag; - uepkt->origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr); + uepkt->origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr, slid); /* DAOS Persistent Address Support: * Support: save rank information associated with this inbound packet. 
@@ -2797,27 +3020,29 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_rx_append_ue (struct fi_opx_ep_rx * cons } void fi_opx_ep_rx_append_ue_msg (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters) + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid) { fi_opx_ep_rx_append_ue(rx, &rx->queue[FI_OPX_KIND_MSG].ue, - hdr, payload, payload_bytes, rank, rank_inst); + hdr, payload, payload_bytes, rank, rank_inst, slid); FI_OPX_DEBUG_COUNTERS_MAX_OF(debug_counters->match.default_max_length, rx->queue[FI_OPX_KIND_MSG].ue.length); } void fi_opx_ep_rx_append_ue_tag (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters) + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid) { #ifndef FI_OPX_MATCH_HASH_DISABLE @@ -2826,31 +3051,32 @@ void fi_opx_ep_rx_append_ue_tag (struct fi_opx_ep_rx * const rx, rx->queue[FI_OPX_KIND_TAG].ue.length >= FI_OPX_MATCH_DEFAULT_UE_LIST_MAX_LENGTH)) { struct fi_opx_hfi1_ue_packet *uepkt = fi_opx_ep_rx_append_ue(rx, &rx->match_ue_tag_hash->ue, - hdr, payload, payload_bytes, 0, 0); + hdr, payload, payload_bytes, 0, 0, slid); fi_opx_match_ue_hash_append(uepkt, rx->match_ue_tag_hash, debug_counters); } else { fi_opx_ep_rx_append_ue(rx, &rx->queue[FI_OPX_KIND_TAG].ue, - hdr, payload, payload_bytes, rank, rank_inst); + hdr, payload, payload_bytes, rank, rank_inst, slid); } #else fi_opx_ep_rx_append_ue(rx, &rx->queue[FI_OPX_KIND_TAG].ue, - hdr, payload, payload_bytes, rank, rank_inst); + hdr, payload, payload_bytes, rank, rank_inst, slid); #endif FI_OPX_DEBUG_COUNTERS_MAX_OF(debug_counters->match.default_max_length, rx->queue[FI_OPX_KIND_TAG].ue.length); } void fi_opx_ep_rx_append_ue_egr (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, - const size_t payload_bytes) { + const size_t payload_bytes, + const uint64_t slid) { /* DAOS Persistent Address Support: * No need to retain rank related data for packets appended to the * MP Eager unexpected queue, because the mp_egr_id related data in * the packet is referenced instead. 
*/ - fi_opx_ep_rx_append_ue(rx, &rx->mp_egr_queue.ue, hdr, payload, payload_bytes, 0, 0); + fi_opx_ep_rx_append_ue(rx, &rx->mp_egr_queue.ue, hdr, payload, payload_bytes, 0, 0, slid); } static void fi_opx_update_daos_av_rank(struct fi_opx_ep *opx_ep, fi_addr_t addr) @@ -2982,76 +3208,229 @@ ssize_t fi_opx_ep_tx_connect (struct fi_opx_ep *opx_ep, size_t count, } -FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY) +FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY,OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY,OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY,OPX_HFI1_JKR) ssize_t fi_opx_send_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, desc, dest_addr, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, dest_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, dest_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, dest_addr, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_recv_FABRIC_DIRECT(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, desc, src_addr, context); + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, src_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, src_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, src_addr, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_inject_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, dest_addr); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, dest_addr); + } else if 
(OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, dest_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, dest_addr); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_recvmsg_FABRIC_DIRECT(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, msg, flags); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, msg, flags); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_senddata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, void *context) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, desc, data, dest_addr, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, data, dest_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, data, dest_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, data, dest_addr, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_injectdata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, data, dest_addr); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, data, dest_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, data, 
dest_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, data, dest_addr); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 8d617da1bb2..18e1f48bcba 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -159,6 +159,29 @@ static int opx_open_hfi_and_context(struct _hfi_ctrl **ctrl, return fd; } +void opx_reset_context(struct fi_opx_ep * opx_ep) +{ + fi_opx_compiler_msync_writes(); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INIT_VAL(OPX_HFI1_TYPE); + opx_ep->rx->state.hdrq.head = 0; + + if (opx_hfi_reset_context(opx_ep->hfi->fd)) { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Send context reset failed: %d.\n", + errno); + abort(); + } + + opx_ep->tx->pio_state->fill_counter = 0; + opx_ep->tx->pio_state->scb_head_index = 0; + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_poll_sdma_completion(opx_ep); + opx_hfi1_sdma_process_pending(opx_ep); +} + + static int fi_opx_get_daos_hfi_rank_inst(const uint8_t hfi_unit_number, const uint32_t rank) { struct fi_opx_daos_hfi_rank_key key; @@ -233,7 +256,7 @@ void process_hfi_lookup(int hfi_unit, unsigned int lid) } -void fi_opx_init_hfi_lookup() +void fi_opx_init_hfi_lookup() { int hfi_unit = 0; int hfi_units = MIN(opx_hfi_get_num_units(), FI_OPX_MAX_HFIS); @@ -895,6 +918,9 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "Context configured with HFI=%d PORT=%d LID=0x%x JKEY=%d\n", context->hfi_unit, context->hfi_port, context->lid, context->jkey); + context->status_lasterr = 0; + context->status_check_next_usec = fi_opx_timer_now(&context->link_status_timestamp, &context->link_status_timer); + opx_print_context(context); return context; @@ -910,7 +936,7 @@ int init_hfi1_rxe_state (struct fi_opx_hfi1_context * context, rxe_state->hdrq.head = 0; assert(!(context->runtime_flags & HFI1_CAP_DMA_RTAIL)); - rxe_state->hdrq.rhf_seq = OPX_RHF_SEQ_INIT_VAL; + rxe_state->hdrq.rhf_seq = OPX_RHF_SEQ_INIT_VAL(OPX_HFI1_TYPE); /* OPX relies on RHF.SeqNum, not the RcvHdrTail if (context->runtime_flags & HFI1_CAP_DMA_RTAIL) { rxe_state->hdrq.rhf_seq = 0; @@ -935,7 +961,7 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer) const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(peer); const uint16_t dlid_be16 = (uint16_t)(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); - if (fi_opx_hfi_is_intranode(dlid_be16)) { + if (opx_lid_is_intranode(dlid_be16)) { char buffer[128]; union fi_opx_addr addr; addr.raw64b = (uint64_t)peer; @@ -972,7 +998,6 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer) int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; - struct fi_opx_ep * opx_ep = params->opx_ep; const uint64_t lrh_dlid = params->lrh_dlid; const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; @@ -991,26 +1016,26 @@ int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work return -FI_EAGAIN; } - union fi_opx_hfi1_packet_hdr * const tx_hdr = + union 
opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, opx_ep->daos_info.rank_inst, &rc); - if(!tx_hdr) return rc; + if(!hdr) return rc; /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), because this is intranode and since it's a CTS packet, lrh.pktlen isn't used/needed */ - tx_hdr->qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid; - tx_hdr->qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.cts.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | (params->niov << 48) | params->opcode; - tx_hdr->qw[5] = params->origin_byte_counter_vaddr; - tx_hdr->qw[6] = (uint64_t)params->rzv_comp; + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid; + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | (params->niov << 48) | params->opcode; + hdr->qw_9B[5] = params->origin_byte_counter_vaddr; + hdr->qw_9B[6] = (uint64_t)params->rzv_comp; union fi_opx_hfi1_packet_payload * const tx_payload = - (union fi_opx_hfi1_packet_payload *)(tx_hdr+1); + (union fi_opx_hfi1_packet_payload *)(hdr+1); uintptr_t vaddr_with_offset = params->dst_vaddr; /* receive buffer virtual address */ for(int i = 0; i < params->niov; i++) { @@ -1024,7 +1049,7 @@ int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work vaddr_with_offset += params->dput_iov[i].bytes; } - opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-SHM"); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1033,6 +1058,73 @@ int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work return FI_SUCCESS; } +int opx_hfi1_rx_rzv_rts_send_cts_intranode_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep * opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + const uint64_t lrh_dlid_16B = htons(lrh_dlid >> 16); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, SHM -- RENDEZVOUS RTS (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS-SHM"); + uint64_t pos; + /* Possible SHM connections required for certain applications (i.e., DAOS) + * exceeds the max value of the legacy u8_rx field. Use u32_extended field. 
+ */ + ssize_t rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, + params->u32_extended_rx, params->target_hfi_unit); + + if (OFI_UNLIKELY(rc)) { + return -FI_EAGAIN; + } + + union opx_hfi1_packet_hdr * const hdr = + opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, + opx_ep->daos_info.rank_inst, &rc); + + if(!hdr) return rc; + + /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), + because this is intranode and since it's a CTS packet, lrh.pktlen + isn't used/needed */ + hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B)); + hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | (params->niov << 48) | params->opcode; + hdr->qw_16B[6] = params->origin_byte_counter_vaddr; + hdr->qw_16B[7] = (uint64_t)params->rzv_comp; + + union fi_opx_hfi1_packet_payload * const tx_payload = + (union fi_opx_hfi1_packet_payload *)(hdr+1); + + uintptr_t vaddr_with_offset = params->dst_vaddr; /* receive buffer virtual address */ + for(int i = 0; i < params->niov; i++) { + tx_payload->cts.iov[i].rbuf = vaddr_with_offset; + tx_payload->cts.iov[i].sbuf = (uintptr_t)params->dput_iov[i].sbuf; + tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes; + tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device; + tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device; + tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface; + tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface; + vaddr_with_offset += params->dput_iov[i].bytes; + } + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-SHM"); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, SHM -- RENDEZVOUS RTS (end)\n"); + + return FI_SUCCESS; +} + int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; @@ -1058,7 +1150,7 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ ((payload_bytes + 3) >> 2); - const uint16_t lrh_dws = htons(pbc_dws - 1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; const uint16_t total_credits_needed = 1 + /* packet header */ ((payload_bytes + 63) >> 6); /* payload blocks needed */ @@ -1073,6 +1165,7 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) &opx_ep->tx->force_credit_return, total_credits_needed); opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (total_credits_available < total_credits_needed) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN credits) (params=%p rzv_comp=%p context=%p)\n", @@ -1095,7 +1188,8 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) 
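
The credit handling above follows the pattern this patch repeats in every PIO send path: compute the credits needed, test the cached pio_state, refresh once from the hardware credit-return counter on a shortfall, and bail out with -FI_EAGAIN so the deferred-work queue retries the element later. A minimal, self-contained sketch of that shape (the struct and read_credit_return_counter() are hypothetical stand-ins for the OPX pio_state machinery, not code from this patch):

    #include <stdint.h>
    #include <errno.h>

    struct credit_state {
        uint16_t issued;    /* credits handed to hardware */
        uint16_t returned;  /* credits returned by hardware (cached) */
        uint16_t limit;     /* total credits owned by this send context */
    };

    /* hypothetical: re-read the hardware credit-return counter */
    extern uint16_t read_credit_return_counter(void);

    static int reserve_credits(struct credit_state *cs, uint16_t needed)
    {
        uint16_t available = cs->limit - (cs->issued - cs->returned);

        if (available < needed) {
            /* refresh the cached counter and re-check, as the CTS path does */
            cs->returned = read_credit_return_counter();
            available = cs->limit - (cs->issued - cs->returned);
            if (available < needed)
                return -EAGAIN; /* caller re-queues the work element */
        }
        cs->issued += needed;
        return 0;
    }
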
params->origin_rs, &psn_ptr, &replay, - params->reliability); + params->reliability, + OPX_HFI1_TYPE); if(OFI_UNLIKELY(psn == -1)) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN psn/replay) (params=%p rzv_comp=%p context=%p)\n", @@ -1110,19 +1204,20 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) // The "memcopy first" code is here as an alternative to the more complicated // direct write to pio followed by memory copy of the reliability buffer - replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | - OPX_PBC_LEN(pbc_dws) | + + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.cts_9B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid; - replay->scb.hdr.qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid | + replay->scb.scb_9B.hdr.qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t) lrh_dws << 32); - replay->scb.hdr.qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; - replay->scb.hdr.qw[2] = opx_ep->rx->tx.cts.hdr.qw[2] | psn; - replay->scb.hdr.qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - replay->scb.hdr.qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | + replay->scb.scb_9B.hdr.qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + replay->scb.scb_9B.hdr.qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn; + replay->scb.scb_9B.hdr.qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + replay->scb.scb_9B.hdr.qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | ((uint64_t) params->tid_info.npairs << 32) | (params->niov << 48) | params->opcode; - replay->scb.hdr.qw[5] = params->origin_byte_counter_vaddr; - replay->scb.hdr.qw[6] = (uint64_t) params->rzv_comp; + replay->scb.scb_9B.hdr.qw_9B[5] = params->origin_byte_counter_vaddr; + replay->scb.scb_9B.hdr.qw_9B[6] = (uint64_t) params->rzv_comp; union fi_opx_hfi1_packet_payload *const tx_payload = (union fi_opx_hfi1_packet_payload *) replay->payload; @@ -1172,12 +1267,169 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) fi_opx_reliability_service_do_replay(&opx_ep->reliability->service,replay); fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, - params->slid, params->origin_rs, params->origin_rx, psn_ptr, replay, - params->reliability); + params->reliability, + OPX_HFI1_TYPE); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (end) (params=%p rzv_comp=%p context=%p)\n", + params->tid_info.npairs ? "EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t lrh_dlid_16B = htons(params->lrh_dlid >> 16); + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, HFI -- RENDEZVOUS %s RTS (begin) (params=%p rzv_comp=%p context=%p)\n", + params->tid_info.npairs ? "EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + assert (params->rzv_comp->context->byte_counter >= params->dput_iov[0].bytes); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); + const uint64_t tid_payload = params->tid_info.npairs + ? 
((params->tid_info.npairs + 4) * sizeof(params->tidpairs[0])) + : 0; + const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)) + tid_payload; + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "payload_bytes = %ld\n", payload_bytes); + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (((payload_bytes + 7) & -8) >> 2) + /* 16B is QW length/padded */ + 2; /* ICRC/tail */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + const uint16_t total_credits_needed = 1 + /* packet header */ + ((payload_bytes + 63) >> 6); /* payload blocks needed */ + uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, + total_credits_needed); + + if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) { + fi_opx_compiler_msync_writes(); + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, + total_credits_needed); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + if (total_credits_available < total_credits_needed) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN credits) (params=%p rzv_comp=%p context=%p)\n", + params->tid_info.npairs ? "EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + return -FI_EAGAIN; + } + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, + &opx_ep->reliability->state, + params->slid, + params->u8_rx, + params->origin_rs, + &psn_ptr, + &replay, + params->reliability, + OPX_HFI1_TYPE); + if(OFI_UNLIKELY(psn == -1)) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN psn/replay) (params=%p rzv_comp=%p context=%p)\n", + params->tid_info.npairs ? 
"EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + return -FI_EAGAIN; + } + + assert(payload_bytes <= FI_OPX_HFI1_PACKET_MTU); + + // The "memcopy first" code is here as an alternative to the more complicated + // direct write to pio followed by memory copy of the reliability buffer + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.cts_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, OPX_HFI1_JKR); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "replay->scb_16B.qw0 = %#lx pbc_dws = %ld\n", replay->scb.scb_16B.qw0, pbc_dws); + replay->scb.scb_16B.hdr.qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t) lrh_qws << 20); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "lrh_qws = %d replay->scb_16B.hdr.lrh_16B.pktlen = %d\n", lrh_qws, replay->scb.scb_16B.hdr.lrh_16B.pktlen); + replay->scb.scb_16B.hdr.qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + + replay->scb.scb_16B.hdr.qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + replay->scb.scb_16B.hdr.qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn; + replay->scb.scb_16B.hdr.qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + replay->scb.scb_16B.hdr.qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | + ((uint64_t) params->tid_info.npairs << 32) | + (params->niov << 48) | params->opcode; + replay->scb.scb_16B.hdr.qw_16B[6] = params->origin_byte_counter_vaddr; + + replay->scb.scb_16B.hdr.qw_16B[7] = (uint64_t) params->rzv_comp; + + union fi_opx_hfi1_packet_payload *const tx_payload = + (union fi_opx_hfi1_packet_payload *) (replay->payload); + + assert(((uint8_t *)tx_payload) == ((uint8_t *)&(replay->data))); + + uintptr_t vaddr_with_offset = params->tid_info.npairs ? 
+ ((uint64_t)params->dst_vaddr & -64) : + params->dst_vaddr; /* receive buffer virtual address */ + + for (int i = 0; i < params->niov; i++) { + tx_payload->cts.iov[i].rbuf = vaddr_with_offset; + tx_payload->cts.iov[i].sbuf = params->dput_iov[i].sbuf; + tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes; + tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device; + tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device; + tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface; + tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface; + vaddr_with_offset += params->dput_iov[i].bytes; + } + + /* copy tidpairs to packet */ + if (params->tid_info.npairs) { + assert(params->tid_info.npairs < FI_OPX_MAX_DPUT_TIDPAIRS); + assert(params->tidpairs[0] != 0); + assert(params->niov == 1); + assert(params->rzv_comp->context->byte_counter >= params->dput_iov[0].bytes); + + /* coverity[missing_lock] */ + tx_payload->tid_cts.tid_offset = params->tid_info.offset; + tx_payload->tid_cts.ntidpairs = params->tid_info.npairs; + tx_payload->tid_cts.origin_byte_counter_adjust = params->tid_info.origin_byte_counter_adj; + for (int i = 0; i < params->tid_info.npairs; ++i) { + tx_payload->tid_cts.tidpairs[i] = params->tidpairs[i]; + } + } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "fi_opx_reliability_service_do_replay &opx_ep->reliability->service %p, replay %p\n",&opx_ep->reliability->service, replay); + fi_opx_reliability_service_do_replay(&opx_ep->reliability->service,replay); + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + params->origin_rs, + params->origin_rx, + psn_ptr, + replay, + params->reliability, + OPX_HFI1_TYPE); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS %s RTS (end) (params=%p rzv_comp=%p context=%p)\n", @@ -1193,7 +1445,7 @@ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, struct fi_opx_hfi1_rx_rzv_rts_params *params, const uint64_t niov, const uint64_t immediate_data, - const uint64_t immediate_end_block_count, + const uint64_t immediate_tail, const uint64_t is_hmem, const uint64_t is_intranode, const enum fi_hmem_iface iface, @@ -1202,12 +1454,13 @@ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, if (is_intranode || !opx_ep->use_expected_tid_rzv || (niov != 1) + || (params->dput_iov[0].bytes < opx_ep->tx->tid_min_payload_bytes) || (opcode != FI_OPX_HFI_DPUT_OPCODE_RZV && opcode != FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG) || !fi_opx_hfi1_sdma_use_sdma(opx_ep, params->dput_iov[0].bytes, opcode, is_hmem, OPX_INTRANODE_FALSE) || (immediate_data == 0) - || (immediate_end_block_count == 0)) { + || (immediate_tail == 0)) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_tid_ineligible); return 0; @@ -1233,7 +1486,7 @@ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, /* First adjust for the start page alignment, using immediate data that was sent.*/ const int64_t alignment_adjustment = (uint64_t)params->dst_vaddr - vaddr; const int64_t length_with_adjustment = params->dput_iov[0].bytes + alignment_adjustment; - const int64_t new_length = length_with_adjustment & -64; + const int64_t new_length = length_with_adjustment & -8; const int64_t len_difference = new_length - params->dput_iov[0].bytes; if (alignment_adjustment) { @@ -1329,7 +1582,12 @@ union fi_opx_hfi1_deferred_work * 
opx_hfi1_rx_rzv_rts_tid_prep_cts( } assert(cur_addr_range_tid_len <= cts_params->rzv_comp->context->byte_counter); - cts_params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + cts_params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + } else { + cts_params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_16B; + } cts_params->work_elem.work_type = OPX_WORK_TYPE_PIO; return cts_work; @@ -1355,7 +1613,12 @@ int opx_hfi1_rx_rzv_rts_tid_fallback(union fi_opx_hfi1_deferred_work *work, params->dst_vaddr = params->dput_iov[params->cur_iov].rbuf; params->tid_info.npairs = 0; - params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_16B; + } params->work_elem.work_type = OPX_WORK_TYPE_PIO; params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; @@ -1367,7 +1630,8 @@ int opx_hfi1_rx_rzv_rts_tid_fallback(union fi_opx_hfi1_deferred_work *work, params->rzv_comp, params->rzv_comp->context); - return opx_hfi1_rx_rzv_rts_send_cts(work); + + return params->work_elem.work_fn(work); } int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) @@ -1427,7 +1691,12 @@ int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) if (last_cts) { assert(cts_work == work); - assert(work->work_elem.work_fn == opx_hfi1_rx_rzv_rts_send_cts); + + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + assert(work->work_elem.work_fn == opx_hfi1_rx_rzv_rts_send_cts); + } else { + assert(work->work_elem.work_fn == opx_hfi1_rx_rzv_rts_send_cts_16B); + } FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS RTS TID SETUP (end) SUCCESS (params=%p rzv_comp=%p context=%p)\n", params, @@ -1438,11 +1707,12 @@ int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) .expected_receive.rts_tid_setup_success); // This is the "FI_SUCCESS" exit point for this function - return opx_hfi1_rx_rzv_rts_send_cts(cts_work); + return cts_work->work_elem.work_fn(cts_work); } assert(cts_work != work); - int rc = opx_hfi1_rx_rzv_rts_send_cts(cts_work); + + int rc = cts_work->work_elem.work_fn(cts_work); if (rc == FI_SUCCESS) { OPX_BUF_FREE(cts_work); } else { @@ -1479,81 +1749,467 @@ int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) return -FI_EAGAIN; } -void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, - const void * const hdr, const void * const payload, - const uint8_t u8_rx, const uint64_t niov, - uintptr_t origin_byte_counter_vaddr, - union fi_opx_context *const target_context, - const uintptr_t dst_vaddr, - const enum fi_hmem_iface dst_iface, - const uint64_t dst_device, - const uint64_t immediate_data, - const uint64_t immediate_end_block_count, - const struct fi_opx_hmem_iov *src_iovs, - uint8_t opcode, - const unsigned is_intranode, - const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx) +int opx_hfi1_rx_rzv_rts_send_etrunc_intranode(union fi_opx_hfi1_deferred_work *work) { - const union fi_opx_hfi1_packet_hdr * const hfi1_hdr = - (const union fi_opx_hfi1_packet_hdr * const) hdr; - - OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS-HFI:%ld",hfi1_hdr->qw[6]); - union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); - assert(work != NULL); struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; - 
params->opx_ep = opx_ep; - params->work_elem.slist_entry.next = NULL; - assert(niov <= MIN(FI_OPX_MAX_HMEM_IOV, FI_OPX_MAX_DPUT_IOV)); + struct fi_opx_ep * opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; - const struct fi_opx_hmem_iov *src_iov = src_iovs; - uint64_t is_hmem = dst_iface; - uint64_t rbuf_offset = 0; - for(int i = 0; i < niov; i++) { -#ifdef OPX_HMEM - is_hmem |= src_iov->iface; -#endif - params->dput_iov[i].sbuf = src_iov->buf; - params->dput_iov[i].sbuf_iface = src_iov->iface; - params->dput_iov[i].sbuf_device = src_iov->device; - params->dput_iov[i].rbuf = dst_vaddr + rbuf_offset; - params->dput_iov[i].rbuf_iface = dst_iface; - params->dput_iov[i].rbuf_device = dst_device; - params->dput_iov[i].bytes = src_iov->len; - rbuf_offset += src_iov->len; - ++src_iov; + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, SHM -- RENDEZVOUS RTS ETRUNC (begin)\n"); + uint64_t pos; + /* Possible SHM connections required for certain applications (i.e., DAOS) + * exceeds the max value of the legacy u8_rx field. Use u32_extended field. + */ + ssize_t rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, + params->u32_extended_rx, params->target_hfi_unit); + + if (OFI_UNLIKELY(rc)) { + return -FI_EAGAIN; } - if (is_intranode) { - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u\n",is_intranode ); - params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode; - params->work_elem.work_type = OPX_WORK_TYPE_SHM; - if (hfi1_hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) { - params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; - } else { - struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(hfi1_hdr->stl.lrh.slid); - assert(hfi_lookup); - params->target_hfi_unit = hfi_lookup->hfi_unit; + union opx_hfi1_packet_hdr * const tx_hdr = + opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, + opx_ep->daos_info.rank_inst, &rc); + + if(!tx_hdr) return rc; + + /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), + because this is intranode and since it's a CTS packet, lrh.pktlen + isn't used/needed */ + tx_hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid; + tx_hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + tx_hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + tx_hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + tx_hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode; + tx_hdr->qw_9B[5] = params->origin_byte_counter_vaddr; + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, SHM -- RENDEZVOUS RTS ETRUNC (end)\n"); + + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_etrunc_intranode_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep * opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + const uint64_t lrh_dlid_16B = htons(lrh_dlid >> 16); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, SHM -- RENDEZVOUS RTS ETRUNC (begin)\n"); + uint64_t pos; + /* Possible SHM connections required for certain applications (i.e., DAOS) + * exceeds the max value of the legacy u8_rx 
field. Use u32_extended field. + */ + ssize_t rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, + params->u32_extended_rx, params->target_hfi_unit); + + if (OFI_UNLIKELY(rc)) { + return -FI_EAGAIN; + } + + union opx_hfi1_packet_hdr * const tx_hdr = + opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, + opx_ep->daos_info.rank_inst, &rc); + + if(!tx_hdr) return rc; + + /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), + because this is intranode and since it's a CTS packet, lrh.pktlen + isn't used/needed */ + tx_hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B)); + tx_hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + tx_hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + tx_hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + tx_hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + tx_hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode; + tx_hdr->qw_16B[6] = params->origin_byte_counter_vaddr; + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, SHM -- RENDEZVOUS RTS ETRUNC (end)\n"); + + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_etrunc(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (begin)\n"); + + const uint64_t pbc_dws = + 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1)) { + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN credits)\n"); + return -FI_EAGAIN; + } + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, + &opx_ep->reliability->state, + params->slid, + params->u8_rx, + params->origin_rs, + &psn_ptr, + &replay, + params->reliability, + OPX_HFI1_TYPE); + if(OFI_UNLIKELY(psn == -1)) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN psn/replay)\n"); + return -FI_EAGAIN; + } + + volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + + fi_opx_store_and_copy_scb_9B(scb, &replay->scb.scb_9B, + opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid, + 
opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | + ((uint64_t) lrh_dws << 32), + opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[3], + opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode, + params->origin_byte_counter_vaddr, 0); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + /* consume one credit */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + /* save the updated txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + params->origin_rs, + params->origin_rx, + psn_ptr, + replay, + params->reliability, + OPX_HFI1_TYPE); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (end)\n"); + + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t lrh_dlid_16B = htons(params->lrh_dlid >> 16); + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (begin)\n"); + + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2; /* ICRC/tail */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + // Note: only the opcode and origin_byte_counter_vaddr are needed for replaying the + // truncation error back to the sender, but a 16B header-only packet spills into a + // second SCB block, so two credits are needed here.
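
For reference, the arithmetic behind the two-credit reservation that follows, assuming one PIO credit maps to one 64-byte store buffer block and the PBC occupies the first two dwords of the first block (both assumptions for illustration, not statements from this patch):

    #include <stdint.h>
    #include <assert.h>

    /* one PIO send credit == one 64-byte SCB block (assumption) */
    static inline uint16_t header_only_credits(uint64_t pbc_dws)
    {
        return (uint16_t)(((pbc_dws * 4) + 63) / 64);
    }

    int main(void)
    {
        /* 9B header-only packet: 2 (pbc) + 2 (lrh) + 3 (bth) + 9 (kdeth) = 16 dws = 64 bytes */
        assert(header_only_credits(16) == 1);
        /* 16B header-only packet: 2 + 4 (lrh) + 3 + 9 + 2 (icrc/tail) = 20 dws = 80 bytes */
        assert(header_only_credits(20) == 2);
        return 0;
    }
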
+ if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 2) < 2)) { + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 2) < 2) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN credits)\n"); + return -FI_EAGAIN; + } + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, + &opx_ep->reliability->state, + params->slid, + params->u8_rx, + params->origin_rs, + &psn_ptr, + &replay, + params->reliability, + OPX_HFI1_TYPE); + if(OFI_UNLIKELY(psn == -1)) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN psn/replay)\n"); + return -FI_EAGAIN; + } + + volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + + fi_opx_store_and_copy_scb_16B(scb, &replay->scb.scb_16B, + opx_ep->rx->tx.cts_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, OPX_HFI1_JKR), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t) lrh_qws << 20), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[4], + opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode, + params->origin_byte_counter_vaddr); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + // 2nd cacheline + volatile uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + fi_opx_store_and_copy_qw(scb2, &replay->scb.scb_16B.hdr.qw_16B[7], + 0, 0, 0, 0, 0, 0, 0, 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + /* save the updated txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + params->origin_rs, + params->origin_rx, + psn_ptr, + replay, + params->reliability, + OPX_HFI1_TYPE); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (end)"); + + return FI_SUCCESS; +} + +void fi_opx_hfi1_rx_rzv_rts_etrunc (struct fi_opx_ep *opx_ep, + const union opx_hfi1_packet_hdr * const hdr, + const uint8_t u8_rx, + uintptr_t origin_byte_counter_vaddr, + const unsigned is_intranode, + const enum ofi_reliability_kind reliability, + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) +{ + + union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); + assert(work != NULL); + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + params->opx_ep = opx_ep; + params->work_elem.slist_entry.next = NULL; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u, opcode=%u\n", + is_intranode, FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC); + + if (is_intranode) { + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = 
opx_hfi1_rx_rzv_rts_send_etrunc_intranode; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_etrunc_intranode_16B; + } + params->work_elem.work_type = OPX_WORK_TYPE_SHM; + + uint32_t lid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + lid = hdr->lrh_9B.slid; + else + lid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + + if (lid == opx_ep->rx->self.uid.lid) { + params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; + } else { + struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(lid); + assert(hfi_lookup); + params->target_hfi_unit = hfi_lookup->hfi_unit; + } + } else { + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_etrunc; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_etrunc_16B; + } + params->work_elem.work_type = OPX_WORK_TYPE_PIO; + params->target_hfi_unit = 0xFF; + } + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->slid = hdr->lrh_9B.slid; + if (hfi1_type & OPX_HFI1_WFR) + params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + else + params->lrh_dlid = hdr->lrh_9B.slid << 16; + } else { + params->slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + params->lrh_dlid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid) << 16; // Send CTS to the SLID that sent RTS + } + + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); + params->origin_rx = hdr->rendezvous.origin_rx; + params->origin_rs = hdr->rendezvous.origin_rs; + params->u8_rx = u8_rx; + params->u32_extended_rx = u32_extended_rx; + params->origin_byte_counter_vaddr = origin_byte_counter_vaddr; + params->is_intranode = is_intranode; + params->reliability = reliability; + params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC; + + int rc = params->work_elem.work_fn(work); + if(rc == FI_SUCCESS) { + OPX_BUF_FREE(work); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_SUCCESS\n"); + return; + } + assert(rc == -FI_EAGAIN); + /* Try again later*/ + assert(work->work_elem.slist_entry.next == NULL); + slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); +} + +void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, + const union opx_hfi1_packet_hdr * const hdr, + const void * const payload, + const uint8_t u8_rx, const uint64_t niov, + uintptr_t origin_byte_counter_vaddr, + struct opx_context *const target_context, + const uintptr_t dst_vaddr, + const enum fi_hmem_iface dst_iface, + const uint64_t dst_device, + const uint64_t immediate_data, + const uint64_t immediate_end_bytes, + const struct fi_opx_hmem_iov *src_iovs, + uint8_t opcode, + const unsigned is_intranode, + const enum ofi_reliability_kind reliability, + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) +{ + + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS-HFI:%ld",hdr->qw_9B[6]); + union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); + assert(work != NULL); + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + params->opx_ep = opx_ep; + params->work_elem.slist_entry.next = NULL; + + assert(niov <= MIN(FI_OPX_MAX_HMEM_IOV, FI_OPX_MAX_DPUT_IOV)); + + const struct fi_opx_hmem_iov *src_iov = src_iovs; + uint64_t is_hmem = dst_iface; + uint64_t rbuf_offset = 0; + for(int i = 0; i < niov; i++) { +#ifdef OPX_HMEM + is_hmem |= src_iov->iface; +#endif + params->dput_iov[i].sbuf = src_iov->buf; + 
params->dput_iov[i].sbuf_iface = src_iov->iface; + params->dput_iov[i].sbuf_device = src_iov->device; + params->dput_iov[i].rbuf = dst_vaddr + rbuf_offset; + params->dput_iov[i].rbuf_iface = dst_iface; + params->dput_iov[i].rbuf_device = dst_device; + params->dput_iov[i].bytes = src_iov->len; + rbuf_offset += src_iov->len; + ++src_iov; + } + + if (is_intranode) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u\n",is_intranode ); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode; + else + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode_16B; + params->work_elem.work_type = OPX_WORK_TYPE_SHM; + + uint32_t lid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + lid = hdr->lrh_9B.slid; + else + lid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + + if (lid == opx_ep->rx->self.uid.lid) { + params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; + } else { + struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(lid); + assert(hfi_lookup); + params->target_hfi_unit = hfi_lookup->hfi_unit; } } else { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "opx_ep->use_expected_tid_rzv=%u niov=%lu opcode=%u\n", opx_ep->use_expected_tid_rzv, niov, params->opcode); - params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_16B; + } params->work_elem.work_type = OPX_WORK_TYPE_PIO; params->target_hfi_unit = 0xFF; } params->work_elem.completion_action = NULL; params->work_elem.payload_copy = NULL; params->work_elem.complete = false; - params->lrh_dlid = (hfi1_hdr->stl.lrh.qw[0] & 0xFFFF000000000000ul) >> 32; - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); - params->slid = hfi1_hdr->stl.lrh.slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->slid = hdr->lrh_9B.slid; + if (hfi1_type & OPX_HFI1_WFR) + params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + else + params->lrh_dlid = hdr->lrh_9B.slid << 16; + } else { + params->slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + params->lrh_dlid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid) << 16; // Send CTS to the SLID that sent RTS + } + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); - params->origin_rx = hfi1_hdr->rendezvous.origin_rx; - params->origin_rs = hfi1_hdr->rendezvous.origin_rs; + params->origin_rx = hdr->rendezvous.origin_rx; + params->origin_rs = hdr->rendezvous.origin_rs; params->u8_rx = u8_rx; params->u32_extended_rx = u32_extended_rx; params->niov = niov; @@ -1575,7 +2231,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, if (opx_hfi1_rx_rzv_rts_tid_eligible(opx_ep, params, niov, immediate_data, - immediate_end_block_count, + immediate_end_bytes, is_hmem, is_intranode, dst_iface, opcode)) { params->tid_info.cur_addr_range.buf = params->dput_iov[0].rbuf; @@ -1591,7 +2247,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, int rc = params->work_elem.work_fn(work); if(rc == FI_SUCCESS) { OPX_BUF_FREE(work); - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-HFI:%ld",hfi1_hdr->qw[6]); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-HFI:%ld",hdr->qw_9B[6]); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_SUCCESS\n"); return; } @@ -1599,18 +2255,12 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, /* Try 
again later*/ assert(work->work_elem.slist_entry.next == NULL); slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); - OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "RECV-RZV-RTS-HFI:%ld",hfi1_hdr->qw[6]); + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "RECV-RZV-RTS-HFI:%ld",hdr->qw_9B[6]); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); } int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work) { - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - const uint16_t lrh_dws = htons(pbc_dws - 1); - struct fi_opx_hfi1_rx_dput_fence_params *params = &work->fence; struct fi_opx_ep * opx_ep = params->opx_ep; @@ -1625,31 +2275,58 @@ int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work) return -FI_EAGAIN; } - union fi_opx_hfi1_packet_hdr *const tx_hdr = + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, opx_ep->daos_info.rank_inst, &rc); - if (tx_hdr == NULL) { + if (hdr == NULL) { return rc; } - tx_hdr->qw[0] = opx_ep->rx->tx.dput.hdr.qw[0] | params->lrh_dlid | ((uint64_t)lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.dput.hdr.qw[1] | params->bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.dput.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.dput.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE; - tx_hdr->qw[5] = (uint64_t)params->cc; - tx_hdr->qw[6] = params->bytes_to_fence; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const uint64_t pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + + hdr->qw_9B[0] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[0] | params->lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[1] | params->bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE; + hdr->qw_9B[5] = (uint64_t)params->cc; + hdr->qw_9B[6] = params->bytes_to_fence; + } else { + const uint64_t pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2; /* ICRC/tail */ + const uint16_t lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + hdr->qw_16B[0] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[0] | + ((uint64_t)(params->lrh_dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[1] | + ((uint64_t)((params->lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[2] | params->bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); + hdr->qw_16B[6] = (uintptr_t)params->cc; + hdr->qw_16B[7] = params->bytes_to_fence; + } - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); return FI_SUCCESS; } void opx_hfi1_dput_fence(struct fi_opx_ep 
*opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint8_t u8_rx, - const uint32_t u32_extended_rx) + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) { union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); assert(work != NULL); @@ -1662,16 +2339,26 @@ void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, params->work_elem.complete = false; params->work_elem.work_type = OPX_WORK_TYPE_SHM; - params->lrh_dlid = (hdr->stl.lrh.qw[0] & 0xFFFF000000000000ul) >> 32; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + else + params->lrh_dlid = hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid; + params->bth_rx = (uint64_t)u8_rx << 56; params->u8_rx = u8_rx; params->u32_extended_rx = u32_extended_rx; params->bytes_to_fence = hdr->dput.target.fence.bytes_to_fence; params->cc = (struct fi_opx_completion_counter *) hdr->dput.target.fence.completion_counter; - if (hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) { + uint32_t slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + slid = hdr->lrh_9B.slid; + else + slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + + if (slid == opx_ep->rx->self.uid.lid) { params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; } else { - struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(hdr->stl.lrh.slid); + struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(slid); assert(hfi_lookup); params->target_hfi_unit = hfi_lookup->hfi_unit; } @@ -1709,6 +2396,7 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) * as the dlid for the lrh header of the outgoing packet */ const uint64_t lrh_dlid = params->lrh_dlid; const uint64_t bth_rx = ((uint64_t)u8_rx) << 56; + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; enum fi_hmem_iface cbuf_iface = params->compare_iov.iface; uint64_t cbuf_device = params->compare_iov.device; @@ -1761,34 +2449,52 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) uint64_t bytes_to_send = dput_iov[i].bytes - params->bytes_sent; while (bytes_to_send > 0) { - uint64_t bytes_to_send_this_packet = MIN(bytes_to_send + params->payload_bytes_for_iovec, + uint64_t bytes_to_send_this_packet; + uint64_t blocks_to_send_in_this_packet; + uint64_t pbc_dws; + uint16_t lrh_dws; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + bytes_to_send_this_packet = MIN(bytes_to_send + params->payload_bytes_for_iovec, max_bytes_per_packet); - uint64_t tail_bytes = bytes_to_send_this_packet & 0x3Ful; - uint64_t blocks_to_send_in_this_packet = (bytes_to_send_this_packet >> 6) + (tail_bytes ? 1 : 0); - - const uint64_t pbc_dws = 2 + /* pbc */ + uint64_t tail_bytes = bytes_to_send_this_packet & 0x3Ful; + blocks_to_send_in_this_packet = (bytes_to_send_this_packet >> 6) + (tail_bytes ? 
1 : 0); + pbc_dws = 2 + /* pbc */ 2 + /* lrh */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (blocks_to_send_in_this_packet << 4); - - const uint16_t lrh_dws = htons(pbc_dws - 1); + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + } else { + /* 1 QW for hdr that spills to 2nd cacheline + 1 QW for ICRC/tail */ + const uint64_t additional_hdr_tail_byte = 2 * 8; + uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, + max_bytes_per_packet)); + uint64_t tail_bytes = payload_n_additional_hdr_tail_bytes & 0x3Ful; + blocks_to_send_in_this_packet = (payload_n_additional_hdr_tail_bytes >> 6) + (tail_bytes ? 1 : 0); + bytes_to_send_this_packet = payload_n_additional_hdr_tail_bytes - additional_hdr_tail_byte; + pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 7 + /* kdeth */ + (blocks_to_send_in_this_packet << 4); // ICRC and the kdeth in the second cacheline are accounted for here + lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + } uint64_t bytes_sent; if (is_intranode) { uint64_t pos; - union fi_opx_hfi1_packet_hdr * tx_hdr = + union opx_hfi1_packet_hdr * hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, u8_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, opx_ep->daos_info.rank_inst, &rc); - if(!tx_hdr) return rc; + if(!hdr) return rc; union fi_opx_hfi1_packet_payload * const tx_payload = - (union fi_opx_hfi1_packet_payload *)(tx_hdr+1); + (union fi_opx_hfi1_packet_payload *)(hdr+1); bytes_sent = opx_hfi1_dput_write_header_and_payload( - opx_ep, tx_hdr, tx_payload, + opx_ep, hdr, tx_payload, opcode, 0, lrh_dws, op64, dt64, lrh_dlid, bth_rx, bytes_to_send_this_packet, key, @@ -1798,13 +2504,14 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) params->bytes_sent, &sbuf, sbuf_iface, sbuf_device, (uint8_t **) ¶ms->compare_vaddr, - cbuf_iface, cbuf_device, &rbuf); + cbuf_iface, cbuf_device, &rbuf, + hfi1_type); - opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); } else { union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - const uint16_t credits_needed = blocks_to_send_in_this_packet - + 1 /* header */; + + const uint16_t credits_needed = blocks_to_send_in_this_packet + 1 /* header */; uint32_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, @@ -1827,8 +2534,9 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; - psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, params->slid, - u8_rx, params->origin_rs, &psn_ptr, &replay, reliability); + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, + params->slid, u8_rx, params->origin_rs, &psn_ptr, + &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } @@ -1838,13 +2546,20 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) (union fi_opx_hfi1_packet_payload *) replay->payload; assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - replay->scb.qw0 = opx_ep->rx->tx.dput.qw0 | - OPX_PBC_LEN(pbc_dws) | - OPX_PBC_CR(opx_ep->tx->force_credit_return) | - params->pbc_dlid; + if (hfi1_type & OPX_HFI1_JKR) { + replay->scb.scb_16B.qw0 = 
opx_ep->rx->tx.dput_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + params->pbc_dlid; + } else { + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + params->pbc_dlid; + } bytes_sent = opx_hfi1_dput_write_header_and_payload( - opx_ep, &replay->scb.hdr, replay_payload, + opx_ep, OPX_REPLAY_HDR(replay), replay_payload, opcode, psn, lrh_dws, op64, dt64, lrh_dlid, bth_rx, bytes_to_send_this_packet, key, @@ -1854,11 +2569,26 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) params->bytes_sent, &sbuf, sbuf_iface, sbuf_device, (uint8_t **) ¶ms->compare_vaddr, - cbuf_iface, cbuf_device, &rbuf); + cbuf_iface, cbuf_device, &rbuf, hfi1_type); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); if (opcode == FI_OPX_HFI_DPUT_OPCODE_PUT) { + if (bytes_to_send == bytes_sent) { + /* This is the last packet to send for this PUT. + Turn on the immediate ACK request bit so the + user gets control of their buffer back ASAP */ + const uint64_t set_ack_bit = (uint64_t)htonl(0x80000000); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + replay->scb.scb_9B.hdr.qw_9B[2] |= set_ack_bit; + replay->scb.scb_9B.hdr.dput.target.last_bytes = + replay->scb.scb_9B.hdr.dput.target.bytes; + } else { + replay->scb.scb_16B.hdr.qw_16B[3] |= set_ack_bit; + replay->scb.scb_16B.hdr.dput.target.last_bytes = + replay->scb.scb_16B.hdr.dput.target.bytes; + } + } fi_opx_reliability_client_replay_register_with_update( &opx_ep->reliability->state, params->slid, params->origin_rs, u8_rx, psn_ptr, replay, cc, @@ -1870,8 +2600,8 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) fi_opx_compiler_msync_writes(); fi_opx_reliability_client_replay_register_no_update( - &opx_ep->reliability->state, params->slid, - params->origin_rs, u8_rx, psn_ptr, replay, reliability); + &opx_ep->reliability->state, + params->origin_rs, u8_rx, psn_ptr, replay, reliability, hfi1_type); } } @@ -1887,7 +2617,7 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) if (opcode == FI_OPX_HFI_DPUT_OPCODE_PUT && is_intranode) { // RMA-type put, so send a ping/fence to better latency fi_opx_shm_write_fence(opx_ep, params->target_hfi_unit, u8_rx, lrh_dlid, cc, params->bytes_sent, - params->u32_extended_rx); + params->u32_extended_rx, hfi1_type); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-DPUT-%s", is_intranode ? "SHM" : "HFI"); @@ -1993,7 +2723,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) // We should never be in this function for intranode ops assert(!params->is_intranode); - assert(opx_ep->rx->tx.dput.hdr.stl.lrh.slid != params->slid); + assert(opx_ep->rx->tx.dput_9B.hdr.lrh_9B.slid != params->slid); assert(((opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH || opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH) && @@ -2145,21 +2875,41 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) // Round packet_bytes up to the next multiple of 4, // then divide by 4 to get the correct number of dws. 
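
Before the dword math repeats below, the two LRH length encodings used throughout these hunks reduce to a pair of one-line helpers. This is an illustrative distillation rather than code from the patch: the 9B LRH carries a big-endian dword count that excludes the PBC and includes the ICRC, while the 16B LRH carries a host-order quadword count that excludes only the PBC.

    #include <stdint.h>
    #include <arpa/inet.h> /* htons */

    /* 9B: big-endian DW count; -2 drops the 8-byte PBC, +1 adds the 4-byte ICRC */
    static inline uint16_t lrh_len_9b(uint64_t pbc_dws)
    {
        return htons((uint16_t)(pbc_dws - 2 + 1));
    }

    /* 16B: QW count; -2 drops the PBC, then DWs become QWs (ICRC/tail already in pbc_dws) */
    static inline uint16_t lrh_len_16b(uint64_t pbc_dws)
    {
        return (uint16_t)((pbc_dws - 2) >> 1);
    }
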
uint64_t payload_dws = ((packet_bytes + 3) & -4) >> 2; - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - payload_dws; - - const uint16_t lrh_dws = htons(pbc_dws - 1); + uint64_t pbc_dws; + uint16_t lrh_dws; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + payload_dws; + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + } else { + pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 + /* ICRC/tail */ + payload_dws; + lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + } assert(replay != NULL); - replay->scb.qw0 = opx_ep->rx->tx.dput.qw0 | OPX_PBC_LEN(pbc_dws) | - params->pbc_dlid; + + if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | + params->pbc_dlid; + } else { + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | + params->pbc_dlid; + } + uint64_t bytes_sent = opx_hfi1_dput_write_header_and_iov( - opx_ep, &replay->scb.hdr, + opx_ep, OPX_REPLAY_HDR(replay), replay->iov, opcode, lrh_dws, op64, dt64, lrh_dlid, bth_rx, packet_bytes, key, @@ -2168,7 +2918,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) params->rma_request_vaddr, params->bytes_sent, &sbuf_tmp, (uint8_t **) ¶ms->compare_vaddr, - &rbuf); + &rbuf, OPX_HFI1_TYPE); params->cc->byte_counter += params->payload_bytes_for_iovec; fi_opx_hfi1_sdma_add_packet(params->sdma_we, replay, packet_bytes); @@ -2188,6 +2938,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) return -FI_EAGAIN; } + opx_hfi1_sdma_flush(opx_ep, params->sdma_we, ¶ms->sdma_reqs, @@ -2226,8 +2977,6 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) assert((*params->origin_byte_counter) >= params->origin_bytes_sent); *params->origin_byte_counter -= params->origin_bytes_sent; params->origin_byte_counter = NULL; - } else { - assert(params->origin_bytes_sent <= *params->origin_byte_counter); } params->work_elem.work_type = OPX_WORK_TYPE_LAST; params->work_elem.work_fn = fi_opx_hfi1_dput_sdma_pending_completion; @@ -2268,7 +3017,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) // We should never be in this function for intranode ops assert(!params->is_intranode); - assert(opx_ep->rx->tx.dput.hdr.stl.lrh.slid != params->slid); + assert(opx_ep->rx->tx.dput_9B.hdr.lrh_9B.slid != params->slid); assert((opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID) && (params->payload_bytes_for_iovec == 0)); @@ -2569,17 +3318,38 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) // Round packet_bytes up to the next multiple of 4, // then divide by 4 to get the correct number of dws. 
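The TID path below applies the same pattern but pads differently for 16B; a worked comparison of the two rounding idioms (numbers ours, not from the patch):

    /* 9B rounds the payload up to a DW (4-byte) boundary:
     *   packet_bytes = 4092  ->  ((4092 + 3) & -4) >> 2  = 1023 dws
     * 16B rounds up to a QW (8-byte) boundary, still counted in dws:
     *   packet_bytes = 4092  ->  ((4092 + 7) & -8) >> 2  = 1024 dws
     * The results differ exactly when packet_bytes % 8 falls in 1..4. */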
- uint64_t payload_dws = (packet_bytes + 3) >> 2; - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - payload_dws; + uint64_t pbc_dws; + uint16_t lrh_dws; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + uint64_t payload_dws = (packet_bytes + 3) >> 2; + pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + payload_dws; + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + } else { + uint64_t payload_dws = ((packet_bytes + 7) & -8) >> 2;/* 16B is QW length/padded */ + pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 + /* ICRC/tail */ + payload_dws; + lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + } - const uint16_t lrh_dws = htons(pbc_dws - 1); + assert(replay != NULL); - replay->scb.qw0 = opx_ep->rx->tx.dput.qw0 | OPX_PBC_LEN(pbc_dws) | - params->pbc_dlid; + if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | + params->pbc_dlid; + } else { + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | + params->pbc_dlid; + } /* The fetch_vaddr and cbuf arguments are only used for atomic fetch operations, which by their one- @@ -2587,14 +3357,14 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) hard-coded to 0/NULL respectively */ uint64_t bytes_sent = opx_hfi1_dput_write_header_and_iov( - opx_ep, &replay->scb.hdr, + opx_ep, OPX_REPLAY_HDR(replay), replay->iov, opcode, lrh_dws, op64, dt64, lrh_dlid, bth_rx, packet_bytes, key, 0ul, target_byte_counter_vaddr, params->rma_request_vaddr, params->bytes_sent, &sbuf_tmp, - NULL, &rbuf); + NULL, &rbuf, OPX_HFI1_TYPE); /* tid packets are page aligned and 4k/8k length except first TID and last (remnant) packet */ assert((tididx == 0) || (first_tid_last_packet) || @@ -2675,7 +3445,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ep, struct fi_opx_mr * opx_mr, - const void * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const void * const payload, size_t payload_bytes_to_copy, const uint8_t u8_rx, @@ -2691,9 +3461,9 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ void (*completion_action)(union fi_opx_hfi1_deferred_work * work_state), const unsigned is_intranode, const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx) { - const union fi_opx_hfi1_packet_hdr * const hfi1_hdr = - (const union fi_opx_hfi1_packet_hdr * const) hdr; + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) +{ union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); struct fi_opx_hfi1_dput_params *params = &work->dput; @@ -2704,9 +3474,14 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ params->work_elem.complete = false; params->opx_ep = opx_ep; params->opx_mr = opx_mr; - params->lrh_dlid = (hfi1_hdr->stl.lrh.qw[0] & 0xFFFF000000000000ul) >> 32; - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); - params->slid = hfi1_hdr->stl.lrh.slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->slid = hdr->lrh_9B.slid; + params->lrh_dlid = 
(hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + } else { + params->slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + params->lrh_dlid = (htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid)) << 16; // Send dput to the SLID that sent CTS + } + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); params->origin_rs = origin_rs; params->u8_rx = u8_rx; params->u32_extended_rx = u32_extended_rx; @@ -2729,10 +3504,10 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ params->is_intranode = is_intranode; params->reliability = reliability; if (is_intranode) { - if (hfi1_hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) { + if (params->slid == opx_ep->rx->self.uid.lid) { params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; } else { - struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(hfi1_hdr->stl.lrh.slid); + struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(params->slid); assert(hfi_lookup); params->target_hfi_unit = hfi_lookup->hfi_unit; } @@ -2756,7 +3531,7 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ uint32_t *tidpairs = NULL; if (opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID) { - ntidpairs = hfi1_hdr->cts.target.vaddr.ntidpairs; + ntidpairs = hdr->cts.target.vaddr.ntidpairs; if (ntidpairs) { union fi_opx_hfi1_packet_payload *tid_payload = (union fi_opx_hfi1_packet_payload *) payload; @@ -2820,13 +3595,15 @@ uint64_t num_sends; uint64_t total_sendv_bytes; ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context, const uint32_t data, int lock_required, + void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { // We should already have grabbed the lock prior to calling this function assert(!lock_required); @@ -2836,7 +3613,6 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(addr.fi); assert(niov <= MIN(FI_OPX_MAX_DPUT_IOV, FI_OPX_MAX_HMEM_IOV)); - *origin_byte_counter_value = total_len; FI_OPX_DEBUG_COUNTERS_DECLARE_TMP(hmem_non_system); @@ -2852,89 +3628,558 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz // Calculate space for each IOV, then add in the origin_byte_counter_vaddr, // and round to the next 64-byte block. + const uint64_t icrc_and_tail_block = ((hfi1_type == OPX_HFI1_JKR) ? 
1 : 0); const uint64_t payload_blocks_total = ((niov * sizeof(struct fi_opx_hmem_iov)) + - sizeof(uintptr_t) + 63) >> 6; + sizeof(uintptr_t) + icrc_and_tail_block + 63) >> 6; assert(payload_blocks_total > 0 && payload_blocks_total < (FI_OPX_HFI1_PACKET_MTU >> 6)); - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lhr */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (payload_blocks_total << 4); + uint64_t pbc_dws; + uint16_t lrh_dws; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (payload_blocks_total << 4); - const uint16_t lrh_dws = htons(pbc_dws - 1); + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + } else { + pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (payload_blocks_total << 4); /* ICRC/tail is accounted for here */ + lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + } if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_DBG_TRACE( fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- RENDEZVOUS RTS Noncontig (begin) context %p\n",context); + "===================================== SENDV, SHM -- RENDEZVOUS RTS Noncontig (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-RZV-RTS-NONCONTIG-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( &opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); if (!hdr) return rc; - hdr->qw[0] = opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw[1] = opx_ep->tx->rzv.hdr.qw[1] | bth_rx | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = total_len; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); + hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; + hdr->qw_9B[5] = total_len; + hdr->qw_9B[6] = tag; + } else { + const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + hdr->qw_16B[0] = opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); + hdr->qw_16B[3] = opx_ep->tx->rzv_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; + hdr->qw_16B[6] = total_len; + hdr->qw_16B[7] = tag; + } + + union fi_opx_hfi1_packet_payload *const payload = + (union fi_opx_hfi1_packet_payload *)(hdr + 1); + + payload->rendezvous.noncontiguous.origin_byte_counter_vaddr = origin_byte_counter_vaddr; + struct fi_opx_hmem_iov *payload_iov = &payload->rendezvous.noncontiguous.iov[0]; + struct iovec *input_iov = (struct iovec *) iov; + + for (int i = 0; i < niov; i++) { +#ifdef OPX_HMEM + // TODO: desc is plumbed into this function as a single pointer + // only representing the first IOV. It should be changed + // to void ** to get an array of desc, one for each IOV. + // For now, just use the first iov's desc, assuming all + // the IOVs will reside in the same HMEM space. + FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_iface != FI_HMEM_SYSTEM, hmem_non_system); +#endif + payload_iov->buf = (uintptr_t) input_iov->iov_base; + payload_iov->len = input_iov->iov_len; + payload_iov->device = hmem_device; + payload_iov->iface = hmem_iface; + payload_iov++; + input_iov++; + } + + FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_non_system, + opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.rzv_noncontig); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); + + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, total_len, + lock_required, tag, caps); + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-RZV-RTS-NONCONTIG-SHM"); + FI_DBG_TRACE( + fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, SHM -- RENDEZVOUS RTS (end) context %p\n", + user_context); + fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); + return FI_SUCCESS; + } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, HFI -- RENDEZVOUS RTS (begin) context %p\n", + user_context); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-RZV-RTS-HFI"); + + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + const uint16_t total_credits_needed = 1 + /* packet header */ + payload_blocks_total; /* packet payload */ + + uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) { + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, total_credits_needed); + if (total_credits_available < total_credits_needed) { + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + return -FI_EAGAIN; + } + } + + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = total_len; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + if (OFI_LIKELY(do_cq_completion)) { + OPX_BUF_FREE(context); + } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); + return -FI_EAGAIN; + } + + struct fi_opx_hmem_iov hmem_iov[FI_OPX_MAX_HMEM_IOV]; + unsigned hmem_niov = MIN(niov, FI_OPX_MAX_HMEM_IOV); + for (int i = 0; i < hmem_niov; ++i) { + hmem_iov[i].buf = (uintptr_t) iov[i].iov_base; + hmem_iov[i].len = iov[i].iov_len; +#ifdef OPX_HMEM + uint64_t device; + hmem_iov[i].iface = fi_opx_hmem_get_iface(iov[i].iov_base, desc, &device); + hmem_iov[i].device = device; + FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_iov[i].iface != FI_HMEM_SYSTEM, hmem_non_system); +#else + hmem_iov[i].iface = FI_HMEM_SYSTEM; + hmem_iov[i].device = 0; +#endif + } + FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_non_system, + opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.rzv_noncontig); + + assert(opx_ep->tx->rzv_9B.qw0 == 0); + const uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); + + volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + uint64_t local_temp[16] = {0}; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->rzv_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), + opx_ep->tx->rzv_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, + total_len, tag); + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); + } else { + const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->rzv_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), + opx_ep->tx->rzv_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, + total_len); + } + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + /* consume one credit for the packet header */ + --total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + unsigned credits_consumed = 1; +#endif + + /* write the payload */ + uint64_t *iov_qws = (uint64_t *) &hmem_iov[0]; + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + uint64_t local_temp_payload[16] = {0}; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb_payload, local_temp_payload, + origin_byte_counter_vaddr, + iov_qws[0], + iov_qws[1], + iov_qws[2], + iov_qws[3], + iov_qws[4], + iov_qws[5], + iov_qws[6]); + iov_qws += 7; + } else { + fi_opx_store_and_copy_qw(scb_payload, local_temp_payload, + tag, + origin_byte_counter_vaddr, + iov_qws[0], + iov_qws[1], + iov_qws[2], + iov_qws[3], + iov_qws[4], + iov_qws[5]); + iov_qws += 6; + } + + /* consume one credit for the rendezvous payload metadata */ + --total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + ++credits_consumed; +#endif + + uint64_t * replay_payload = replay->payload; + assert(!replay->use_iov); + assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); + uint64_t rem_payload_size; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_copy_cacheline(replay_payload, local_temp_payload); + replay_payload += FI_OPX_CACHE_LINE_QWS; + rem_payload_size = sizeof(struct fi_opx_hmem_iov) * (niov - 2); + } else { + local_temp[7] = local_temp_payload[0]; + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_temp); + fi_opx_copy_cacheline(replay_payload, &local_temp_payload[1]); + replay_payload += 7; + rem_payload_size = (sizeof(struct fi_opx_hmem_iov) * (niov - 2) + 8); // overflow 8 bytes from 2nd cacheline + } + + if (payload_blocks_total > 1) { + assert(niov > 2); + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + iov_qws, + payload_blocks_total - 1, + total_credits_available); + + memcpy(replay_payload, iov_qws, rem_payload_size); + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); +#ifndef NDEBUG + assert(credits_consumed == total_credits_needed); +#endif + + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + addr.reliability_rx, dest_rx, + psn_ptr, replay, reliability, + hfi1_type); + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, total_len, + lock_required, tag, caps); + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-RZV-RTS-HFI"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, HFI -- RENDEZVOUS RTS (end) context %p\n",context); + + return FI_SUCCESS; +} + +ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, + const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void *user_context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t 
do_cq_completion, + const enum fi_hmem_iface src_iface, + const uint64_t src_device_id, + const enum opx_hfi1_type hfi1_type) +{ + // We should already have grabbed the lock prior to calling this function + assert(!lock_required); + + // Need at least one full block of payload + assert(len >= FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES); + + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + + const uint64_t is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps); + +#ifndef NDEBUG + const uint64_t max_immediate_block_count = (FI_OPX_HFI1_PACKET_MTU >> 6) - 2; +#endif + /* Expected tid needs to send a leading data block and trailing data + * for alignment. TID writes must start on a 64-byte boundary, so we + * need to send 64 bytes of leading immediate data that allow us + * to shift the receive buffer starting offset to a TID-friendly value. + * TID writes must also be a length that is a multiple of a DW (WFR & JKR 9B) + * or a QW (JKR), so send the last 7 bytes of the source data immediately + * so we can adjust the length after proper alignment has been achieved. */ + const uint8_t immediate_block = (!is_intranode && opx_ep->use_expected_tid_rzv && + len >= opx_ep->tx->sdma_min_payload_bytes && + len >= opx_ep->tx->tid_min_payload_bytes) ? 1 : 0; + const uint8_t immediate_tail = immediate_block; + + assert(immediate_block <= 1); + assert(immediate_tail <= 1); + assert(immediate_block <= max_immediate_block_count); + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + + const uint8_t immediate_byte_count = (uint8_t) (len & 0x0007ul); + const uint8_t immediate_qw_count = (uint8_t) ((len >> 3) & 0x0007ul); + const uint8_t immediate_fragment = (uint8_t) (((len & 0x003Ful) + 63) >> 6); + assert(immediate_fragment == 1 || immediate_fragment == 0); + + /* Immediate total does not include trailing block */ + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block * sizeof(union cacheline); + + union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + .count = (immediate_byte_count << OPX_IMMEDIATE_BYTE_COUNT_SHIFT) | + (immediate_qw_count << OPX_IMMEDIATE_QW_COUNT_SHIFT) | + (immediate_block << OPX_IMMEDIATE_BLOCK_SHIFT) | + (immediate_tail << OPX_IMMEDIATE_TAIL_SHIFT), + .tail_bytes = {} + }; + + assert(((len - immediate_total) & 0x003Fu) == 0); + + const uint64_t payload_blocks_total = + 1 + /* rzv metadata */ + immediate_fragment + + immediate_block; + + const uint64_t pbc_dws = + 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (payload_blocks_total << 4); + + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + + if (is_intranode) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, SHM -- RENDEZVOUS RTS (begin) context %p\n", + user_context); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr * const hdr = + opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, + opx_ep->daos_info.rank_inst, &rc); + + if (!hdr) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"return %zd\n",rc); + return rc; + } + + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; 
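A worked example of the immediate-data split computed above, assuming immediate_block is 0 (values ours, not from the patch):

    /* len = 200 bytes:
     *   immediate_byte_count = 200 & 0x7                = 0
     *   immediate_qw_count   = (200 >> 3) & 0x7         = 1  (8 bytes)
     *   immediate_fragment   = ((200 & 0x3F) + 63) >> 6 = 1
     *   immediate_total      = 0 + 1*8 + 0*64           = 8 bytes sent inline
     * len - immediate_total = 192, a whole number of 64-byte blocks,
     * which is exactly what the assert above enforces. */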
+ if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + + FI_OPX_DEBUG_COUNTERS_INC_COND(src_iface != FI_HMEM_SYSTEM, + opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.rzv); + + hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); + + hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (1ull << 48); /* effectively 1 iov */ + hdr->qw_9B[5] = len; + hdr->qw_9B[6] = tag; + + union fi_opx_hfi1_packet_payload * const payload = + (union fi_opx_hfi1_packet_payload *)(hdr+1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"hdr %p, payload %p, sbuf %p, sbuf+immediate_total %p, immediate_total %#lX, adj len %#lX\n", + hdr, payload, + buf, ((char*)buf + immediate_total),immediate_total, (len - immediate_total)); + + struct opx_payload_rzv_contig *contiguous = &payload->rendezvous.contiguous; + payload->rendezvous.contig_9B_padding = 0; + contiguous->src_vaddr = (uintptr_t)buf + immediate_total; + contiguous->src_blocks = (len - immediate_total) >> 6; + contiguous->src_device_id = src_device_id; + contiguous->src_iface = (uint64_t) src_iface; + contiguous->immediate_info = immediate_info.qw0; + contiguous->origin_byte_counter_vaddr = origin_byte_counter_vaddr; + contiguous->unused = 0; + + + if (immediate_total) { + uint8_t *sbuf; + if (src_iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, + opx_ep->hmem_copy_buf, buf, immediate_total, + desc_mr ? 
OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + sbuf = opx_ep->hmem_copy_buf; + } else { + sbuf = (uint8_t *) buf; + } + + for (int i = 0; i < immediate_byte_count; ++i) { + contiguous->immediate_byte[i] = sbuf[i]; + } + sbuf += immediate_byte_count; - hdr->qw[2] = opx_ep->tx->rzv.hdr.qw[2]; - hdr->qw[3] = opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32); - hdr->qw[4] = opx_ep->tx->rzv.hdr.qw[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; - hdr->qw[5] = total_len; - hdr->qw[6] = tag; + uint64_t * sbuf_qw = (uint64_t *)sbuf; + for (int i = 0; i < immediate_qw_count; ++i) { + contiguous->immediate_qw[i] = sbuf_qw[i]; + } - union fi_opx_hfi1_packet_payload *const payload = - (union fi_opx_hfi1_packet_payload *)(hdr + 1); + if (immediate_block) { + sbuf_qw += immediate_qw_count; + uint64_t *payload_cacheline = + (uint64_t *)(&contiguous->cache_line_1 + immediate_fragment); + fi_opx_copy_cacheline(payload_cacheline, sbuf_qw); + } + } - payload->rendezvous.noncontiguous.origin_byte_counter_vaddr = origin_byte_counter_vaddr; - struct fi_opx_hmem_iov *payload_iov = &payload->rendezvous.noncontiguous.iov[0]; - struct iovec *input_iov = (struct iovec *) iov; + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); - for (int i = 0; i < niov; i++) { -#ifdef OPX_HMEM - // TODO: desc is plumbed into this function as a single pointer - // only representing the first IOV. It should be changed - // to void ** to get an array of desc, one for each IOV. - // For now, just use the first iov's desc, assuming all - // the IOVs will reside in the same HMEM space. - FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_iface != FI_HMEM_SYSTEM, hmem_non_system); -#endif - payload_iov->buf = (uintptr_t) input_iov->iov_base; - payload_iov->len = input_iov->iov_len; - payload_iov->device = hmem_device; - payload_iov->iface = hmem_iface; - payload_iov++; - input_iov++; + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, + lock_required, tag, caps); } - FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_non_system, - opx_ep->debug_counters.hmem.intranode - .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.rzv_noncontig); - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-SHM"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, SHM -- RENDEZVOUS RTS (end) context %p\n", + user_context); - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-RZV-RTS-NONCONTIG-SHM"); - FI_DBG_TRACE( - fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- RENDEZVOUS RTS (end) context %p\n",context); - fi_opx_shm_poll_many(&opx_ep->ep_fid, 0); return FI_SUCCESS; } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, HFI -- RENDEZVOUS RTS (begin) context %p\n",context); - OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-RZV-RTS-HFI"); + "===================================== SEND, HFI -- RENDEZVOUS RTS (begin) context %p\n", + user_context); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-HFI:%ld", tag); + + /* + * While the bulk of the payload data will be sent via SDMA once we + * get the CTS from the receiver, the initial RTS packet is sent via PIO. 
+ */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - const uint16_t total_credits_needed = 1 + /* packet header */ - payload_blocks_total; /* packet payload */ - uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, total_credits_needed); + const uint16_t total_credits_needed = + 1 + /* packet header */ + payload_blocks_total; /* packet payload */ + + uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, + total_credits_needed); if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, @@ -2945,139 +4190,222 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz } } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + struct fi_opx_reliability_tx_replay *replay; union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); - if(OFI_UNLIKELY(psn == -1)) { + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + if (OFI_LIKELY(do_cq_completion)) { + OPX_BUF_FREE(context); + } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; } - struct fi_opx_hmem_iov hmem_iov[FI_OPX_MAX_HMEM_IOV]; - unsigned hmem_niov = MIN(niov, FI_OPX_MAX_HMEM_IOV); - for (int i = 0; i < hmem_niov; ++i) { - hmem_iov[i].buf = (uintptr_t) iov[i].iov_base; - hmem_iov[i].len = iov[i].iov_len; -#ifdef OPX_HMEM - uint64_t device; - hmem_iov[i].iface = fi_opx_hmem_get_iface(iov[i].iov_base, desc, &device); - hmem_iov[i].device = device; - FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_iov[i].iface != FI_HMEM_SYSTEM, hmem_non_system); -#else - hmem_iov[i].iface = FI_HMEM_SYSTEM; - hmem_iov[i].device = 0; -#endif - } - FI_OPX_DEBUG_COUNTERS_INC_COND(hmem_non_system, - opx_ep->debug_counters.hmem.hfi + FI_OPX_DEBUG_COUNTERS_INC_COND(src_iface != FI_HMEM_SYSTEM, opx_ep->debug_counters.hmem.hfi .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.rzv_noncontig); + .send.rzv); - assert(opx_ep->tx->rzv.qw0 == 0); - const uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return); + if (immediate_tail) { + uint8_t *buf_tail_bytes = ((uint8_t *)buf + len) - OPX_IMMEDIATE_TAIL_BYTE_COUNT; + if (src_iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, + opx_ep->hmem_copy_buf, buf_tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, + desc_mr ? 
OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf_tail_bytes = opx_ep->hmem_copy_buf; + } - volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; - - fi_opx_set_scb(scb, tmp, - opx_ep->tx->rzv.qw0 | OPX_PBC_LEN(pbc_dws) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->rzv.hdr.qw[1] | bth_rx | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), - opx_ep->tx->rzv.hdr.qw[2] | psn, - opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->rzv.hdr.qw[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, - total_len, tag); + for (int i = 0; i < OPX_IMMEDIATE_TAIL_BYTE_COUNT; ++i) { + immediate_info.tail_bytes[i] = buf_tail_bytes[i]; + } + } - FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + /* + * Write the 'start of packet' (hw+sw header) 'send control block' + * which will consume a single pio credit. + */ + + uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); + volatile uint64_t * const scb = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + + uint64_t temp[8]; + + fi_opx_store_and_copy_qw(scb, temp, + opx_ep->tx->rzv_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), + opx_ep->tx->rzv_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (1ull << 48), + len, tag); /* consume one credit for the packet header */ - --total_credits_available; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG unsigned credits_consumed = 1; #endif - fi_opx_copy_cacheline(&replay->scb.qw0, tmp); + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - /* write the payload */ - uint64_t *iov_qws = (uint64_t *) &hmem_iov[0]; - volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, temp); - fi_opx_set_scb(scb_payload, tmp, - origin_byte_counter_vaddr, - iov_qws[0], - iov_qws[1], - iov_qws[2], - iov_qws[3], - iov_qws[4], - iov_qws[5], - iov_qws[6]); + /* + * write the rendezvous payload "send control blocks" + */ + + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_and_copy_qw(scb_payload, temp, + 0, /* contig_9B_padding */ + (uintptr_t)buf + immediate_total, /* src_vaddr */ + (len - immediate_total) >> 6, /* src_blocks */ + src_device_id, + (uint64_t) src_iface, + immediate_info.qw0, + origin_byte_counter_vaddr, + 0 /* unused */); /* consume one credit for the rendezvous payload metadata */ - --total_credits_available; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG ++credits_consumed; #endif uint64_t * replay_payload = replay->payload; + assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - fi_opx_copy_cacheline(replay_payload, tmp); - replay_payload += 8; + 
fi_opx_copy_cacheline(replay_payload, temp); + replay_payload += FI_OPX_CACHE_LINE_QWS; - if (payload_blocks_total > 1) { - assert(niov > 2); + uint8_t *sbuf; + if (src_iface != FI_HMEM_SYSTEM && immediate_total) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, + opx_ep->hmem_copy_buf, buf, immediate_total, + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + sbuf = opx_ep->hmem_copy_buf; + } else { + sbuf = (uint8_t *) buf; + } + + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + /* immediate_byte and immediate_qw are "packed" in the current implementation */ + /* meaning the immediate bytes are filled, then followed by the rest of the data directly */ + /* adjacent to the packed bytes. It's probably more efficient to leave a pad and not go */ + /* through the confusion of finding these boundaries on both sides of the rendezvous */ + /* That is, just pack the immediate bytes, then pack the "rest" in the immediate qws */ + /* This would lead to more efficient packing on both sides at the expense of */ + /* wasting space of a common 0 byte immediate */ + /* tmp_payload_t represents the second cache line of the rts packet */ + /* fi_opx_hfi1_packet_payload -> rendezvous -> contiguous */ + struct tmp_payload_t { + uint8_t immediate_byte[8]; + uint64_t immediate_qw[7]; + } __attribute__((packed)); + + uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count); + if (immediate_fragment) { + struct tmp_payload_t *tmp_payload = (void*)temp; + + for (int i = 0; i < immediate_byte_count; ++i) { + tmp_payload->immediate_byte[i] = sbuf[i]; + } + + for (int i = 0; i < immediate_qw_count; ++i) { + tmp_payload->immediate_qw[i] = sbuf_qw[i]; + } + fi_opx_store_scb_qw(scb_payload, temp); + sbuf_qw += immediate_qw_count; + + fi_opx_copy_cacheline(replay_payload, temp); + replay_payload += FI_OPX_CACHE_LINE_QWS; + /* consume one credit for the rendezvous payload immediate data */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG - credits_consumed += + ++credits_consumed; #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, - (uint64_t *) &hmem_iov[2], - payload_blocks_total - 1, - total_credits_available); + } + + if (immediate_block) { + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, sbuf_qw); + fi_opx_copy_cacheline(replay_payload, sbuf_qw); - memcpy(replay_payload, &hmem_iov[2], sizeof(struct fi_opx_hmem_iov) * (niov - 2)); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + ++credits_consumed; +#endif } + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); #ifndef NDEBUG assert(credits_consumed == total_credits_needed); #endif - fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, - addr.uid.lid, - addr.reliability_rx, dest_rx, - psn_ptr, replay, reliability); - /* update the hfi txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-RZV-RTS-HFI"); + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, lock_required, tag, caps); + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-HFI:%ld",tag); 
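The packed immediate fragment described above is sized to fill exactly one 64-byte PIO send block; a self-contained compile-time check of that layout (a sketch mirroring the tmp_payload_t above, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    struct tmp_payload_t {
        uint8_t  immediate_byte[8]; /* packed leading bytes (up to 7 used) */
        uint64_t immediate_qw[7];   /* packed quadwords (up to 7 used) */
    } __attribute__((packed));

    /* 8 + 7*8 = 64: one cache line, one PIO send block */
    static_assert(sizeof(struct tmp_payload_t) == 64,
                  "immediate fragment must fill one 64-byte block");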
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, HFI -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SEND, HFI -- RENDEZVOUS RTS (end) context %p\n", + user_context); return FI_SUCCESS; } -ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, +ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void* context, + fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, const uint64_t dest_rx, - const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface src_iface, - const uint64_t src_device_id) + const uint64_t src_device_id, + const enum opx_hfi1_type hfi1_type) { // We should already have grabbed the lock prior to calling this function assert(!lock_required); @@ -3088,82 +4416,94 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; + const uint64_t is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps); + #ifndef NDEBUG const uint64_t max_immediate_block_count = (FI_OPX_HFI1_PACKET_MTU >> 6)-2 ; #endif - /* Expected tid needs to send a leading data block and a trailing - * data block for alignment. Limit this to SDMA (8K+) for now */ + /* Expected tid needs to send a leading data block and trailing data + * for alignment. TID writes must start on a 64-byte boundary, so we + * need to send 64 bytes of leading immediate data that allow us + * to shift the receive buffer starting offset to a TID-friendly value. + * TID writes must also be a length that is a multiple of a DW (WFR & JKR 9B) + * or a QW (JKR), so send the last 7 bytes of the source data immediately + * so we can adjust the length after proper alignment has been achieved. */ + const uint8_t immediate_block = (!is_intranode && opx_ep->use_expected_tid_rzv && + len >= opx_ep->tx->sdma_min_payload_bytes && + len >= opx_ep->tx->tid_min_payload_bytes) ? 1 : 0; + const uint8_t immediate_tail = immediate_block; + + assert(immediate_block <= 1); + assert(immediate_tail <= 1); + assert(immediate_block <= max_immediate_block_count); - const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 1 : 0; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " - "*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n", - immediate_block_count, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr, - origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, - origin_byte_counter_vaddr ? 
*(uint64_t*)origin_byte_counter_vaddr : -1UL, len, len ); + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); - const uint64_t immediate_end_block_count = immediate_block_count; + const uint8_t immediate_byte_count = (uint8_t) (len & 0x0007ul); + const uint8_t immediate_qw_count = (uint8_t) ((len >> 3) & 0x0007ul); + const uint8_t immediate_fragment = (uint8_t) (((len & 0x003Ful) + 63) >> 6); + assert(immediate_fragment == 1 || immediate_fragment == 0); - assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count); + /* Need a full block for ICRC after the end block... */ + const uint64_t icrc_end_block = immediate_block; + + /* ... otherwise need a qw (or block) in the immediate fragment */ + const uint64_t icrc_fragment = icrc_end_block ? 0 : immediate_fragment; + + /* if there are already 7 qw's need a full block */ + const uint64_t icrc_fragment_block = icrc_fragment && (immediate_qw_count == 7) ? 1: 0 ; + + /* Summary: we can add the tail qw in... + * - rzv metadata if there is no other immediate data + * - an empty fragment qw if there are no other blocks (icrc_fragment & !icrc_fragment_block) + * - a full (additional) fragment block if there are no other blocks (icrc_fragment & icrc_fragment_block) + * - a full (additional) trailing block after the end (icrc_end_block) + */ - const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; - const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t immediate_byte_count = len & 0x0007ul; - const uint64_t immediate_qw_count = (len >> 3) & 0x0007ul; - const uint64_t immediate_fragment = (((len & 0x003Ful) + 63) >> 6); /* Immediate total does not include trailing block */ const uint64_t immediate_total = immediate_byte_count + immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); - - assert(immediate_byte_count <= UINT8_MAX); - assert(immediate_qw_count <= UINT8_MAX); - assert(immediate_block_count <= UINT8_MAX); - assert(immediate_end_block_count <= UINT8_MAX); + immediate_block * sizeof(union cacheline); union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { - .byte_count = (uint8_t) immediate_byte_count, - .qw_count = (uint8_t) immediate_qw_count, - .block_count = (uint8_t) immediate_block_count, - .end_block_count = (uint8_t) immediate_end_block_count, - .unused = 0 + .count = (immediate_byte_count << OPX_IMMEDIATE_BYTE_COUNT_SHIFT) | + (immediate_qw_count << OPX_IMMEDIATE_QW_COUNT_SHIFT) | + (immediate_block << OPX_IMMEDIATE_BLOCK_SHIFT) | + (immediate_tail << OPX_IMMEDIATE_TAIL_SHIFT), + .tail_bytes = {} }; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "max_immediate_block_count %#lX, len %#lX >> 6 %#lX, immediate_total %#lX, " - "immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX, " - "origin_byte_counter %lu/%#lX, adjusted origin_byte_counter %lu/%#lX\n", - max_immediate_block_count, len, (len >> 6), immediate_total, immediate_byte_count, - immediate_qw_count, immediate_block_count, *origin_byte_counter_value, - *origin_byte_counter_value, len - immediate_total, len - immediate_total); - + assert(icrc_end_block + icrc_fragment_block < 2); /* not both */ assert(((len - immediate_total) & 0x003Fu) == 0); - *origin_byte_counter_value = len - immediate_total; - + /* full blocks only. 
icrc_end_block/icrc_fragment_block count 1 qw only */ const uint64_t payload_blocks_total = - 1 + /* rzv metadata */ + 1 + /* last kdeth + rzv metadata */ immediate_fragment + - immediate_block_count + - immediate_end_block_count; + immediate_block; const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lhr */ + 4 + /* lrh */ 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (payload_blocks_total << 4); + /* 9 + kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 7 + /* kdeth */ + (payload_blocks_total << 4) + /* includes last kdeth + metadata + immediate data */ + ((icrc_end_block | icrc_fragment_block) << 1); /* 1 QW of any added tail block */ - const uint16_t lrh_dws = htons(pbc_dws-1); + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ - if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + if (is_intranode) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, SHM -- RENDEZVOUS RTS (begin) context %p\n",context); + "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); @@ -3173,38 +4513,57 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, return rc; } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + FI_OPX_DEBUG_COUNTERS_INC_COND(src_iface != FI_HMEM_SYSTEM, opx_ep->debug_counters.hmem.intranode .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.rzv); - hdr->qw[0] = opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_16B[0] = opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20); + + hdr->qw_16B[1] = opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); - hdr->qw[1] = opx_ep->tx->rzv.hdr.qw[1] | bth_rx | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + hdr->qw_16B[2] = opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); - hdr->qw[2] = opx_ep->tx->rzv.hdr.qw[2]; - hdr->qw[3] = opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32); - hdr->qw[4] = opx_ep->tx->rzv.hdr.qw[4] | (1ull << 48); /* effectively 1 iov */ - hdr->qw[5] = len; - hdr->qw[6] = tag; + hdr->qw_16B[3] = opx_ep->tx->rzv_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (1ull << 48); /* effectively 1 iov */ + hdr->qw_16B[6] = len; + hdr->qw_16B[7] = tag; union fi_opx_hfi1_packet_payload * const payload = (union fi_opx_hfi1_packet_payload *)(hdr+1); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"hdr %p, payuload %p, sbuf %p, sbuf+immediate_total %p, immediate_total %#lX, adj len %#lX\n", - hdr, payload, - buf, ((char*)buf + immediate_total),immediate_total, (len - immediate_total)); - payload->rendezvous.contiguous.src_vaddr = (uintptr_t)buf + immediate_total; - payload->rendezvous.contiguous.src_blocks = (len - immediate_total) >> 6; - payload->rendezvous.contiguous.src_device_id = src_device_id; - payload->rendezvous.contiguous.src_iface = (uint64_t) src_iface; - payload->rendezvous.contiguous.immediate_info = immediate_info.qw0; - payload->rendezvous.contiguous.origin_byte_counter_vaddr = origin_byte_counter_vaddr; - payload->rendezvous.contiguous.unused[0] = 0; - payload->rendezvous.contiguous.unused[1] = 0; + struct opx_payload_rzv_contig *contiguous = &payload->rendezvous.contiguous_16B; + contiguous->src_vaddr = (uintptr_t)buf + immediate_total; + contiguous->src_blocks = (len - immediate_total) >> 6; + contiguous->src_device_id = src_device_id; + contiguous->src_iface = (uint64_t) src_iface; + contiguous->immediate_info = immediate_info.qw0; + contiguous->origin_byte_counter_vaddr = origin_byte_counter_vaddr; + contiguous->unused = 0; if (immediate_total) { @@ -3212,40 +4571,50 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, if (src_iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(src_iface, src_device_id, - desc_mr->hmem_dev_reg_handle, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, buf, immediate_total, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + desc_mr ? 
OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; } - if (immediate_byte_count > 0) { - memcpy((void*)&payload->rendezvous.contiguous.immediate_byte, (const void*)sbuf, immediate_byte_count); - sbuf += immediate_byte_count; + for (int i = 0; i < immediate_byte_count; ++i) { + contiguous->immediate_byte[i] = sbuf[i]; } + sbuf += immediate_byte_count; uint64_t * sbuf_qw = (uint64_t *)sbuf; - unsigned i=0; - for (i=0; i<immediate_qw_count; ++i) { - payload->rendezvous.contiguous.immediate_qw[i] = sbuf_qw[i]; + for (int i = 0; i < immediate_qw_count; ++i) { + contiguous->immediate_qw[i] = sbuf_qw[i]; } - sbuf_qw += immediate_qw_count; - memcpy((void*)(&payload->rendezvous.contiguous.cache_line_1 + immediate_fragment), - (const void *)sbuf_qw, immediate_block_count << 6); /* immediate_end_block_count */ + if (immediate_block) { + sbuf_qw += immediate_qw_count; + uint64_t *payload_cacheline = + (uint64_t *)(&contiguous->cache_line_1 + immediate_fragment); + fi_opx_copy_cacheline(payload_cacheline, sbuf_qw); + } } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-SHM"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, SHM -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (end) context %p\n", + user_context); + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, + lock_required, tag, caps); + } return FI_SUCCESS; } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, HFI -- RENDEZVOUS RTS (begin) context %p\n",context); + "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-HFI:%ld", tag); /* @@ -3255,9 +4624,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - const uint16_t total_credits_needed = - 1 + /* packet header */ - payload_blocks_total; /* packet payload */ + const uint16_t total_credits_needed = (lrh_qws + 1 /* pbc */ + 7) >> 3 ; uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, @@ -3268,18 +4635,39 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, &opx_ep->tx->force_credit_return, total_credits_needed); if (total_credits_available < total_credits_needed) { opx_ep->tx->pio_state->qw0 = pio_state.qw0; + return -FI_EAGAIN; } } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + struct fi_opx_reliability_tx_replay *replay; union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); - if(OFI_UNLIKELY(psn == -1)) { - FI_DBG_TRACE(fi_opx_global.prov, 
+ struct fi_opx_reliability_tx_replay *replay; union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); - if(OFI_UNLIKELY(psn == -1)) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + if (OFI_LIKELY(do_cq_completion)) { + OPX_BUF_FREE(context); + } return -FI_EAGAIN; } @@ -3287,57 +4675,83 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.rzv); + if (immediate_tail) { + uint8_t *buf_tail_bytes = ((uint8_t *)buf + len) - OPX_IMMEDIATE_TAIL_BYTE_COUNT; + if (src_iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, + opx_ep->hmem_copy_buf, buf_tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf_tail_bytes = opx_ep->hmem_copy_buf; + } + + for (int i = 0; i < OPX_IMMEDIATE_TAIL_BYTE_COUNT; ++i) { + immediate_info.tail_bytes[i] = buf_tail_bytes[i]; + } + }
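The immediate_tail block just above stashes the last few bytes of the source buffer in the RTS immediate info, so the receiver can finish the trailing partial block without another round trip. A sketch of that capture, assuming an 8-byte tail (the real size is the provider's OPX_IMMEDIATE_TAIL_BYTE_COUNT):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define TAIL_BYTES 8 /* assumed stand-in for OPX_IMMEDIATE_TAIL_BYTE_COUNT */

/* Pack the final TAIL_BYTES of the send buffer into one qword that
 * rides along inside the RTS immediate info. */
static uint64_t capture_tail(const uint8_t *buf, size_t len)
{
	uint64_t tail = 0;
	memcpy(&tail, buf + len - TAIL_BYTES, TAIL_BYTES);
	return tail;
}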
+ /* * Write the 'start of packet' (hw+sw header) 'send control block' * which will consume a single pio credit. */ - uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return); + uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; - - fi_opx_set_scb(scb, tmp, - opx_ep->tx->rzv.qw0 | OPX_PBC_LEN(pbc_dws) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->rzv.hdr.qw[1] | bth_rx | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), - opx_ep->tx->rzv.hdr.qw[2] | psn, - opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->rzv.hdr.qw[4] | (1ull << 48), - len, tag); + struct fi_opx_hfi1_txe_scb_16B tmp; + + fi_opx_store_and_copy_scb_16B(scb, &tmp, + opx_ep->tx->rzv_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20), + opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), + opx_ep->tx->rzv_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (1ull << 48), + len); + /* consume one credit for the packet header */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG unsigned credits_consumed = 1; #endif - + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - - fi_opx_copy_cacheline(&replay->scb.qw0, tmp); + tmp.hdr.qw_16B[7] = tag; + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, (uint64_t *)&tmp.qw0); /* * write the rendezvous payload "send control blocks" */ volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - - fi_opx_set_scb(scb_payload, tmp, - (uintptr_t)buf + immediate_total, /* src_vaddr */ - (len - immediate_total) >> 6, /* src_blocks */ - src_device_id, - (uint64_t) src_iface, - immediate_info.qw0, - origin_byte_counter_vaddr, - 0, 0 /* unused */); + uint64_t temp[8]; + + fi_opx_store_and_copy_qw(scb_payload, temp, + tag, /* end of header */ + /* start of receiver payload/cacheline */ + (uintptr_t)buf + immediate_total, /* rendezvous.contiguous.src_vaddr */ + (len - immediate_total) >> 6, /* rendezvous.contiguous.src_blocks */ + src_device_id, /* rendezvous.contiguous.src_device_id */ + (uint64_t) src_iface, /* rendezvous.contiguous.src_iface */ + immediate_info.qw0, /* rendezvous.contiguous.immediate_info */ + origin_byte_counter_vaddr, /* rendezvous.contiguous.origin_byte_counter_vaddr */ + -1UL /* unused */); /* rendezvous.contiguous.unused[0] */ /* consume one credit for the rendezvous payload metadata */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); #ifndef NDEBUG ++credits_consumed; #endif @@ -3346,22 +4760,32 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - fi_opx_copy_cacheline(replay_payload, tmp); - replay_payload += 8; + + /* temp is hdr (1 QW) + payload (7 QW) */ + replay_payload[0] = temp[1]; + replay_payload[1] = temp[2]; + replay_payload[2] = temp[3]; + replay_payload[3] = temp[4]; + replay_payload[4] = temp[5]; + replay_payload[5] = temp[6]; + replay_payload[6] = temp[7]; + + replay_payload += OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS; uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(src_iface, src_device_id, desc_mr->hmem_dev_reg_handle, + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, buf, immediate_total, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; } - scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - /* immediate_byte and immediate_qw are "packed" in the current implementation */ /* meaning the immediate bytes are filled, then followed by the rest of the data directly */ /* adjacent to the packed bytes. It's probably more efficient to leave a pad and not go */
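One detail of the 16B header write above is worth spelling out: the destination LID is wider than the field left in the first LRH qword, so its low bits are masked into qw_16B[0] and the overflow bits into qw_16B[1]. A sketch of that split with assumed masks and shifts (the real OPX_LRH_JKR_16B_* constants live in the provider headers):

#include <stdint.h>

/* Assumed layout: low 20 DLID bits packed into qw0, high 4 bits into qw1. */
#define DLID_LOW_MASK   0x0FFFFFu
#define DLID_LOW_SHIFT  20
#define DLID_HIGH_MASK  0xF00000u
#define DLID_HIGH_SHIFT 20

static void split_dlid_16b(uint32_t dlid, uint64_t *qw0, uint64_t *qw1)
{
	*qw0 |= ((uint64_t)(dlid & DLID_LOW_MASK)) << DLID_LOW_SHIFT;
	*qw1 |= (uint64_t)((dlid & DLID_HIGH_MASK) >> DLID_HIGH_SHIFT);
}

The opx_jkr_print_16B_lrh debug helper later in this diff reassembles the full LID the same way (dlid20 << 20 | dlid).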
@@ -3372,79 +4796,80 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, /* tmp_payload_t represents the second cache line of the rts packet */ /* fi_opx_hfi1_packet_payload -> rendezvous -> contiguous */ struct tmp_payload_t { - uint8_t immediate_byte[8]; - uint64_t immediate_qw[7]; + uint8_t immediate_byte[8]; /* rendezvous.contiguous.immediate_byte */ + uint64_t immediate_qw[7]; /* rendezvous.contiguous.immediate_qw */ } __attribute__((packed)); uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count); if (immediate_fragment) { - struct tmp_payload_t *tmp_payload = (void*)tmp; - if (immediate_byte_count > 0) { - memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); + struct tmp_payload_t *tmp_payload = (void*)temp; + + for (int i = 0; i < immediate_byte_count; ++i) { + tmp_payload->immediate_byte[i] = sbuf[i]; } - for (int i=0; i<immediate_qw_count; ++i) { tmp_payload->immediate_qw[i] = sbuf_qw[i]; } - fi_opx_copy_scb(scb_payload, tmp); + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, temp); sbuf_qw += immediate_qw_count; - fi_opx_copy_cacheline(replay_payload, tmp); - replay_payload += 8; + fi_opx_copy_cacheline(replay_payload, temp); + replay_payload += FI_OPX_CACHE_LINE_QWS; /* consume one credit for the rendezvous payload immediate data */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG ++credits_consumed; #endif - } + /* Need a full tail block */ + if (icrc_fragment_block) { + /* No other tail or immediate block after this */ + assert(!icrc_end_block && !immediate_block); - if(immediate_block_count) { -#ifndef NDEBUG - /* assert immediate_block_count can be used for both - * full_block_credits_needed and total_credits_available parameters - * on the call - */ - assert((credits_consumed + immediate_block_count) <= total_credits_needed); - ssize_t credits = -#endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, - &pio_state, - sbuf_qw, - immediate_block_count, - immediate_block_count); - memcpy(replay_payload, sbuf_qw, (immediate_block_count << 6)); - /* replay_payload is pointer to uint64_t, not char */ - replay_payload += (immediate_block_count << 3); /* immediate_block_count << 6 / sizeof(uint64_t) */ + /* Write another block to accommodate the ICRC and tail */ + uint64_t temp_0[8] = {-2UL}; + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, temp_0); + fi_opx_copy_cacheline(replay_payload, temp_0); + replay_payload += FI_OPX_CACHE_LINE_QWS; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG - assert(credits == immediate_block_count); - credits_consumed+= (unsigned) credits; + ++credits_consumed; +#endif + } +#ifndef NDEBUG + else if (icrc_fragment) { /* used an immediate qw for tail */ + /* No other tail or immediate block after this */ + assert(!icrc_end_block && !immediate_block); + } else { + /* Must be tail and immediate blocks after this */ + assert(icrc_end_block && immediate_block); + } #endif }
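Each FI_OPX_HFI1_CONSUME_SINGLE_CREDIT above accounts for one 64-byte PIO block, and the debug-only credits_consumed counter must land exactly on total_credits_needed, computed earlier as (lrh_qws + 1 /* pbc */ + 7) >> 3. That is plain ceiling division by 8 (8 qwords per block); a tiny worked check:

#include <assert.h>
#include <stdint.h>

/* Ceiling division by 8: one PIO credit covers 8 qwords (64 bytes). */
static uint16_t credits_needed(uint64_t lrh_qws)
{
	return (uint16_t)((lrh_qws + 1 /* pbc */ + 7) >> 3);
}

int main(void)
{
	assert(credits_needed(7) == 1);  /* pbc + 7 qws fill exactly one block */
	assert(credits_needed(8) == 2);  /* one qword spills into a second block */
	assert(credits_needed(23) == 3); /* pbc + 23 qws = 24 qws -> three blocks */
	return 0;
}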
- if (immediate_end_block_count) { - char* sbuf_end = (char *)buf + len - (immediate_end_block_count << 6); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE SEND RZV buf %p, buf end %p, sbuf immediate end block %p\n",(char *)buf, (char *)buf+len, sbuf_end); - union { - uint8_t immediate_byte[64]; - uint64_t immediate_qw[8]; - } align_tmp; - assert(immediate_end_block_count == 1); - - OPX_HMEM_COPY_FROM(align_tmp.immediate_byte, sbuf_end, (immediate_block_count << 6), - desc ? ((struct fi_opx_mr *)desc)->hmem_dev_reg_handle - : OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_SEND_THRESHOLD, - src_iface, src_device_id); - - scb_payload = (uint64_t *)FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - fi_opx_copy_scb(scb_payload, align_tmp.immediate_qw); + if (immediate_block) { + /* Tail will be its own block */ + assert(icrc_end_block && !icrc_fragment_block && !icrc_fragment); + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, sbuf_qw); + fi_opx_copy_cacheline(replay_payload, sbuf_qw); + replay_payload += FI_OPX_CACHE_LINE_QWS; - fi_opx_copy_cacheline(replay_payload, align_tmp.immediate_qw); - replay_payload += 8; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + ++credits_consumed; +#endif + /* Write another block to accommodate the ICRC and tail */ + uint64_t temp_0[8] = {-3UL}; + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, temp_0); + fi_opx_copy_cacheline(replay_payload, temp_0); FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG @@ -3453,25 +4878,29 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, } fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, - addr.uid.lid, addr.reliability_rx, - dest_rx, psn_ptr, replay, reliability); - - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); #ifndef NDEBUG assert(credits_consumed == total_credits_needed); #endif + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + /* update the hfi txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, lock_required, tag, caps); + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-HFI:%ld",tag); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, HFI -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (end) context %p\n", + user_context); return FI_SUCCESS; } - unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep * opx_ep, volatile uint64_t *rhe_ptr, volatile uint32_t * rhf_ptr, @@ -3480,7 +4909,8 @@ unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { /* We are assuming that we can process any error and consume this header, let reliability detect and replay it as needed. 
*/ @@ -3491,20 +4921,18 @@ unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep * opx_ep, /* drop this packet and allow reliability protocol to retry */ #ifdef OPX_RELIABILITY_DEBUG - const uint64_t hdrq_offset_dws = (rhf_msb >> 12) & 0x01FFu; - fprintf(stderr, - "%s:%s():%d drop this packet and allow reliability protocol to retry, psn = %u, RHF %#16.16lX, OPX_RHF_IS_USE_EGR_BUF %u, hdrq_offset_dws %lu\n", + "%s:%s():%d drop this packet and allow reliability protocol to retry, psn = %u, RHF %#16.16lX, OPX_RHF_IS_USE_EGR_BUF %u, hdrq_offset %lu\n", __FILE__, __func__, __LINE__, FI_OPX_HFI1_PACKET_PSN(hdr), - rhf_rcvd, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), hdrq_offset_dws); + rhf_rcvd, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd,hfi1_type), hdrq_offset); #endif - OPX_RHE_DEBUG(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr); + OPX_RHE_DEBUG(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr, hfi1_type); - if (OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd)) { + if (OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd,hfi1_type)) { /* "consume" this egrq element */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf_rcvd); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf_rcvd, hfi1_type); const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { @@ -3515,7 +4943,7 @@ unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep * opx_ep, } /* "consume" this hdrq element */ - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; fi_opx_hfi1_update_hdrq_head_register(opx_ep, hdrq_offset); diff --git a/prov/opx/src/fi_opx_hfi1_jkr.c b/prov/opx/src/fi_opx_hfi1_jkr.c index 3b2f714435b..7f44341c09c 100644 --- a/prov/opx/src/fi_opx_hfi1_jkr.c +++ b/prov/opx/src/fi_opx_hfi1_jkr.c @@ -42,7 +42,8 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { uint32_t rhe_index = hdrq_offset >> FI_OPX_HFI1_HDRQ_INDEX_SHIFT; volatile uint64_t *rhe = rhe_ptr + rhe_index; /* 8 byte entries */ @@ -53,13 +54,13 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, #endif "RHF(%#16.16lX) RHE(%p)[%u]=%p RHE %#16.16lX is ERRORED %u, UseEgrBuf %u, EgrIndex %#X/%#X, EgrOffset %#X, %s%s%s %s %#16.16lX %s%s%s%s%s%s%s%s%s%s%s \n", rhf_rcvd, rhe_ptr, rhe_index, rhe, *rhe, - OPX_IS_ERRORED_RHF(rhf_rcvd) != 0UL, - OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), - (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd),opx_ep->rx->egrq.last_egrbfr_index, - (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd), - OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)? "EXPECTED_RCV" : "", - OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)? "EAGER_RCV" : "", - OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)? "OTHER RCV" : "", + OPX_IS_ERRORED_RHF(rhf_rcvd, hfi1_type) != 0UL, + OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd, hfi1_type), + (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd, hfi1_type),opx_ep->rx->egrq.last_egrbfr_index, + (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd, hfi1_type), + OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)? "EXPECTED_RCV" : "", + OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)? "EAGER_RCV" : "", + OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)? "OTHER RCV" : "", ((*rhe) & OPX_JKR_RHE_TAIL )? 
"OPX_JKR_RHE_TAIL " : "", ((*rhe) & OPX_JKR_RHE_TAIL), ((*rhe) & OPX_JKR_RHE_ICRCERR )? "OPX_JKR_RHE_ICRCERR " : "", ((*rhe) & OPX_JKR_RHE_TIDBYPASSERR)? "OPX_JKR_RHE_TIDBYPASSERR" : "", @@ -86,12 +87,12 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, FI_OPX_DEBUG_COUNTERS_INC_COND((*rhe) & OPX_JKR_RHE_FLOWSEQERR ,opx_ep->debug_counters.rhf.flowseqerr); FI_OPX_DEBUG_COUNTERS_INC_COND((*rhe) & OPX_JKR_RHE_RCVTYPEERR ,opx_ep->debug_counters.rhf.rcvtypeerr); /* Count the packet type that had an error */ - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeexp); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeegr); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeoth); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeexp); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeegr); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeoth); #ifdef OPX_VERBOSE_TRIGGER // verbose output - fi_opx_hfi1_dump_packet_hdr (hdr, "OPX_IS_ERRORED_RHF", __LINE__); + fi_opx_hfi1_dump_packet_hdr(hdr, hfi1_type, "OPX_IS_ERRORED_RHF", __LINE__); #endif /* trigger on unexpected errors ) ignoring TIDERR */ @@ -102,10 +103,10 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, } -int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr) +int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { - const uint8_t opcode = hdr->stl.bth.opcode; - + const uint8_t opcode = hdr->bth.opcode; #ifdef OPX_VERBOSE_TRIGGER // verbose output fprintf(stderr, #else @@ -113,14 +114,96 @@ int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_p #endif "%s:%s():%d MISSING PAYLOAD opcode %#X, UseEgrBuf %u, pktlen %#X, type: %s%s%s\n", __FILE__, __func__, __LINE__, - opcode, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), ntohs(hdr->stl.lrh.pktlen), - OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)? "EXPECTED_RCV" : "", - OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)? "EAGER_RCV" : "", - OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)? "OTHER RCV" : ""); + opcode, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd, hfi1_type), ntohs(hdr->lrh_9B.pktlen), + OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)? "EXPECTED_RCV" : "", + OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)? "EAGER_RCV" : "", + OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)? 
"OTHER RCV" : ""); #ifdef OPX_VERBOSE_TRIGGER // verbose ouput - fi_opx_hfi1_dump_packet_hdr (hdr, "MISSING PAYLOAD", __LINE__); + fi_opx_hfi1_dump_packet_hdr (hdr, OPX_HFI1_JKR, "MISSING PAYLOAD", __LINE__); #endif opx_sw_trigger(); return 1; } +void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func) +{ + __attribute__((__unused__)) union opx_jkr_pbc pbc; + pbc.raw64b = pbc1; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc = %#16.16lX\n", func, pbc1); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.LengthDWs = %#x %zu\n", func, pbc.LengthDWs, pbc.LengthDWs * sizeof(uint32_t)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Vl = %#x\n", func, pbc.Vl); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.PortIdx = %#x\n", func, pbc.PortIdx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Reserved_2 = %#x\n", func, pbc.Reserved_2); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.L2Compressed = %#x\n", func, pbc.L2Compressed); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.L2Type = %#x\n", func, pbc.L2Type); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Fecnd = %#x\n", func, pbc.Fecnd); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.TestBadLcrc = %#x\n", func, pbc.TestBadLcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.InsertNon9bIcrc = %#x\n", func, pbc.InsertNon9bIcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.CreditReturn = %#x\n", func, pbc.CreditReturn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.InsertHcrc = %#x\n", func, pbc.InsertHcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Reserved_1 = %#x\n", func, pbc.Reserved_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.TestEbp = %#x\n", func, pbc.TestEbp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Sc4 = %#x\n", func, pbc.Sc4); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Intr = %#x\n", func, pbc.Intr); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Dlid = %#x %u\n", func, pbc.Dlid, pbc.Dlid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.SendCtxt = %#x\n", func, pbc.SendCtxt); +} + + +void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func) +{ + __attribute__((__unused__)) union opx_hfi1_packet_hdr hdr; + hdr.lrh_16B.qw[0] = lrh1; + hdr.lrh_16B.qw[1] = lrh2; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH = %#16.16lX\n", func, lrh1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH = %#16.16lX\n", func, lrh2); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.slid = %#x\n", func, hdr.lrh_16B.slid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.pktlen = %#x %zu\n", func, hdr.lrh_16B.pktlen, hdr.lrh_16B.pktlen * sizeof(uint64_t)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.b = %#x\n", func, hdr.lrh_16B.b); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.dlid = %#x\n", func, hdr.lrh_16B.dlid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.sc = %#x\n", func, hdr.lrh_16B.sc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.rc = %#x\n", func, hdr.lrh_16B.rc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.f = %#x\n", func, hdr.lrh_16B.f); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.l2 = %#x\n", func, hdr.lrh_16B.l2); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.lt = %#x\n", func, hdr.lrh_16B.lt); + + 
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.l4 = %#x\n", func, hdr.lrh_16B.l4); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.slid20 = %#x\n", func, hdr.lrh_16B.slid20); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.dlid20 = %#x\n", func, hdr.lrh_16B.dlid20); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.pkey = %#x\n", func, hdr.lrh_16B.pkey); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.entropy = %#x\n", func, hdr.lrh_16B.entropy); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.age = %#x\n", func, hdr.lrh_16B.age); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.cspec = %#x\n", func, hdr.lrh_16B.cspec); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.r = %#x\n", func, hdr.lrh_16B.r); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.SLID(full) = %#6.6x (BE format = %#6.6x)\n", func, hdr.lrh_16B.slid20 << 20 | hdr.lrh_16B.slid, htons(((hdr.lrh_16B.slid20 << 20) | hdr.lrh_16B.slid))); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.DLID(full) = %#6.6x (BE format = %#6.6x)\n", func, hdr.lrh_16B.dlid20 << 20 | hdr.lrh_16B.dlid, htons(((hdr.lrh_16B.dlid20 << 20) | hdr.lrh_16B.dlid))); + + +} + + +void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func) +{ + __attribute__((__unused__)) union opx_hfi1_packet_hdr hdr; + hdr.qw_16B[2] = bth1; + hdr.qw_16B[3] = bth2; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH = %#16.16lX\n", func, bth1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH = %#16.16lX\n", func, bth2); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.opcode = %#x\n", func, hdr.bth.opcode); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.bth_1 = %#x\n", func, hdr.bth.bth_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.pkey = %#x\n", func, hdr.bth.pkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.ecn = %#x\n", func, hdr.bth.ecn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.qp = %#x\n", func, hdr.bth.qp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.unused = %#x\n", func, hdr.bth.unused); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.rx = %#x\n", func, hdr.bth.rx); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.psn = %#x\n", func, hdr.bth.psn); +}
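The opx_jkr_print_16B_pbc/lrh/bth helpers above all use the same C idiom: raw qwords are stored through a union overlaying the packet header, and bitfield members then decode each field without hand-written shifts and masks. A standalone sketch of the idiom (the field widths here are illustrative, not the real 16B layout):

#include <stdint.h>
#include <stdio.h>

/* Illustrative overlay: bitfields decode a raw 64-bit header word. */
union hdr_word_sketch {
	uint64_t qw;
	struct {
		uint64_t slid   : 20;
		uint64_t pktlen : 11;
		uint64_t rest   : 33;
	};
};

int main(void)
{
	union hdr_word_sketch h = { .qw = 0xABCD1234ull };
	printf("slid=%#x pktlen=%#x\n", (unsigned)h.slid, (unsigned)h.pktlen);
	return 0;
}

Bitfield ordering is implementation-defined in C, which is one reason such overlays are only safe within a known compiler and ABI.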
diff --git a/prov/opx/src/fi_opx_hfi1_sdma.c b/prov/opx/src/fi_opx_hfi1_sdma.c index 32367500684..7d4b100a144 100644 --- a/prov/opx/src/fi_opx_hfi1_sdma.c +++ b/prov/opx/src/fi_opx_hfi1_sdma.c @@ -94,7 +94,7 @@ int fi_opx_hfi1_dput_sdma_pending_completion(union fi_opx_hfi1_deferred_work *wo FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; } - assert(we->comp_state == OPX_SDMA_COMP_COMPLETE); + assert(we->comp_state == OPX_SDMA_COMP_COMPLETE || we->comp_state == OPX_SDMA_COMP_ERROR); slist_remove_head(&params->sdma_reqs); we->next = NULL; @@ -143,6 +143,14 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, { const pid_t pid = getpid(); + if (errno == ECOMM || errno == EINTR) { + int err = fi_opx_context_check_status(opx_ep->hfi); + if (err != FI_SUCCESS) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Link down detected\n"); + return; + } + } + fprintf(stderr, "(%d) %s:%s():%d ERROR: SDMA Abort errno=%d (%s)\n", pid, file, func, line, errno, strerror(errno)); fprintf(stderr, "(%d) ===================================== SDMA_WE -- " @@ -212,9 +220,9 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, } #endif fprintf(stderr, "(%d) [%d] PBC: %#16.16lX\n", - pid, req_num, header_vec->scb.qw0); + pid, req_num, header_vec->scb.scb_9B.qw0); - fi_opx_hfi1_dump_packet_hdr(&header_vec->scb.hdr, func, line); + fi_opx_hfi1_dump_packet_hdr(&header_vec->scb.scb_9B.hdr, OPX_HFI1_TYPE, func, line); fprintf(stderr, "(%d) [%d] req data iov=%p len=%lu\n", pid, req_num, iov_ptr[1].iov_base, iov_ptr[1].iov_len); @@ -246,7 +254,7 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, "(%d) [%d] ERROR: Request opcode is set to EXPECTED (TID), but TID IOV's length is < minimum!\n", pid, req_num); } - uint32_t kdeth = (uint32_t) (header_vec->scb.hdr.qw[2] >> 32); + uint32_t kdeth = (uint32_t) (header_vec->scb.scb_9B.hdr.qw_9B[2] >> 32); uint32_t tidctrl = (kdeth >> FI_OPX_HFI1_KDETH_TIDCTRL_SHIFT) & FI_OPX_HFI1_KDETH_TIDCTRL; uint32_t tididx = (kdeth >> FI_OPX_HFI1_KDETH_TID_SHIFT) & FI_OPX_HFI1_KDETH_TID; uint32_t tidOMshift = (kdeth >> KDETH_OM_SHIFT) & KDETH_OM_MASK; @@ -315,7 +323,7 @@ void opx_hfi1_sdma_process_pending(struct fi_opx_ep *opx_ep) } __OPX_FORCE_INLINE__ -void opx_hfi1_sdma_writev(struct fi_opx_ep *opx_ep, +int opx_hfi1_sdma_writev(struct fi_opx_ep *opx_ep, struct iovec *iovecs, int iovs_used, uint16_t avail, @@ -356,6 +364,7 @@ void opx_hfi1_sdma_writev(struct fi_opx_ep *opx_ep, } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.writev_calls[iovs_used]); + return(writev_rc); } void opx_hfi1_sdma_process_requests(struct fi_opx_ep *opx_ep) @@ -390,9 +399,13 @@ void opx_hfi1_sdma_process_requests(struct fi_opx_ep *opx_ep) if (iovs_free < request->num_iovs) #endif { - opx_hfi1_sdma_writev(opx_ep, iovecs, iovs_used, avail, + int err = opx_hfi1_sdma_writev(opx_ep, iovecs, iovs_used, avail, fill_index, __FILE__, __func__, __LINE__); - + if (err < 0) { + /* Error occurred in writev. Add the request back to the queue */ + slist_insert_head((struct slist_entry *)request, &queue->list); + return; + } iovs_used = 0; iovs_free = OPX_SDMA_HFI_MAX_IOVS_PER_WRITE; } diff --git a/prov/opx/src/fi_opx_hfi1_wfr.c b/prov/opx/src/fi_opx_hfi1_wfr.c index fb9388703f0..7e9ffef1aa3 100644 --- a/prov/opx/src/fi_opx_hfi1_wfr.c +++ b/prov/opx/src/fi_opx_hfi1_wfr.c @@ -41,7 +41,8 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { #ifdef OPX_VERBOSE_TRIGGER // verbose output fprintf(stderr, @@ -50,13 +51,13 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, #endif "RHF(%#16.16lX) RHE %#8.8X is ERRORED %u, UseEgrBuf %u, EgrIndex %#X/%#X, EgrOffset %#X, %s%s%s %s%s%s%s%s%s%s%s \n", rhf_rcvd, rhf_msb & 0xBFE00000u, - OPX_IS_ERRORED_RHF(rhf_rcvd) != 0UL, - OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), - (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd),opx_ep->rx->egrq.last_egrbfr_index, - (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd), - OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)? "EXPECTED_RCV" : "", - OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)? "EAGER_RCV" : "", - OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)? "OTHER RCV" : "", + OPX_IS_ERRORED_RHF(rhf_rcvd, hfi1_type) != 0UL, + OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd, hfi1_type), + (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd, hfi1_type),opx_ep->rx->egrq.last_egrbfr_index, + (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd, hfi1_type), + OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)? "EXPECTED_RCV" : "", + OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)? 
"EAGER_RCV" : "", + OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)? "OTHER RCV" : "", rhf_msb & OPX_WFR_RHF_ICRCERR? "OPX_WFR_RHF_ICRCERR" :"", rhf_msb & OPX_WFR_RHF_LENERR? "OPX_WFR_RHF_LENERR" :"", rhf_msb & OPX_WFR_RHF_ECCERR? "OPX_WFR_RHF_ECCERR" :"", @@ -76,12 +77,12 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, FI_OPX_DEBUG_COUNTERS_INC_COND(rhf_msb & OPX_WFR_RHF_KHDRLENERR,opx_ep->debug_counters.rhf.khdrlenerr); FI_OPX_DEBUG_COUNTERS_INC_COND(rhf_msb & OPX_WFR_RHF_RCVTYPEERR,opx_ep->debug_counters.rhf.rcvtypeerr); /* Count the packet type that had an error */ - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeexp); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeegr); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeoth); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd,hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeexp); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd,hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeegr); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd,hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeoth); #ifdef OPX_VERBOSE_TRIGGER // verbose output - fi_opx_hfi1_dump_packet_hdr (hdr, "OPX_IS_ERRORED_RHF", __LINE__); + fi_opx_hfi1_dump_packet_hdr(hdr, hfi1_type, "OPX_IS_ERRORED_RHF", __LINE__); #endif return; diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 8e005c83a4b..7967fdcf13b 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -59,7 +59,7 @@ union fi_opx_addr opx_default_addr = { .hfi1_rx = 0, .hfi1_unit = 0xff, .reliability_rx = 0, - .uid = { .lid = 0xffff, .endpoint_id = 0xffff }, + .uid = {.lid = 0xffff, .lid_3B = 0xff, .endpoint_id = 0xff }, .rx_index = 0, }; @@ -158,7 +158,7 @@ int fi_opx_check_info(const struct fi_info *info) static int fi_opx_fillinfo(struct fi_info *fi, const char *node, const char* service, const struct fi_info *hints, - uint64_t flags, enum fi_progress progress) + uint64_t flags, enum fi_progress progress) { int ret; uint64_t caps; @@ -172,19 +172,11 @@ static int fi_opx_fillinfo(struct fi_info *fi, const char *node, if (!hints && !node && !service) goto err; - if (hints && (((hints->mode & FI_CONTEXT) != 0) && ((hints->mode & FI_CONTEXT2) == 0))) { - FI_WARN(fi_opx_global.prov, FI_LOG_FABRIC, - "FI_CONTEXT mode is not supported. Use FI_CONTEXT2 mode instead.\n"); - errno = FI_ENODATA; - return -errno; - } - fi->next = NULL; fi->caps = FI_OPX_DEFAULT_CAPS; /* set the mode that we require */ fi->mode = FI_ASYNC_IOV; - fi->mode |= (FI_CONTEXT2); fi->addr_format = FI_ADDR_OPX; fi->src_addrlen = 0; @@ -195,7 +187,7 @@ static int fi_opx_fillinfo(struct fi_info *fi, const char *node, // Process the node field. Service is treated identically to node. if (node) { if (!ofi_str_toaddr(node, &fmt, (void **)&addr, &len) && - fmt == FI_ADDR_OPX) { + fmt == FI_ADDR_OPX) { if (flags & FI_SOURCE) { fi->src_addr = addr; fi->src_addrlen = sizeof(union fi_opx_addr); @@ -404,6 +396,9 @@ static int fi_opx_fillinfo(struct fi_info *fi, const char *node, fi->ep_attr->rx_ctx_cnt = hints->ep_attr->rx_ctx_cnt; /* TODO - check */ } + fi->nic = ofi_nic_dup(NULL); + fi->nic->bus_attr->bus_type = FI_BUS_PCI; + return 0; err: @@ -649,10 +644,10 @@ struct fi_provider fi_opx_provider = { */ static void do_static_assert_tests() { - // Verify that pio_state is exactly one cache-line long. 
*/ + // Verify that pio_state is exactly one cache-line long. OPX_COMPILE_TIME_ASSERT((sizeof(union fi_opx_hfi1_pio_state) == 8), "fi_opx_hfi1_pio_state size error."); - // Verify that pointers are exactly one cache-line long. */ + // Verify that pointers are exactly one cache-line long. OPX_COMPILE_TIME_ASSERT((sizeof(union fi_opx_hfi1_pio_state*) == 8), "fi_opx_hfi1_pio_state pointer size error."); @@ -660,23 +655,16 @@ static void do_static_assert_tests() OPX_COMPILE_TIME_ASSERT(sizeof(*payload) == sizeof(payload->tid_cts), "Expected TID rendezvous CTS payload size error"); - OPX_COMPILE_TIME_ASSERT(sizeof(*payload) == sizeof(payload->rendezvous.contiguous), + OPX_COMPILE_TIME_ASSERT(sizeof(*payload) >= sizeof(payload->rendezvous.contiguous), "Contiguous rendezvous payload size error"); OPX_COMPILE_TIME_ASSERT(sizeof(*payload) == sizeof(payload->rendezvous.noncontiguous), "Non-contiguous rendezvous payload size error"); - OPX_COMPILE_TIME_ASSERT(sizeof(struct fi_context2) == sizeof(union fi_opx_context), - "fi_opx_context size error"); - - OPX_COMPILE_TIME_ASSERT((sizeof(struct fi_opx_context_ext) & 0x1F) == 0, - "sizeof(fi_opx_context_ext) should be a multiple of 32") ; OPX_COMPILE_TIME_ASSERT((sizeof(struct fi_opx_hmem_info) >> 3) == OPX_HMEM_SIZE_QWS, "sizeof(fi_opx_hmem_info) >> 3 != OPX_HMEM_SIZE_QWS") ; OPX_COMPILE_TIME_ASSERT(OPX_HFI1_TID_PAGESIZE == 4096, "OPX_HFI1_TID_PAGESIZE must be 4K!"); - OPX_COMPILE_TIME_ASSERT(OPX_MR != OFI_MR_UNSPEC, - "OPX_MR should be set to 'FI_MR_SCALABLE' or 'FI_MR_BASIC', not 'FI_MR_UNSPEC'"); } #pragma GCC diagnostic pop
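OPX_COMPILE_TIME_ASSERT turns these layout invariants into build failures instead of runtime surprises. A minimal equivalent of what the provider's macro does, written with C11's static_assert and an illustrative stand-in type:

#include <assert.h>
#include <stdint.h>

/* Illustrative stand-in for fi_opx_hfi1_pio_state. */
union pio_state_sketch {
	uint64_t qw0;
};

/* Compilation fails here if the union ever grows past one qword. */
static_assert(sizeof(union pio_state_sketch) == 8,
	      "pio_state_sketch must stay exactly 8 bytes");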
@@ -703,10 +691,12 @@ OPX_INI fi_opx_init = 1; - fi_param_define(&fi_opx_provider, "uuid", FI_PARAM_STRING, "Globally unique ID for preventing OPX jobs from conflicting either in shared memory or over the OPX fabric. Defaults to \"%s\"", + fi_param_define(&fi_opx_provider, "uuid", FI_PARAM_STRING, "Globally unique ID for preventing OPX jobs from conflicting either in shared memory or over the OPX fabric. Defaults to the Slurm job ID if one exists, otherwise defaults to Intel MPI UUID if one exists, otherwise defaults to \"%s\"", OPX_DEFAULT_JOB_KEY_STR); fi_param_define(&fi_opx_provider, "force_cpuaffinity", FI_PARAM_BOOL, "Causes the thread to bind itself to the cpu core it is running on. Defaults to \"No\""); fi_param_define(&fi_opx_provider, "reliability_service_usec_max", FI_PARAM_INT, "The number of microseconds between pings for un-acknowledged packets. Defaults to 500 usec."); + fi_param_define(&fi_opx_provider, "reliability_max_uncongested_pings", FI_PARAM_INT, "The maximum number of reliability pings sent in a single timer iteration when the network link is uncongested. Value must be between %d and %d. Defaults to %d.", OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT); + fi_param_define(&fi_opx_provider, "reliability_max_congested_pings", FI_PARAM_INT, "The maximum number of reliability pings sent in a single timer iteration when the network link is congested. Value must be between %d and %d. Defaults to %d.", OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT); fi_param_define(&fi_opx_provider, "reliability_service_pre_ack_rate", FI_PARAM_INT, "The number of packets to receive from a particular sender before preemptively acknowledging them without waiting for a ping. Valid values are powers of 2 in the range of 0-32,768, where 0 indicates no preemptive acking. Defaults to 64."); fi_param_define(&fi_opx_provider, "selinux", FI_PARAM_BOOL, "Set to true if you're running a security-enhanced Linux. This enables updating the Jkey used based on system settings. Defaults to \"No\""); fi_param_define(&fi_opx_provider, "hfi_select", FI_PARAM_STRING, "Overrides the normal algorithm used to choose which HFI a process will use. See the documentation for more information."); @@ -716,6 +706,9 @@ OPX_INI fi_param_define(&fi_opx_provider, "sdma_bounce_buf_threshold", FI_PARAM_INT, "The maximum message length in bytes that will be copied to the SDMA bounce buffer. For messages larger than this threshold, the send will not be completed until receiver has ACKed. Value must be between %d and %d. Defaults to %d.", OPX_SDMA_BOUNCE_BUF_MIN, OPX_SDMA_BOUNCE_BUF_MAX, OPX_SDMA_BOUNCE_BUF_THRESHOLD); fi_param_define(&fi_opx_provider, "sdma_disable", FI_PARAM_INT, "Disables SDMA offload hardware. Default is 0"); fi_param_define(&fi_opx_provider, "sdma_min_payload_bytes", FI_PARAM_INT, "The minimum message length in bytes where SDMA will be used. For messages smaller than this threshold, the send will be completed using PIO. Value must be between %d and %d. Defaults to %d.", FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT); + fi_param_define(&fi_opx_provider, "tid_min_payload_bytes", FI_PARAM_INT, + "The minimum message length in bytes where TID will be used. Value must be >= %d. Defaults to %d.", + OPX_TID_MIN_PAYLOAD_BYTES_MIN, OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT); fi_param_define(&fi_opx_provider, "expected_receive_enable", FI_PARAM_BOOL, "Enables expected receive rendezvous using Token ID (TID). Defaults to \"No\"."); fi_param_define(&fi_opx_provider, "prog_affinity", FI_PARAM_STRING, "When set, specify the set of CPU cores to set the progress " @@ -738,6 +731,7 @@ OPX_INI /* CN5000 only */ fi_param_define(&fi_opx_provider, "rate_control", FI_PARAM_INT,"Rate control (CN5000 only). Values can range from 0-7. 0-3 is used for in-order and 4-7 is used for out-of-order. Default is %d\n", OPX_BTH_RC2_DEFAULT); // fi_param_define(&fi_opx_provider, "varname", FI_PARAM_*, "help"); + fi_param_define(&fi_opx_provider, "mixed_network", FI_PARAM_INT, "Indicates a mixed network of OPA100 and CN5000. Needs to be set to 1 when mixed network is used. Default is 0.\n"); /* Track TID and HMEM domains so caches can be cleared on exit */ dlist_init(&fi_opx_global.tid_domain_list);
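Each fi_param_define call above only registers the environment variable and its help text; the provider reads the value back later with the matching fi_param_get_* call, which leaves the compiled-in default untouched when the variable is unset. A sketch of the read side for a made-up integer parameter (the define/get pairing is standard libfabric; "example_depth" is not a real OPX parameter):

#include <rdma/fabric.h>
#include <rdma/providers/fi_prov.h>

/* Read FI_OPX_EXAMPLE_DEPTH if the user set it; otherwise keep the default. */
static int opx_read_example_depth(struct fi_provider *prov)
{
	int depth = 16; /* compiled-in default */

	/* fi_param_get_int() returns FI_SUCCESS only when the environment
	 * variable was actually set and parsed; otherwise depth is unchanged. */
	fi_param_get_int(prov, "example_depth", &depth);
	return depth;
}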
diff --git a/prov/opx/src/fi_opx_msg.c b/prov/opx/src/fi_opx_msg.c index 3674366c32a..0ec84a5a183 100644 --- a/prov/opx/src/fi_opx_msg.c +++ b/prov/opx/src/fi_opx_msg.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -42,8 +42,6 @@ ssize_t fi_opx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->threading; const enum fi_av_type av_type = opx_ep->av_type; @@ -61,7 +59,8 @@ ssize_t fi_opx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } return fi_opx_ep_tx_send(ep, msg->msg_iov, msg->iov_count, @@ -72,15 +71,14 @@ ssize_t fi_opx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } ssize_t fi_opx_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->threading; const enum fi_av_type av_type = opx_ep->av_type; @@ -98,7 +96,8 @@ ssize_t fi_opx_sendv(struct fid_ep *ep, const struct iovec *iov, OPX_FLAGS_OVERRIDE_FALSE, 0, /* flags */ caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } return fi_opx_ep_tx_send(ep, iov, count, desc, dest_addr, 0, context, 0, @@ -108,167 +107,176 @@ ssize_t fi_opx_sendv(struct fid_ep *ep, const struct iovec *iov, OPX_FLAGS_OVERRIDE_FALSE, 0, /* flags */ caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } ssize_t fi_opx_senddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - errno = FI_ENOSYS; return -errno; } /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) 
-FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 
0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) /* FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - - -#define FI_OPX_MSG_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) - -#define FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ops_msg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY - -#define FI_OPX_MSG_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY) \ -static struct fi_ops_msg \ - FI_OPX_MSG_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY) __attribute__ ((unused)) = { \ - .size = sizeof(struct fi_ops_msg), \ - .recv = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, LOCK, AV, CAPS, RELIABILITY), \ - .recvv = fi_no_msg_recvv, \ - .recvmsg = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, LOCK, AV, CAPS, RELIABILITY), \ - .send = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, LOCK, AV, CAPS, RELIABILITY), \ - .sendv = fi_opx_sendv, \ - .sendmsg = fi_opx_sendmsg, \ - .inject = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, LOCK, AV, CAPS, RELIABILITY), \ - .senddata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, LOCK, AV, CAPS, RELIABILITY), \ - .injectdata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, LOCK, AV, CAPS, RELIABILITY),\ 
+FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) + + + + +#define FI_OPX_MSG_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) + +#define FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_ops_msg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE + +#define FI_OPX_MSG_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ +static struct fi_ops_msg \ + FI_OPX_MSG_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY, HFI1_TYPE) __attribute__ ((unused)) = { \ + .size = sizeof(struct fi_ops_msg), \ + .recv = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .recvv = fi_no_msg_recvv, \ + .recvmsg = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .send = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .sendv = fi_opx_sendv, \ + .sendmsg = fi_opx_sendmsg, \ + .inject = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .senddata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .injectdata = 
FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ }
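The _NAME/_NAME_ pair above is the classic two-level token-pasting trick: the outer macro forces its arguments to expand first, and only then does the inner macro glue the expanded tokens into one identifier, so every (lock, AV, caps, reliability, HFI type) tuple stamps out its own specialized ops struct. A reduced, self-contained sketch of the technique:

#include <stdio.h>

/* Two-level pasting: OPS_NAME expands its arguments, then OPS_NAME_
 * joins the expanded tokens into a single identifier. */
#define OPS_NAME(LOCK, AV)  OPS_NAME_(LOCK, AV)
#define OPS_NAME_(LOCK, AV) ops_ ## LOCK ## _ ## AV

#define DEFINE_OPS(LOCK, AV)                                   \
	static void OPS_NAME(LOCK, AV)(void)                   \
	{                                                      \
		printf("specialized for " #LOCK " " #AV "\n"); \
	}

#define LOCK_REQUIRED locked
#define AV_TABLE      table

DEFINE_OPS(LOCK_REQUIRED, AV_TABLE) /* defines ops_locked_table() */

int main(void)
{
	ops_locked_table();
	return 0;
}

Without the extra indirection, ## would paste the unexpanded argument names and every instantiation would collide on the same identifier.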
OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); /* FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); 
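
These hunks instantiate one specialized fi_ops_msg table per (lock, AV type, caps, reliability, HFI1 type) combination through two-level token pasting, and the endpoint later binds one of them once at setup. As a reader aid, here is a minimal, self-contained sketch of the same idiom; every name in it (ops, OPS_NAME, DEFINE_OPS, select_ops) is illustrative only and not an OPX symbol. The double macro layer exists so that arguments which are themselves macros expand fully before pasting, and the runtime branch is paid once at selection time rather than on every call.

#include <stdio.h>

struct ops {
	void (*send)(void);
};

/* Two-level pasting: the outer macro expands its arguments first, so a
 * caller may pass another macro and still get the expanded token in
 * the generated identifier. */
#define OPS_NAME(LOCK, TYPE)	OPS_NAME_(LOCK, TYPE)
#define OPS_NAME_(LOCK, TYPE)	ops_ ## LOCK ## _ ## TYPE
#define FUNC_NAME(LOCK, TYPE)	FUNC_NAME_(LOCK, TYPE)
#define FUNC_NAME_(LOCK, TYPE)	send_ ## LOCK ## _ ## TYPE

/* One specialized function and one statically named ops table per
 * (LOCK, TYPE) combination; unused instances are tolerated via the
 * attribute, just as in the tables above. */
#define DEFINE_OPS(LOCK, TYPE)						\
static void FUNC_NAME(LOCK, TYPE)(void)					\
{									\
	printf("send specialized for " #LOCK "/" #TYPE "\n");		\
}									\
static struct ops OPS_NAME(LOCK, TYPE) __attribute__((unused)) = {	\
	.send = FUNC_NAME(LOCK, TYPE),					\
}

DEFINE_OPS(LOCK_NOT_REQUIRED, WFR);
DEFINE_OPS(LOCK_REQUIRED, WFR);

/* Runtime selection happens once, so the fast path always calls
 * through a pointer to a fully specialized function. */
static const struct ops *select_ops(int lock_required)
{
	return lock_required ? &OPS_NAME(LOCK_REQUIRED, WFR)
			     : &OPS_NAME(LOCK_NOT_REQUIRED, WFR);
}

int main(void)
{
	select_ops(0)->send();
	select_ops(1)->send();
	return 0;
}
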
-FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); static struct fi_ops_msg fi_opx_no_msg_ops = { @@ -336,110 +344,146 @@ int fi_opx_enable_msg_ops(struct fid_ep *ep) const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); const enum ofi_reliability_kind reliability = opx_ep->reliability->state.kind; + if (OFI_UNLIKELY(reliability != OFI_RELIABILITY_KIND_ONLOAD)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Invalid reliability kind %u\n", reliability); + return -FI_EINVAL; + } - if (!lock_required) { - if (opx_ep->av->type == FI_AV_TABLE) { - if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == FI_REMOTE_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - 
opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } + } else { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else if (opx_ep->av->type == FI_AV_MAP) { - - if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == FI_REMOTE_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B){ + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } else { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } + } } else { - if (opx_ep->av->type == FI_AV_TABLE) { - - if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == FI_REMOTE_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else if (opx_ep->av->type == FI_AV_MAP) { - - if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == FI_REMOTE_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + } else { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } } diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index cf3e538d3ca..bda8429269e 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -421,15 +421,17 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, - const uint64_t opcode) + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1)) { + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); - if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1) { + if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed) { opx_ep->tx->pio_state->qw0 = pio_state.qw0; return -FI_EAGAIN; } @@ -437,31 +439,59 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, const uint64_t lrh_dlid = dlid << 16; const uint64_t bth_rx = reliability_rx << 56; - struct fi_opx_hfi1_txe_scb model = opx_ep->reliability->service.tx.hfi1.ping_model; - model.hdr.ud.opcode = opcode; volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - OPX_HFI1_BAR_STORE(&scb[0], (model.qw0 | OPX_PBC_CR(0x1) | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid))); - OPX_HFI1_BAR_STORE(&scb[1], (model.hdr.qw[0] | lrh_dlid)); - OPX_HFI1_BAR_STORE(&scb[2], (model.hdr.qw[1] | bth_rx)); - OPX_HFI1_BAR_STORE(&scb[3], model.hdr.qw[2]); - OPX_HFI1_BAR_STORE(&scb[4], model.hdr.qw[3]); - OPX_HFI1_BAR_STORE(&scb[5], 0UL); - OPX_HFI1_BAR_STORE(&scb[6], 0UL); - OPX_HFI1_BAR_STORE(&scb[7], key); - - - /* consume one credit for the packet header */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + if ((hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B))) { + struct fi_opx_hfi1_txe_scb_9B model_9B = opx_ep->reliability->service.tx.hfi1.ping_model_9B; + model_9B.hdr.ud.opcode = opcode; + OPX_HFI1_BAR_STORE(&scb[0], (model_9B.qw0 | OPX_PBC_CR(0x1, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_9B.hdr.qw_9B[0] | lrh_dlid)); + OPX_HFI1_BAR_STORE(&scb[2], (model_9B.hdr.qw_9B[1] | bth_rx)); + OPX_HFI1_BAR_STORE(&scb[3], model_9B.hdr.qw_9B[2]); + OPX_HFI1_BAR_STORE(&scb[4], model_9B.hdr.qw_9B[3]); + OPX_HFI1_BAR_STORE(&scb[5], 0UL); + OPX_HFI1_BAR_STORE(&scb[6], 0UL); + OPX_HFI1_BAR_STORE(&scb[7], key); + + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } else { + struct fi_opx_hfi1_txe_scb_16B model_16B = opx_ep->reliability->service.tx.hfi1.ping_model_16B; + model_16B.hdr.ud.opcode = opcode; + OPX_HFI1_BAR_STORE(&scb[0], (model_16B.qw0 | OPX_PBC_CR(1, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_16B.hdr.qw_16B[0] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[2], (model_16B.hdr.qw_16B[1] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[3], 
model_16B.hdr.qw_16B[2] | bth_rx); + OPX_HFI1_BAR_STORE(&scb[4], model_16B.hdr.qw_16B[3]); + OPX_HFI1_BAR_STORE(&scb[5], model_16B.hdr.qw_16B[4]); + OPX_HFI1_BAR_STORE(&scb[6], 0UL); + OPX_HFI1_BAR_STORE(&scb[7], 0UL); + FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + volatile uint64_t * const scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + OPX_HFI1_BAR_STORE(&scb_payload[0], key); + OPX_HFI1_BAR_STORE(&scb_payload[1], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[2], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[3], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[4], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[5], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[6], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[7], OPX_JKR_16B_PAD_QWORD); + FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1); + } FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); /* save the updated txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; - return FI_SUCCESS; } @@ -471,7 +501,8 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, void opx_reliability_handshake_init(struct fid_ep *ep, union fi_opx_reliability_service_flow_key key, - const uint64_t target_reliability_rx) + const uint64_t target_reliability_rx, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -481,7 +512,8 @@ void opx_reliability_handshake_init(struct fid_ep *ep, fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.dlid, target_reliability_rx, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT, + hfi1_type); uint64_t value = 1; rbtInsert(opx_ep->reliability->service.handshake_init, (void*)key.value, (void*)value); @@ -495,7 +527,8 @@ void opx_reliability_handshake_init(struct fid_ep *ep, fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.dlid, target_reliability_rx, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT, + hfi1_type); } (*count_ptr)++; @@ -505,11 +538,12 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_init(struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, - const uint64_t opcode) + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type) { assert(opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT || opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT_ACK); - ssize_t rc = fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode); + ssize_t rc = fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode, hfi1_type); if (OFI_UNLIKELY(rc)) { #ifdef OPX_RELIABILITY_DEBUG @@ -542,7 +576,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_resynch(struct fid_ep *ep, { assert(opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_RESYNCH || opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_RESYNCH_ACK); - ssize_t rc = fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode); + ssize_t rc = fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode, OPX_HFI1_TYPE); if (OFI_UNLIKELY(rc)) { #ifdef OPX_RELIABILITY_DEBUG @@ -570,7 +604,8 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_resynch(struct fid_ep *ep, ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, const uint64_t psn_start, const uint64_t 
psn_count, - const uint64_t opcode) + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -580,9 +615,10 @@ ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t psn_start_24 = psn_start & MAX_PSN; const uint64_t psn_count_24 = MIN(psn_count, MAX_PSN-psn_start_24 + 1); - if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1)) { + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); - if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1) { + if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed) { /* * no credits available @@ -602,6 +638,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, } #endif opx_ep->tx->pio_state->qw0 = pio_state.qw0; + return -FI_EAGAIN; } } @@ -639,44 +676,83 @@ ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t lrh_dlid = dlid << 16; const uint64_t bth_rx = reliability_rx << 56; - const struct fi_opx_hfi1_txe_scb * const model = + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const struct fi_opx_hfi1_txe_scb_9B * const model = opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING ? - &opx_ep->reliability->service.tx.hfi1.ping_model : + &opx_ep->reliability->service.tx.hfi1.ping_model_9B : ( opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK ? - &opx_ep->reliability->service.tx.hfi1.ack_model : - &opx_ep->reliability->service.tx.hfi1.nack_model ); + &opx_ep->reliability->service.tx.hfi1.ack_model_9B : + &opx_ep->reliability->service.tx.hfi1.nack_model_9B ); - OPX_HFI1_BAR_STORE(&scb[0], (model->qw0 | OPX_PBC_CR(0x1) | - OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(lrh_dlid))); - OPX_HFI1_BAR_STORE(&scb[1], (model->hdr.qw[0] | lrh_dlid)); - OPX_HFI1_BAR_STORE(&scb[2], (model->hdr.qw[1] | bth_rx)); - OPX_HFI1_BAR_STORE(&scb[3], model->hdr.qw[2]); - OPX_HFI1_BAR_STORE(&scb[4], model->hdr.qw[3]); - OPX_HFI1_BAR_STORE(&scb[5], psn_count_24); - OPX_HFI1_BAR_STORE(&scb[6], psn_start_24); - OPX_HFI1_BAR_STORE(&scb[7], key); /* service.key */ + OPX_HFI1_BAR_STORE(&scb[0], (model->qw0 | OPX_PBC_CR(0x1, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model->hdr.qw_9B[0] | lrh_dlid)); + OPX_HFI1_BAR_STORE(&scb[2], (model->hdr.qw_9B[1] | bth_rx)); + OPX_HFI1_BAR_STORE(&scb[3], model->hdr.qw_9B[2]); + OPX_HFI1_BAR_STORE(&scb[4], model->hdr.qw_9B[3]); + OPX_HFI1_BAR_STORE(&scb[5], psn_count_24); + OPX_HFI1_BAR_STORE(&scb[6], psn_start_24); + OPX_HFI1_BAR_STORE(&scb[7], key); /* service.key */ - //fi_opx_hfi1_dump_stl_packet_hdr((struct fi_opx_hfi1_stl_packet_hdr *)&tmp[1], __func__, __LINE__); + //fi_opx_hfi1_dump_stl_packet_hdr((struct fi_opx_hfi1_stl_packet_hdr_9B *)&tmp[1], __func__, __LINE__); - /* consume one credit for the packet header */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + } else { + const struct fi_opx_hfi1_txe_scb_16B * const model_16B = + opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING ? + &opx_ep->reliability->service.tx.hfi1.ping_model_16B : + ( opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK ? 
+ &opx_ep->reliability->service.tx.hfi1.ack_model_16B : + &opx_ep->reliability->service.tx.hfi1.nack_model_16B ); + OPX_HFI1_BAR_STORE(&scb[0], (model_16B->qw0 | OPX_PBC_CR(1, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_16B->hdr.qw_16B[0] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[2], (model_16B->hdr.qw_16B[1] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[3], model_16B->hdr.qw_16B[2] | bth_rx); + OPX_HFI1_BAR_STORE(&scb[4], model_16B->hdr.qw_16B[3]); + OPX_HFI1_BAR_STORE(&scb[5], model_16B->hdr.qw_16B[4]); + OPX_HFI1_BAR_STORE(&scb[6], psn_count_24); + OPX_HFI1_BAR_STORE(&scb[7], psn_start_24); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_INFO(fi_opx_global.prov, FI_LOG_EP_DATA, "Completed cacheline 1\n"); + + volatile uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + OPX_HFI1_BAR_STORE(&scb2[0], key); + OPX_HFI1_BAR_STORE(&scb2[1], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[2], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[3], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[4], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[5], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[6], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[7], OPX_JKR_16B_PAD_QWORD); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + } /* save the updated txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; - return FI_SUCCESS; } void fi_opx_hfi1_rx_reliability_send_pre_acks(struct fid_ep *ep, const uint64_t dlid, const uint64_t reliability_rx, const uint64_t psn_start, const uint64_t psn_count, - const union fi_opx_hfi1_packet_hdr *const hdr, - const uint8_t origin_rx) + const union opx_hfi1_packet_hdr *const hdr, + const uint8_t origin_rx, + const uint32_t slid, + const enum opx_hfi1_type hfi1_type) { OPX_TRACER_TRACE_RELI(OPX_TRACER_BEGIN, "RX_RELI_SEND_PRE_ACKS"); - const uint64_t slid = hdr->stl.lrh.slid; const union fi_opx_reliability_service_flow_key key = { .slid = slid, @@ -688,7 +764,8 @@ void fi_opx_hfi1_rx_reliability_send_pre_acks(struct fid_ep *ep, const uint64_t rc = fi_opx_hfi1_tx_reliability_inject(ep, (uint64_t)key.value, slid, origin_rx, psn_start, psn_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_ACKS_SENT, key.value, psn_start, psn_count); OPX_TRACER_TRACE_RELI(OPX_TRACER_END_SUCCESS, "RX_RELI_SEND_PRE_ACKS"); } @@ -720,7 +797,7 @@ ssize_t fi_opx_hfi1_rx_reliability_ping_response (struct fid_ep *ep, key, slid, rx, psn_start_24, psn_count_24, - opcode); + opcode, OPX_HFI1_TYPE); INC_PING_STAT_COND(rc == FI_SUCCESS, opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK ? 
ACKS_SENT : NACKS_SENT, key, psn_start_24, psn_count_24); @@ -764,7 +841,8 @@ void fi_opx_hfi1_rx_reliability_ping (struct fid_ep *ep, key, slid, rx, 0, /* psn_start */ 1, /* psn_count */ - FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, + OPX_HFI1_TYPE); INC_PING_STAT_COND(rc == FI_SUCCESS, NACKS_SENT, key, 0, 1); OPX_TRACER_TRACE_RELI(OPX_TRACER_END_ERROR, "RX_RELI_PING"); return; @@ -987,14 +1065,14 @@ void fi_opx_hfi1_reliability_iov_payload_check( "orig_payload[%d]=%016lX current[@%p]=%016lX\n", getpid(), file, func, line, key, - FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr), - FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay)), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay)), replay->sdma_we, bounce_buf, use_bounce_buf, pending_bounce_buf, we_cc, we_cc_byte_counter, replay->cc_ptr, replay->cc_ptr->byte_counter, replay->cc_dec, cc_next, cc_next_byte_counter, - replay->scb.hdr.stl.bth.opcode, - replay->scb.hdr.dput.target.opcode, + OPX_REPLAY_HDR(replay)->bth.opcode, + OPX_REPLAY_HDR(replay)->dput.target.opcode, replay->iov->iov_base, replay->iov->iov_len, error_msg, i, replay->orig_payload[i], @@ -1063,8 +1141,9 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, * q doesn't contain a rollover (i.e, the tail's PSN >= the head's PSN) * we can just retire all elements in the queue */ - uint32_t head_psn = FI_OPX_HFI1_PACKET_PSN(&head->scb.hdr); - uint32_t tail_psn = FI_OPX_HFI1_PACKET_PSN(&tail->scb.hdr); + + uint32_t head_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(head)); + uint32_t tail_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tail)); if ((head_psn >= psn_start) && (tail_psn <= psn_stop) && (tail_psn >= head_psn)) { #ifdef OPX_RELIABILITY_DEBUG @@ -1089,7 +1168,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, last_ack_index += snprintf(&last_ack[last_ack_index], LAST_ACK_LEN-last_ack_index, "(tx) packet %016lx %08x retired (fast path).\n", - key, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif next = tmp->next; @@ -1108,8 +1187,17 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } } - const uint16_t lrh_pktlen_le = ntohs(tmp->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(tmp->scb.scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + } else { + lrh_pktlen_le = tmp->scb.scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + } tmp->psn_ptr->psn.bytes_outstanding -= total_bytes; assert((int32_t)tmp->psn_ptr->psn.bytes_outstanding >= 0); @@ -1120,7 +1208,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } else { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u ACK'd but pinned, marking as ACK'd and skipping free of replay.\n", - key, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif tmp->acked = true; } @@ -1144,7 +1232,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, uint32_t start_psn = head_psn; while ((start_psn < psn_start) && (start != tail)) { start = start->next; - start_psn = FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr); + start_psn = 
FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)); } if (OFI_UNLIKELY(start_psn < psn_start)) { @@ -1170,12 +1258,12 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, struct fi_opx_reliability_tx_replay * stop = start; uint32_t stop_psn = start_psn; - uint32_t stop_next_psn = FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr); + uint32_t stop_next_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next)); while ((stop->next != head) && (stop_next_psn <= psn_stop) && (stop_next_psn > psn_start)) { stop_psn = stop_next_psn; stop = stop->next; - stop_next_psn = FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr); + stop_next_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next)); } if (OFI_UNLIKELY(stop_psn > psn_stop)) { @@ -1212,9 +1300,9 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, if (last_ack_index < LAST_ACK_LEN) last_ack_index+=snprintf(&last_ack[last_ack_index],LAST_ACK_LEN-last_ack_index, "(tx) Start = %x, Stop = %x, Halt = %x\n", - FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr), - FI_OPX_HFI1_PACKET_PSN(&stop->scb.hdr), - FI_OPX_HFI1_PACKET_PSN(&halt->scb.hdr)); + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop)), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(halt))); #endif /* remove the psn range to ack from the queue */ @@ -1232,7 +1320,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, if (last_ack_index < LAST_ACK_LEN) last_ack_index+=snprintf(&last_ack[last_ack_index],LAST_ACK_LEN-last_ack_index, "(tx) packet %016lx %08x retired (slow path).\n", key, - FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif struct fi_opx_reliability_tx_replay * next = tmp->next; @@ -1251,8 +1339,16 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } } - const uint16_t lrh_pktlen_le = ntohs(tmp->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(tmp->scb.scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + } else { + lrh_pktlen_le = tmp->scb.scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + } tmp->psn_ptr->psn.bytes_outstanding -= total_bytes; assert((int32_t)tmp->psn_ptr->psn.bytes_outstanding >= 0); @@ -1263,7 +1359,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } else { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u ACK'd but pinned, marking as ACK'd and skipping free of replay.\n", - key, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif tmp->acked = true; } @@ -1290,12 +1386,12 @@ ssize_t fi_opx_reliability_sdma_replay_complete (union fi_opx_reliability_deferr if (OFI_UNLIKELY(we->comp_state == OPX_SDMA_COMP_ERROR)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Failed sending replay with PSN %u (%X) via SDMA\n", - (uint32_t)FI_OPX_HFI1_PACKET_PSN(&we->replay->scb.hdr), - (uint32_t)FI_OPX_HFI1_PACKET_PSN(&we->replay->scb.hdr)); + (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(we->replay)), + (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(we->replay))); #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) replay packet %016lx %08u failed sending via SDMA.\n", params->flow_key, - 
FI_OPX_HFI1_PACKET_PSN(&we->replay->scb.hdr)); + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(we->replay))); #endif } @@ -1310,7 +1406,7 @@ ssize_t fi_opx_reliability_sdma_replay_complete (union fi_opx_reliability_deferr fprintf(stderr, "(tx) packet %016lx %08u replay over SDMA complete and ACK'd, freeing replay\n", params->flow_key, - (uint32_t)we->replay->scb.hdr.reliability.psn); + (uint32_t)(OPX_REPLAY_HDR(we->replay)->reliability.psn)); #endif fi_opx_reliability_client_replay_deallocate(&opx_ep->reliability->state, we->replay); @@ -1319,7 +1415,7 @@ ssize_t fi_opx_reliability_sdma_replay_complete (union fi_opx_reliability_deferr fprintf(stderr, "(tx) packet %016lx %08u replay over SDMA complete, un-pinning replay\n", params->flow_key, - (uint32_t)we->replay->scb.hdr.reliability.psn); + (uint32_t)(OPX_REPLAY_HDR(we->replay)->reliability.psn)); #endif } slist_remove_head(¶ms->sdma_reqs); @@ -1362,11 +1458,18 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, #if defined(OPX_RELIABILITY_DEBUG) || !defined(NDEBUG) union fi_opx_reliability_service_flow_key key; - key.slid = (uint32_t)start_replay->scb.hdr.stl.lrh.slid; - key.tx = (uint32_t)start_replay->scb.hdr.reliability.origin_tx; - key.dlid = (uint32_t)start_replay->scb.hdr.stl.lrh.dlid; - key.rx = (uint32_t)start_replay->scb.hdr.stl.bth.rx; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + key.slid = (uint32_t)start_replay->scb.scb_9B.hdr.lrh_9B.slid; + key.dlid = (uint32_t)start_replay->scb.scb_9B.hdr.lrh_9B.dlid; + } + else { + key.slid = htons(start_replay->scb.scb_16B.hdr.lrh_16B.slid20 << 20 | start_replay->scb.scb_16B.hdr.lrh_16B.slid); + key.dlid = htons(start_replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | start_replay->scb.scb_16B.hdr.lrh_16B.dlid); + } + key.tx = (uint32_t)(OPX_REPLAY_HDR(start_replay)->reliability.origin_tx); + key.rx = (uint32_t)(OPX_REPLAY_HDR(start_replay)->bth.rx); #endif + uint32_t replayed = 0; #ifdef OPX_RELIABILITY_DEBUG @@ -1413,7 +1516,7 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u size %ld bytes replay injected over SDMA\n", - key.value, (uint32_t) replay->scb.hdr.reliability.psn, + key.value, (uint32_t) (OPX_REPLAY_HDR(replay)->reliability.psn), payload_size); #endif replay->pinned = true; @@ -1445,25 +1548,57 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service #if defined(OPX_RELIABILITY_DEBUG) || !defined(NDEBUG) union fi_opx_reliability_service_flow_key key; - key.slid = (uint32_t)replay->scb.hdr.stl.lrh.slid; - key.tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(&replay->scb.hdr); - key.dlid = (uint32_t)replay->scb.hdr.stl.lrh.dlid; - key.rx = (uint32_t)replay->scb.hdr.stl.bth.rx; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + key.slid = (uint32_t)replay->scb.scb_9B.hdr.lrh_9B.slid; + key.dlid = (uint32_t)replay->scb.scb_9B.hdr.lrh_9B.dlid; + } else { + key.slid = htons(replay->scb.scb_16B.hdr.lrh_16B.slid20 << 20 | replay->scb.scb_16B.hdr.lrh_16B.slid); + key.dlid = htons(replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb.scb_16B.hdr.lrh_16B.dlid); + } + key.tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR(replay)); + key.rx = (uint32_t)(OPX_REPLAY_HDR(replay)->bth.rx); #endif + /* runtime checks for non-inlined functions */ + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; + /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = 
ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; - uint16_t payload_credits_needed = - (payload_bytes_to_copy >> 6) + /* number of full 64-byte blocks of payload */ - ((payload_bytes_to_copy & 0x000000000000003Ful) != 0); /* number of partial 64-byte blocks of payload */ + size_t total_bytes_to_copy; + size_t payload_bytes_to_copy; /* payload without (16B) icrc tail */ + uint16_t payload_credits_needed; - union fi_opx_hfi1_pio_state pio_state = *service->tx.hfi1.pio_state; + uint32_t payload_qw_to_copy_with_header = 0; + bool tail_block_needed = false; /* 16B tail needed */ + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); + total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + payload_credits_needed = (payload_bytes_to_copy >> 6); /* number of full 64-byte blocks of payload */ + } else { + lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; + total_bytes_to_copy = (lrh_pktlen_le) * 8; /* including trailing icrc */ + /* do not copy icrc, it is "pad" not user data */ + payload_bytes_to_copy = (total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B) - 8 /* icrc */); + payload_qw_to_copy_with_header = MIN((7*8), payload_bytes_to_copy)>>3; /* up to 7 qwords */ + assert(payload_bytes_to_copy >= payload_qw_to_copy_with_header * 8); + payload_bytes_to_copy -= payload_qw_to_copy_with_header<<3; + payload_credits_needed = (payload_bytes_to_copy >> 6); /* number of full 64-byte blocks of payload - icrc */ + if (payload_qw_to_copy_with_header >= 7) { /* if tail is not in with hdr/payload block */ + tail_block_needed = true; /* tail needed even if there's no partial payload block*/ + } + } + union fi_opx_hfi1_pio_state pio_state = *service->tx.hfi1.pio_state; FI_OPX_HFI1_UPDATE_CREDITS(pio_state, service->tx.hfi1.pio_credits_addr); - const uint16_t total_credits_needed = payload_credits_needed + 1; + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + const uint16_t last_partial_block = (((payload_bytes_to_copy & 0x3Ful) || tail_block_needed) ? 
1 : 0); + const uint16_t total_credits_needed = credits_needed + /* header */ + payload_credits_needed + /* full payload blocks */ + last_partial_block ; /* last partial block */ uint16_t total_credits_available = FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state); if (total_credits_available < total_credits_needed) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, service->tx.hfi1.pio_credits_addr); @@ -1471,38 +1606,17 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service if (total_credits_available < total_credits_needed) { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u Couldn't do replay (no credits)\n", - key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr)); + key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay))); #endif service->tx.hfi1.pio_state->qw0 = pio_state.qw0; + return -FI_EAGAIN; } } #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u replay injected\n", - key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr)); -#endif - - volatile uint64_t * const scb = - FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_sop_first, pio_state); - - OPX_HFI1_BAR_STORE(&scb[0], replay->scb.qw0); - OPX_HFI1_BAR_STORE(&scb[1], replay->scb.hdr.qw[0]); - OPX_HFI1_BAR_STORE(&scb[2], replay->scb.hdr.qw[1]); - OPX_HFI1_BAR_STORE(&scb[3], replay->scb.hdr.qw[2]); - OPX_HFI1_BAR_STORE(&scb[4], replay->scb.hdr.qw[3]); - OPX_HFI1_BAR_STORE(&scb[5], replay->scb.hdr.qw[4]); - OPX_HFI1_BAR_STORE(&scb[6], replay->scb.hdr.qw[5]); - OPX_HFI1_BAR_STORE(&scb[7], replay->scb.hdr.qw[6]); - - - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); - - /* consume one credit for the packet header */ - --total_credits_available; - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); -#ifndef NDEBUG - unsigned consumed_credits = 1; + key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay))); #endif uint64_t * buf_qws; @@ -1536,9 +1650,75 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service } else { buf_qws = replay->payload; } +#ifndef NDEBUG + unsigned consumed_credits = 0; +#endif + + volatile uint64_t * const scb = + FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_sop_first, pio_state); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + OPX_HFI1_BAR_STORE(&scb[0], replay->scb.scb_9B.qw0); + OPX_HFI1_BAR_STORE(&scb[1], replay->scb.scb_9B.hdr.qw_9B[0]); + OPX_HFI1_BAR_STORE(&scb[2], replay->scb.scb_9B.hdr.qw_9B[1]); + OPX_HFI1_BAR_STORE(&scb[3], replay->scb.scb_9B.hdr.qw_9B[2]); + OPX_HFI1_BAR_STORE(&scb[4], replay->scb.scb_9B.hdr.qw_9B[3]); + OPX_HFI1_BAR_STORE(&scb[5], replay->scb.scb_9B.hdr.qw_9B[4]); + OPX_HFI1_BAR_STORE(&scb[6], replay->scb.scb_9B.hdr.qw_9B[5]); + OPX_HFI1_BAR_STORE(&scb[7], replay->scb.scb_9B.hdr.qw_9B[6]); + + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); + + /* consume one credit for the packet header */ + --total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + consumed_credits = 1; +#endif + } else { + OPX_HFI1_BAR_STORE(&scb[0], replay->scb.scb_16B.qw0); + OPX_HFI1_BAR_STORE(&scb[1], replay->scb.scb_16B.hdr.qw_16B[0]); + OPX_HFI1_BAR_STORE(&scb[2], replay->scb.scb_16B.hdr.qw_16B[1]); + OPX_HFI1_BAR_STORE(&scb[3], replay->scb.scb_16B.hdr.qw_16B[2]); + OPX_HFI1_BAR_STORE(&scb[4], replay->scb.scb_16B.hdr.qw_16B[3]); + OPX_HFI1_BAR_STORE(&scb[5], replay->scb.scb_16B.hdr.qw_16B[4]); + OPX_HFI1_BAR_STORE(&scb[6], replay->scb.scb_16B.hdr.qw_16B[5]); + OPX_HFI1_BAR_STORE(&scb[7], 
replay->scb.scb_16B.hdr.qw_16B[6]);
-	while (payload_credits_needed > 0) {
+		FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr));
+		/* consume one credit for the packet header */
+		--total_credits_available;
+		FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state);
+
+		volatile uint64_t * scb_payload =
+			FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state);
+
+		// spill from 1st cacheline (SOP)
+		OPX_HFI1_BAR_STORE(&scb_payload[0], replay->scb.scb_16B.hdr.qw_16B[7]); // header
+
+		int i;
+
+		for (i = 1; i <= payload_qw_to_copy_with_header; ++i) {
+			OPX_HFI1_BAR_STORE(&scb_payload[i], *buf_qws);
+			buf_qws += 1;
+		}
+		for (i = payload_qw_to_copy_with_header + 1; i <= 7; ++i) {
+			OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD);
+		}
+
+		FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr));
+
+		/* consume one credit for the packet header+payload */
+		--total_credits_available;
+		FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state);
+#ifndef NDEBUG
+		consumed_credits = 2;
+#endif
+	}
+	/* Copy full blocks of payload */
+	while (payload_credits_needed) {
 		volatile uint64_t * scb_payload =
 			FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state);
@@ -1563,8 +1743,8 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service
 			OPX_HFI1_BAR_STORE(&scb_payload[6], buf_qws[6]);
 			OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]);
-			scb_payload += 8;
-			buf_qws += 8;
+			scb_payload += FI_OPX_CACHE_LINE_QWS;
+			buf_qws += FI_OPX_CACHE_LINE_QWS;
 		}
 		payload_credits_needed -= contiguous_full_blocks_to_write;
@@ -1572,6 +1752,53 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service
 		FI_OPX_HFI1_CONSUME_CREDITS(pio_state, contiguous_full_blocks_to_write);
 #ifndef NDEBUG
 		consumed_credits += contiguous_full_blocks_to_write;
+#endif
+	}
+	/* Store last partial 64-byte block of payload */
+	if (last_partial_block != 0) {
+
+		int16_t payload_tail_bytes = (payload_bytes_to_copy & 0x3Ful); /* not icrc/pad */
+
+		/* We have a credit so we don't have to worry about this wrapping on one block */
+		volatile uint64_t * scb_payload =
+			FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state);
+
+		uint16_t i = 0;
+		for ( ; payload_tail_bytes >= 8; payload_tail_bytes -= 8) {
+			OPX_HFI1_BAR_STORE(scb_payload, *buf_qws);
+			scb_payload += 1;
+			buf_qws += 1;
+			i++;
+		}
+
+		/* LRH packets are dword (9B) or qword (16B) aligned */
+		assert((payload_tail_bytes == 4) || (payload_tail_bytes == 0));
+		if (hfi1_type != OPX_HFI1_JKR) {
+			if (payload_tail_bytes) {
+				OPX_HFI1_BAR_STORE(scb_payload, ((*buf_qws)));
+				scb_payload += 1;
+				i++;
+			}
+		} else {
+			/* QWORD aligned for 16B */
+			assert(payload_tail_bytes == 0);
+			/* Have not yet stored icrc/pad */
+			assert(i < 8);
+		}
+		/* Pad out the cacheline/block */
+		for (; i < 8; i++) {
+			OPX_HFI1_BAR_STORE(scb_payload, OPX_JKR_16B_PAD_QWORD);
+			scb_payload += 1;
+		}
+
+		FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr));
+
+		/* consume one credit for the tail partial block payload */
+		--total_credits_available;
+		FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state);
+#ifndef NDEBUG
+		consumed_credits += 1;
 #endif
 	}
@@ -1579,10 +1806,11 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service
 	assert(consumed_credits == total_credits_needed);
 #endif
-	FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(service->tx.hfi1.pio_credits_addr);
+	FI_OPX_HFI1_UPDATE_CREDITS(pio_state, service->tx.hfi1.pio_credits_addr);
 	/* save the updated txe state */
service->tx.hfi1.pio_state->qw0 = pio_state.qw0; + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(service->tx.hfi1.pio_credits_addr); return FI_SUCCESS; } @@ -1602,7 +1830,7 @@ ssize_t fi_opx_reliability_pio_replay (union fi_opx_reliability_deferred_work *w if (params->replays[i]->acked) { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u replay already ACK'd, skipping deferred replay\n", - params->flow_key, FI_OPX_HFI1_PACKET_PSN(¶ms->replays[i]->scb.hdr)); + params->flow_key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(params->replays[i]))); #endif fi_opx_reliability_client_replay_deallocate(&opx_ep->reliability->state, params->replays[i]); params->replays[i] = NULL; @@ -1685,16 +1913,16 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, */ struct fi_opx_reliability_tx_replay * start = head; - uint32_t start_psn = FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr); + uint32_t start_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)); while ((start_psn < psn_start || start->pinned) && (start != tail)) { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) flow__ %016lx rcv nack %lu..%lu Looking for start replay, current start->psn == %u, start->pinned == %d\n", key, psn_start, psn_stop, - FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)), start->pinned); #endif start = start->next; - start_psn = FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr); + start_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)); } if (OFI_UNLIKELY(start_psn < psn_start || start_psn > psn_stop || start->pinned)) { @@ -1733,8 +1961,8 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, struct fi_opx_reliability_tx_replay * stop = start; const uint64_t max = (uint64_t) MIN(OPX_RELIABILITY_TX_MAX_REPLAYS,OPX_RELIABILITY_RX_MAX_NACK); while ((stop->next != head) && - (FI_OPX_HFI1_PACKET_PSN(&stop->scb.hdr) < FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr)) && - (FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr) <= psn_stop) && + (FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop)) < FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next))) && + (FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next)) <= psn_stop) && (replay_count < max)) { // We won't retransmit pinned replays, so don't count those @@ -1753,7 +1981,7 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) flow__ %016lx rcv nack %08lu..%08lu Replaying PSNs %08u - %08u\n", key, psn_start, psn_stop, start_psn, - (uint32_t)FI_OPX_HFI1_PACKET_PSN(&stop->scb.hdr)); + (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop))); #endif // Turn on throttling for this flow while we catch up on replays start->psn_ptr->psn.nack_count = 1; @@ -1775,11 +2003,11 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, if (!queing_replays) { #ifdef OPX_DEBUG_COUNTERS_RELIABILITY struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - if(replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { + if(FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(OPX_REPLAY_HDR(replay)->bth.opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { + } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_cts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { + } else if 
(OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rzv); } #endif @@ -1819,11 +2047,11 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, } #ifdef OPX_DEBUG_COUNTERS_RELIABILITY struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - if(replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { + if(FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(OPX_REPLAY_HDR(replay)->bth.opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { + } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_cts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { + } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rzv); } #endif @@ -1843,10 +2071,16 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RELI_RX_NACK"); } +enum opx_reliability_ping_result { + OPX_RELIABILITY_PING_NO_REPLAYS = -1, + OPX_RELIABILITY_PING_NO_CREDITS, //NO_CREDITS = 0 to make the if statements in ping_remote clean + OPX_RELIABILITY_PING_SENT +}; + __OPX_FORCE_INLINE__ -uint64_t fi_opx_reliability_send_ping(struct fid_ep *ep, +ssize_t fi_opx_reliability_send_ping(struct fid_ep *ep, struct fi_opx_reliability_service * service, - RbtIterator itr) + RbtIterator itr, uint64_t key_value) { OPX_TRACER_TRACE_RELI(OPX_TRACER_BEGIN, "RELI_SEND_PING"); struct fi_opx_reliability_tx_replay ** value_ptr = @@ -1856,22 +2090,23 @@ uint64_t fi_opx_reliability_send_ping(struct fid_ep *ep, if (OFI_UNLIKELY(head == NULL)) { OPX_TRACER_TRACE_RELI(OPX_TRACER_END_ERROR, "RELI_SEND_PING"); - return 0; + return OPX_RELIABILITY_PING_NO_REPLAYS; } - const union fi_opx_reliability_service_flow_key key = { - .slid = (uint32_t)head->scb.hdr.stl.lrh.slid, - .tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(&head->scb.hdr), - .dlid = (uint32_t)head->scb.hdr.stl.lrh.dlid, - .rx = (uint32_t)head->scb.hdr.stl.bth.rx, - }; + uint64_t dlid; + /* Inlined but called from non-inlined functions with no const hfi1 type, so just use the runtime check */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + dlid = (uint64_t) head->scb.scb_9B.hdr.lrh_9B.dlid; + } else { + dlid = (uint64_t) htons(head->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | head->scb.scb_16B.hdr.lrh_16B.dlid); + } - const uint64_t dlid = (uint64_t)head->scb.hdr.stl.lrh.dlid; const uint64_t rx = (uint64_t)head->target_reliability_rx; // psn_start will always be 24-bit max number here - uint64_t psn_start = FI_OPX_HFI1_PACKET_PSN(&head->scb.hdr); - uint64_t psn_stop = FI_OPX_HFI1_PACKET_PSN(&head->prev->scb.hdr); + uint64_t psn_start = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(head)); + uint64_t psn_stop = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(head->prev)); + // if the PSN of the tail is less than the PSN of the head, the // PSN has rolled over. In that case, truncate the ping range @@ -1880,26 +2115,35 @@ uint64_t fi_opx_reliability_send_ping(struct fid_ep *ep, // Send one ping to cover the entire replay range. 
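/*
 * The enum above is why fi_reliability_service_ping_remote() can test the
 * return code directly: OPX_RELIABILITY_PING_NO_CREDITS is deliberately 0, so
 * "if (!rc)" means the inject ran out of credits, while NO_REPLAYS (-1) and
 * SENT (1) both keep the loop walking the tree. The single ping below covers
 * the PSN span of the whole replay ring; because PSNs are 24-bit, a wrapped
 * tail makes psn_stop < psn_start and the range is truncated, per the comment
 * above. A standalone sketch of that truncation, with assumed names:
 */
#include <stdint.h>

#define OPX_PSN_MAX_SKETCH 0x00FFFFFFul /* assumed 24-bit PSN limit */

static uint64_t opx_ping_psn_count(uint64_t psn_start, uint64_t psn_stop)
{
	if (psn_stop < psn_start)              /* tail rolled over past 2^24 */
		psn_stop = OPX_PSN_MAX_SKETCH; /* truncate; the wrapped span is pinged on a later pass */
	return psn_stop - psn_start + 1;
}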
ssize_t rc = fi_opx_hfi1_tx_reliability_inject(ep, - key.value, dlid, rx, + key_value, dlid, rx, psn_start, psn_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING, + OPX_HFI1_TYPE); - INC_PING_STAT_COND(rc == FI_SUCCESS, PINGS_SENT, key.value, psn_start, psn_count); + INC_PING_STAT_COND(rc == FI_SUCCESS, PINGS_SENT, key_value, psn_start, psn_count); OPX_TRACER_TRACE_RELI(OPX_TRACER_END_SUCCESS, "RELI_SEND_PING"); - return (rc == FI_SUCCESS) ? 0 : key.value; + + if(rc){ + return OPX_RELIABILITY_PING_NO_CREDITS; + } + + return OPX_RELIABILITY_PING_SENT; } void fi_reliability_service_ping_remote (struct fid_ep *ep, struct fi_opx_reliability_service * service) { - /* for each flow in the rbtree ... */ RbtIterator start_key_itr; RbtIterator itr; - uint64_t fail_key = 0; + uint64_t key_value = 0; + ssize_t rc = OPX_RELIABILITY_PING_SENT; + uint16_t num_pings = 0; + uint16_t max_pings = service->tx.congested_flag ? service->tx.max_congested_pings : service->tx.max_uncongested_pings; + uint64_t start_key = service->tx.ping_start_key; if (start_key) { itr = fi_opx_rbt_find(service->tx.flow, (void*)start_key); @@ -1910,41 +2154,82 @@ void fi_reliability_service_ping_remote (struct fid_ep *ep, } /* Loop until we hit the end of the tree, or we fail on a particular ping */ - while (itr && !fail_key) { + while (itr && rc && num_pings < max_pings) { + fi_opx_rbt_key(itr, &key_value); - fail_key = fi_opx_reliability_send_ping(ep, service, itr); + rc = fi_opx_reliability_send_ping(ep, service, itr, key_value); /* advance to the next dlid */ itr = rbtNext(service->tx.flow, itr); + + if(rc == OPX_RELIABILITY_PING_SENT) { + ++num_pings; + } } - /* We failed on a particular ping. Store the failing key to be the first to try next time, and stop */ - if (fail_key) { - service->tx.ping_start_key = fail_key; + /* We ran out of credits on a particular ping. + * Store the failing key to be the first to try next time, + * set the congested flag to limit future pings, and stop */ + if (!rc) { + service->tx.congested_flag = 1; + service->tx.ping_start_key = key_value; + return; + } + + // We sent the max number of pings this round, save the next key and stop + if (num_pings == max_pings) { + if (itr) { + fi_opx_rbt_key(itr, &key_value); + service->tx.ping_start_key = key_value; + return; + } + service->tx.ping_start_key = 0; return; } /* We hit the end of the tree. If there was no starting key, we've iterated through the whole tree and we're done. 
*/ if (!start_key) { + // Unset the congested flag + service->tx.congested_flag = 0; return; } /* Wrap back around from the beginning of the tree and iterate until we've hit the starting key */ itr = rbtBegin(service->tx.flow); - while (itr && itr != start_key_itr && !fail_key) { + while (itr && itr != start_key_itr && rc && num_pings < max_pings) { + fi_opx_rbt_key(itr, &key_value); - fail_key = fi_opx_reliability_send_ping(ep, service, itr); + rc = fi_opx_reliability_send_ping(ep, service, itr, key_value); /* advance to the next dlid */ itr = rbtNext(service->tx.flow, itr); + + if(rc == OPX_RELIABILITY_PING_SENT) { + ++num_pings; + } } - if (fail_key) { - service->tx.ping_start_key = fail_key; - } else { + if (!rc) { + service->tx.congested_flag = 1; + service->tx.ping_start_key = key_value; + return; + } + + if (num_pings == max_pings) { + if(itr){ + fi_opx_rbt_key(itr, &key_value); + service->tx.ping_start_key = key_value; + return; + } service->tx.ping_start_key = 0; + return; } + + service->tx.ping_start_key = 0; + + // We iterated through the whole tree, unset the congested flag + service->tx.congested_flag = 0; } void fi_opx_reliability_service_process_pending (struct fi_opx_reliability_service * service) @@ -2124,62 +2409,105 @@ void * pthread_start_routine (void * arg) { } #endif -uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * service, - uuid_t unique_job_key, - struct fi_opx_hfi1_context * hfi1, - const enum ofi_reliability_kind reliability_kind) + +void fi_opx_reliability_model_init_16B(struct fi_opx_reliability_service * service, + struct fi_opx_hfi1_context * hfi1) { - uint8_t origin_reliability_rx = (uint8_t)-1; + /* Ping model */ + { + /* PBC */ + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 ; /* ICRC/tail */ - if (OFI_RELIABILITY_KIND_OFFLOAD == reliability_kind) { - assert (hfi1 == NULL); + /* Setup the 16B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = OPX_HFI1_JKR; - service->reliability_kind = reliability_kind; + service->tx.hfi1.ping_model_16B.qw0 = OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; - /* - * open the hfi1 context, determines JKR or WFR - */ - service->context = fi_opx_hfi1_context_open(NULL, unique_job_key); - FI_INFO(fi_opx_global.prov, FI_LOG_EP_DATA, - "Opened hfi %p, HFI type %#X/%#X, unit %#X, port %#X, ref_cnt %#lX," - " rcv ctxt %#X, send ctxt %#X, \n", - service->context, service->context->hfi_hfi1_type, OPX_HFI1_TYPE, - service->context->hfi_unit, service->context->hfi_port, - service->context->ref_cnt, - service->context->ctrl->ctxt_info.ctxt, - service->context->ctrl->ctxt_info.send_ctxt); + /* LRH */ + /* (LRH QW) does not include pbc (8 bytes) */ + const uint32_t packetLength = (pbc_dws - 2) * 4; + const uint32_t lrh_qws = (packetLength >> 3) + + ((packetLength & 0x07u) != 0); + + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.qw[0] = 0UL; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.qw[1] = 0UL; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.pktlen = lrh_qws; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.sc = hfi1->sc; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.entropy = 0; + 
service->tx.hfi1.ping_model_16B.hdr.lrh_16B.lt = 0; // need to add env variable to change + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.l2 = OPX_PBC_JKR_L2TYPE_16B; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.l4 = 9; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.rc = OPX_RC_IN_ORDER_0; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.cspec = OPX_BTH_CSPEC_DEFAULT; /*NOT BTH CSPEC*/ + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.pkey = hfi1->pkey; + + + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.slid = hfi1->lid & 0xFFFFF; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.slid20 = (hfi1->lid) >> 20; - assert (service->context != NULL); + /* BTH */ + service->tx.hfi1.ping_model_16B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_UD; + service->tx.hfi1.ping_model_16B.hdr.bth.bth_1 = 0; + service->tx.hfi1.ping_model_16B.hdr.bth.pkey = hfi1->pkey; + service->tx.hfi1.ping_model_16B.hdr.bth.ecn = (uint8_t)(OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + service->tx.hfi1.ping_model_16B.hdr.bth.qp = hfi1->bthqp; + service->tx.hfi1.ping_model_16B.hdr.bth.unused = 0; + service->tx.hfi1.ping_model_16B.hdr.bth.rx = 0; /* set at runtime */ + service->tx.hfi1.ping_model_16B.hdr.bth.psn = 0; - hfi1 = service->context; - init_hfi1_rxe_state(hfi1, &service->rx.hfi1.state); + /* KDETH */ + service->tx.hfi1.ping_model_16B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; + service->tx.hfi1.ping_model_16B.hdr.kdeth.jkey = hfi1->jkey; + service->tx.hfi1.ping_model_16B.hdr.kdeth.hcrc = 0; + service->tx.hfi1.ping_model_16B.hdr.kdeth.unused = 0; - service->lid_be = (uint32_t)htons(hfi1->lid); + /* reliability service */ + union opx_hfi1_packet_hdr * hdr = &service->tx.hfi1.ping_model_16B.hdr; - /* - * COPY the rx static information from the hfi context structure. - * This is to improve cache layout. 
- */ - service->rx.hfi1.hdrq.rhf_base = hfi1->info.rxe.hdrq.rhf_base; - service->rx.hfi1.hdrq.head_register = hfi1->info.rxe.hdrq.head_register; - service->rx.hfi1.egrq.base_addr = hfi1->info.rxe.egrq.base_addr; - service->rx.hfi1.egrq.elemsz = hfi1->info.rxe.egrq.elemsz; - service->rx.hfi1.egrq.last_egrbfr_index = 0; - service->rx.hfi1.egrq.head_register = hfi1->info.rxe.egrq.head_register; + hdr->ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING; + hdr->service.origin_reliability_rx = hfi1->info.rxe.id; + hdr->service.range_count = 0; + hdr->service.unused = 0; + hdr->service.psn_count = 0; + hdr->service.psn_start = 0; + hdr->service.key = 0; + } - /* the 'state' fields will change after every tx operation */ - service->tx.hfi1.pio_state = &hfi1->state.pio; + /* 'ack' pio send model */ + { + service->tx.hfi1.ack_model_16B = service->tx.hfi1.ping_model_16B; + service->tx.hfi1.ack_model_16B.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK; + } - /* the 'info' fields do not change; the values can be safely copied */ - service->tx.hfi1.pio_scb_sop_first = hfi1->info.pio.scb_sop_first; - service->tx.hfi1.pio_scb_first = hfi1->info.pio.scb_first; - service->tx.hfi1.pio_credits_addr = hfi1->info.pio.credits_addr; + /* 'nack' pio send model */ + { + service->tx.hfi1.nack_model_16B = service->tx.hfi1.ping_model_16B; + service->tx.hfi1.nack_model_16B.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK; + } +} - origin_reliability_rx = hfi1->info.rxe.id; +uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * service, + uuid_t unique_job_key, + struct fi_opx_hfi1_context * hfi1, + const enum ofi_reliability_kind reliability_kind) +{ + uint8_t origin_reliability_rx = (uint8_t)-1; - } else if (OFI_RELIABILITY_KIND_ONLOAD == reliability_kind) { + if (OFI_RELIABILITY_KIND_ONLOAD == reliability_kind) { assert(hfi1 != NULL); @@ -2223,42 +2551,44 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser 3 + /* bth */ 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - service->tx.hfi1.ping_model.qw0 = OPX_PBC_LEN(pbc_dws) | - OPX_PBC_VL(hfi1->vl) | - OPX_PBC_SC(hfi1->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0) | - OPX_PBC_PORTIDX(hfi1->hfi_port) | - OPX_PBC_SCTXT(hfi1->send_ctxt); + /* Setup the 9B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = (OPX_HFI1_TYPE & OPX_HFI1_WFR) ? 
OPX_HFI1_WFR : OPX_HFI1_JKR_9B; + + service->tx.hfi1.ping_model_9B.qw0 = OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type); /* LRH */ - service->tx.hfi1.ping_model.hdr.stl.lrh.flags = + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.flags = htons(FI_OPX_HFI1_LRH_BTH | ((hfi1->sl & FI_OPX_HFI1_LRH_SL_MASK) << FI_OPX_HFI1_LRH_SL_SHIFT) | ((hfi1->sc & FI_OPX_HFI1_LRH_SC_MASK) << FI_OPX_HFI1_LRH_SC_SHIFT)); - service->tx.hfi1.ping_model.hdr.stl.lrh.dlid = 0; /* set at runtime */ - service->tx.hfi1.ping_model.hdr.stl.lrh.pktlen = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - service->tx.hfi1.ping_model.hdr.stl.lrh.slid = htons(hfi1->lid); + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.dlid = 0; /* set at runtime */ + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.pktlen = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.slid = htons(hfi1->lid); /* BTH */ - service->tx.hfi1.ping_model.hdr.stl.bth.opcode = FI_OPX_HFI_BTH_OPCODE_UD; - service->tx.hfi1.ping_model.hdr.stl.bth.bth_1 = 0; - service->tx.hfi1.ping_model.hdr.stl.bth.pkey = htons(hfi1->pkey); - service->tx.hfi1.ping_model.hdr.stl.bth.ecn = (uint8_t) (OPX_BTH_RC2(OPX_BTH_RC2_VAL) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT)); - service->tx.hfi1.ping_model.hdr.stl.bth.qp = hfi1->bthqp; - service->tx.hfi1.ping_model.hdr.stl.bth.unused = 0; - service->tx.hfi1.ping_model.hdr.stl.bth.rx = 0; /* set at runtime */ + service->tx.hfi1.ping_model_9B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_UD; + service->tx.hfi1.ping_model_9B.hdr.bth.bth_1 = 0; + service->tx.hfi1.ping_model_9B.hdr.bth.pkey = htons(hfi1->pkey); + service->tx.hfi1.ping_model_9B.hdr.bth.ecn = (uint8_t) (OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + service->tx.hfi1.ping_model_9B.hdr.bth.qp = hfi1->bthqp; + service->tx.hfi1.ping_model_9B.hdr.bth.unused = 0; + service->tx.hfi1.ping_model_9B.hdr.bth.rx = 0; /* set at runtime */ /* KDETH */ - service->tx.hfi1.ping_model.hdr.stl.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; - service->tx.hfi1.ping_model.hdr.stl.kdeth.jkey = hfi1->jkey; - service->tx.hfi1.ping_model.hdr.stl.kdeth.hcrc = 0; - service->tx.hfi1.ping_model.hdr.stl.kdeth.unused = 0; + service->tx.hfi1.ping_model_9B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; + service->tx.hfi1.ping_model_9B.hdr.kdeth.jkey = hfi1->jkey; + service->tx.hfi1.ping_model_9B.hdr.kdeth.hcrc = 0; + service->tx.hfi1.ping_model_9B.hdr.kdeth.unused = 0; /* reliability service */ - union fi_opx_hfi1_packet_hdr * hdr = - (union fi_opx_hfi1_packet_hdr *)&service->tx.hfi1.ping_model.hdr; + union opx_hfi1_packet_hdr * hdr = &service->tx.hfi1.ping_model_9B.hdr; hdr->ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING; @@ -2272,14 +2602,14 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser /* 'ack' pio send model */ { - service->tx.hfi1.ack_model = service->tx.hfi1.ping_model; - service->tx.hfi1.ack_model.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK; + service->tx.hfi1.ack_model_9B = service->tx.hfi1.ping_model_9B; + service->tx.hfi1.ack_model_9B.hdr.ud.opcode = 
FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK; } /* 'nack' pio send model */ { - service->tx.hfi1.nack_model = service->tx.hfi1.ping_model; - service->tx.hfi1.nack_model.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK; + service->tx.hfi1.nack_model_9B = service->tx.hfi1.ping_model_9B; + service->tx.hfi1.nack_model_9B.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK; } @@ -2354,6 +2684,48 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser service->usec_next = fi_opx_timer_next_event_usec(&service->tx.timer, &service->tx.timestamp, service->usec_max); + /* + * Initialize send ping flag(s) + * + * ONLOAD only + */ + service->tx.congested_flag = 0; + + /* + * Maximum number of reliability pings per timer in congested/uncongested scenarios + * + * OFFLOAD and ONLOAD + */ + int max_uncongested_pings; + if(fi_param_get_int(fi_opx_global.prov, "reliability_max_uncongested_pings", &max_uncongested_pings) == FI_SUCCESS) { + if (max_uncongested_pings < OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN || max_uncongested_pings > OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX) { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS has value %d which is outside the valid range of %d-%d. Using default rate of %d\n", max_uncongested_pings, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT); + max_uncongested_pings = OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT; + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Using environment-specified FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS of %d\n", max_uncongested_pings); + } + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS not specified, using default value of %d\n", OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT); + max_uncongested_pings = OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT; + } + service->tx.max_uncongested_pings = max_uncongested_pings; + + int max_congested_pings; + if(fi_param_get_int(fi_opx_global.prov, "reliability_max_congested_pings", &max_congested_pings) == FI_SUCCESS) { + if (max_congested_pings < OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN || max_congested_pings > OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX) { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS has value %d which is outside the valid range of %d-%d. 
Using default rate of %d\n", max_congested_pings, OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT); + max_congested_pings = OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT; + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Using environment-specified FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS of %d\n", max_congested_pings); + } + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS not specified, using default value of %d\n", OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT); + max_congested_pings = OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT; + } + service->tx.max_congested_pings = max_congested_pings; + /* * Maximum number of commands to process from atomic fifo before * stopping to do something else @@ -2612,7 +2984,7 @@ void fi_opx_reliability_client_init (struct fi_opx_reliability_client_state * st const uint8_t rx, const uint8_t tx, void (*process_fn)(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_reliability_rx)) { @@ -2733,14 +3105,30 @@ void fi_opx_reliability_client_fini (struct fi_opx_reliability_client_state * st __OPX_FORCE_INLINE__ struct fi_opx_reliability_rx_uepkt *fi_opx_reliability_allocate_uepkt(struct fi_opx_reliability_service *service, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes_to_copy) { struct fi_opx_reliability_rx_uepkt * tmp = ofi_buf_alloc(service->uepkt_pool); assert(tmp); - - memcpy((void*)&tmp->hdr, hdr, sizeof(union fi_opx_hfi1_packet_hdr)); + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + tmp->hdr.qw_9B[0] = hdr->qw_9B[0]; + tmp->hdr.qw_9B[1] = hdr->qw_9B[1]; + tmp->hdr.qw_9B[2] = hdr->qw_9B[2]; + tmp->hdr.qw_9B[3] = hdr->qw_9B[3]; + tmp->hdr.qw_9B[4] = hdr->qw_9B[4]; + tmp->hdr.qw_9B[5] = hdr->qw_9B[5]; + tmp->hdr.qw_9B[6] = hdr->qw_9B[6]; + } else { + tmp->hdr.qw_16B[0] = hdr->qw_16B[0]; + tmp->hdr.qw_16B[1] = hdr->qw_16B[1]; + tmp->hdr.qw_16B[2] = hdr->qw_16B[2]; + tmp->hdr.qw_16B[3] = hdr->qw_16B[3]; + tmp->hdr.qw_16B[4] = hdr->qw_16B[4]; + tmp->hdr.qw_16B[5] = hdr->qw_16B[5]; + tmp->hdr.qw_16B[6] = hdr->qw_16B[6]; + tmp->hdr.qw_16B[7] = hdr->qw_16B[7]; + } if (payload && payload_bytes_to_copy > 0) memcpy((void*)&tmp->payload[0], (const void *)payload, payload_bytes_to_copy); @@ -2750,12 +3138,22 @@ struct fi_opx_reliability_rx_uepkt *fi_opx_reliability_allocate_uepkt(struct fi_ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * state, uint64_t slid, uint64_t origin_tx, uint32_t psn, - struct fid_ep *ep, const union fi_opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload) + struct fid_ep *ep, const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, + const uint16_t pktlen, const enum opx_hfi1_type hfi1_type) { /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy, payload_bytes_to_copy; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + 
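/*
 * Unit note for the conversions below: a 9B LRH reports pktlen as a
 * big-endian count of 4-byte words covering header + payload + ICRC, while
 * the 16B path is handed pktlen as a count of 8-byte quadwords that includes
 * the trailing tail/ICRC quadword. A standalone sketch of both conversions;
 * the 56- and 64-byte header sizes are assumptions standing in for the
 * sizeof() expressions used in the patch.
 */
#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h> /* ntohs */

static size_t opx_payload_bytes_9B(uint16_t pktlen_be)
{
	size_t total = ((size_t)ntohs(pktlen_be) - 1) * 4; /* drop the ICRC word */
	return total - 56; /* assumed 9B packet header size (7 quadwords) */
}

static size_t opx_payload_bytes_16B(uint16_t pktlen_qws)
{
	size_t total = ((size_t)pktlen_qws - 1) * 8; /* drop the tail/ICRC quadword */
	return total - 64; /* assumed 16B packet header size (8 quadwords) */
}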
total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + } else { + lrh_pktlen_le = pktlen; + total_bytes_to_copy = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing tail/icrc QW */ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + } union fi_opx_reliability_service_flow_key key; key.slid = slid; @@ -2796,7 +3194,7 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s state->rx, psn - state->service->preemptive_ack_rate + 1, /* psn_start */ state->service->preemptive_ack_rate, /* psn_count */ - hdr, origin_rx); + hdr, origin_rx, slid, hfi1_type); } next_psn += 1; @@ -2825,7 +3223,7 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s state->rx, psn - state->service->preemptive_ack_rate + 1, /* psn_start */ state->service->preemptive_ack_rate, /* psn_count */ - hdr, origin_rx); + hdr, origin_rx, slid, hfi1_type); } ++next_psn; @@ -2881,7 +3279,8 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s origin_rx, psn, /* psn_start */ 1, /* psn_count */ - FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_ACKS_SENT, key.value, psn, 1); return; @@ -2919,7 +3318,8 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s origin_rx, next_psn, nack_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_NACKS_SENT, key.value, next_psn, nack_count); #endif #ifdef OPX_RELIABILITY_DEBUG @@ -2980,7 +3380,8 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s origin_rx, nack_start_psn, nack_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_NACKS_SENT, key.value, next_psn, nack_count); } @@ -3172,7 +3573,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_shm (struct fid_ep *ep, * The rank_inst field has been deprecated and will be phased out. * The value is always zero. 
*/ - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, hfi1_unit, u8_reliability_rx, &pos, true, u32_reliability_rx, 0, &rc); @@ -3181,22 +3582,32 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_shm (struct fid_ep *ep, const uint64_t lrh_dlid = dlid << 16; const uint64_t bth_rx = u8_reliability_rx << 56; - struct fi_opx_hfi1_txe_scb model = opx_ep->reliability->service.tx.hfi1.ping_model; - model.hdr.ud.opcode = opcode; - - hdr->qw[0] = model.hdr.qw[0] | lrh_dlid; - - hdr->qw[1] = model.hdr.qw[1] | bth_rx; - - hdr->qw[2] = model.hdr.qw[2]; - - hdr->qw[3] = model.hdr.qw[3]; - - hdr->qw[4] = model.hdr.qw[4]; - - hdr->qw[5] = model.hdr.qw[5]; - // hdr->qw[6] - hdr->service.key = key; + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + + struct fi_opx_hfi1_txe_scb_9B model = opx_ep->reliability->service.tx.hfi1.ping_model_9B; + model.hdr.ud.opcode = opcode; + hdr->qw_9B[0] = model.hdr.qw_9B[0] | lrh_dlid; + hdr->qw_9B[1] = model.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = model.hdr.qw_9B[2]; + hdr->qw_9B[3] = model.hdr.qw_9B[3]; + hdr->qw_9B[4] = model.hdr.qw_9B[4]; + hdr->qw_9B[5] = model.hdr.qw_9B[5]; + // hdr->qw[6] + hdr->service.key = key; + } else { + struct fi_opx_hfi1_txe_scb_16B model = opx_ep->reliability->service.tx.hfi1.ping_model_16B; + model.hdr.ud.opcode = opcode; + + hdr->qw_16B[0] = model.hdr.qw_16B[0] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B); + hdr->qw_16B[1] = model.hdr.qw_16B[1] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B); + hdr->qw_16B[2] = model.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = model.hdr.qw_16B[3]; + hdr->qw_16B[4] = model.hdr.qw_16B[4]; + hdr->qw_16B[5] = model.hdr.qw_16B[5]; + hdr->qw_16B[6] = model.hdr.qw_16B[6]; + hdr->service.key = key; /* qw[7] */ + } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); @@ -3297,7 +3708,7 @@ void fi_opx_reliability_resynch_tx_flow_reset (struct fi_opx_ep *opx_ep, do { #ifdef OPX_RELIABILITY_DEBUG - fprintf(stderr, "(tx) packet %016lx %08u retired.\n", tx_key.value, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + fprintf(stderr, "(tx) packet %016lx %08u retired.\n", tx_key.value, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif next = tmp->next; @@ -3324,7 +3735,7 @@ void fi_opx_reliability_resynch_tx_flow_reset (struct fi_opx_ep *opx_ep, void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, uint32_t origin_reliability_rx, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); struct fi_opx_reliability_client_state * state = &opx_ep->reliability->state; @@ -3348,7 +3759,7 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, * Reset all SHM related reliability protocol data retained by this * Server EP about the remote Client EP. 
*/ - if (fi_opx_hfi_is_intranode(rx_key.slid)) { + if (opx_lid_is_intranode(rx_key.slid)) { /* Record completion of the resynch request for the remote Client EP */ opx_ep->rx->shm.resynch_connection[origin_reliability_rx].completed = true; opx_ep->rx->shm.resynch_connection[origin_reliability_rx].counter++; @@ -3479,14 +3890,15 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, void fi_opx_hfi1_rx_reliability_ack_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); union fi_opx_reliability_service_flow_key rx_key = { .value = hdr->service.key }; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FLOW KEY slid %x/%x dlid %x/%x, key.value %#lx\n",rx_key.slid,ntohs(rx_key.slid),rx_key.dlid,ntohs(rx_key.dlid), rx_key.value); #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(rx) %s Client flow__ %016lx rcv resynch ack\n", - (fi_opx_hfi_is_intranode(rx_key.dlid)) ? "SHM -" : "", + (opx_lid_is_intranode(rx_key.dlid)) ? "SHM -" : "", rx_key.value); #endif @@ -3507,7 +3919,7 @@ void fi_opx_hfi1_rx_reliability_ack_resynch (struct fid_ep *ep, #ifdef OPX_RELIABILITY_DEBUG else { fprintf(stderr, "Warning, (rx) %s Client flow__ %016lx rcv resynch ack; not found.\n", - (fi_opx_hfi_is_intranode(rx_key.dlid)) ? "SHM -" : "", + (opx_lid_is_intranode(rx_key.dlid)) ? "SHM -" : "", rx_key.value); } #endif @@ -3522,9 +3934,17 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); ssize_t rc = FI_SUCCESS; bool inject_done = false; + uint32_t slid; + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = opx_ep->tx->send_9B.hdr.lrh_9B.slid; + } else { + slid = ntohs(opx_ep->tx->send_9B.hdr.lrh_16B.slid20 << 20 | opx_ep->tx->send_9B.hdr.lrh_16B.slid); + } union fi_opx_reliability_service_flow_key tx_key = { - .slid = opx_ep->tx->send.hdr.stl.lrh.slid, - .tx = opx_ep->tx->send.hdr.reliability.origin_tx, + .slid = slid, + .tx = opx_ep->tx->send_9B.hdr.reliability.origin_tx, .dlid = dest_addr.uid.lid, .rx = dest_addr.hfi1_rx }; @@ -3554,7 +3974,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->hfi->daos_info.rank, - opx_ep->tx->send.hdr.stl.lrh.slid, + slid, dest_addr.uid.lid); } else { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -3565,7 +3985,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, "(tx) SHM - Extended address not available\n"); } - if ((opx_ep->tx->send.hdr.stl.lrh.slid == dest_addr.uid.lid) && + if ((slid == dest_addr.uid.lid) && opx_ep->daos_info.rank == opx_ep->hfi->daos_info.rank && opx_ep->daos_info.rank_inst == opx_ep->hfi->daos_info.rank_inst) { /* Nothing to do */ @@ -3731,7 +4151,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, fi_opx_timer_next_event_usec(timer, &start, FI_OPX_TIMER_NEXT_EVENT_USEC_DEFAULT); while (compare < next) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); compare = fi_opx_timer_now(timestamp, timer); if (resynch_flow->remote_ep_resynch_completed) { diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 
1dd35647cbb..8be0bb7cdc3 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -38,6 +38,7 @@ #include "rdma/opx/fi_opx_eq.h" #include "rdma/opx/fi_opx.h" #include "rdma/opx/fi_opx_internal.h" +#include "rdma/opx/fi_opx_hfi1_version.h" #include #include @@ -56,40 +57,24 @@ void fi_opx_hit_zero(struct fi_opx_completion_counter *cc) FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== NO COUNTER INCREMENT\n"); } if (cc->cq && cc->context) { - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - opx_context->next = NULL; - opx_context->len = 0; - opx_context->buf = NULL; - opx_context->byte_counter = 0; - opx_context->tag = 0; + cc->context->next = NULL; + cc->context->len = 0; + cc->context->buf = NULL; + cc->context->byte_counter = 0; + cc->context->tag = 0; + assert(cc->context->err_entry.op_context != NULL); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== CQ ENQUEUE COMPLETION\n"); - fi_opx_cq_enqueue_completed(cc->cq, cc->context, 0); + fi_opx_cq_enqueue_completed(cc->cq, cc->context, FI_OPX_LOCK_NOT_REQUIRED); } else { + if (cc->context) { + OPX_BUF_FREE(cc->context); + } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== NO CQ COMPLETION\n"); } OPX_BUF_FREE(cc); } -inline int fi_opx_check_rma(struct fi_opx_ep *opx_ep) -{ - if (!opx_ep) - return -FI_EINVAL; - if (opx_ep->state != FI_OPX_EP_INITITALIZED_ENABLED) - return -FI_EINVAL; - - const enum fi_av_type av_type = opx_ep->av->type; - - if (av_type == FI_AV_UNSPEC) - return -FI_EINVAL; - if (av_type == FI_AV_MAP && opx_ep->av->type != FI_AV_MAP) - return -FI_EINVAL; - if (av_type == FI_AV_TABLE && opx_ep->av->type != FI_AV_TABLE) - return -FI_EINVAL; - - return 0; -} - int fi_opx_do_readv_internal_intranode(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_readv_params *params = &work->readv; @@ -109,43 +94,62 @@ int fi_opx_do_readv_internal_intranode(union fi_opx_hfi1_deferred_work *work) uint64_t pos; /* DAOS support - rank_inst field has been deprecated and will be phased out. 
* The value is always zero.*/ - union fi_opx_hfi1_packet_hdr * tx_hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->opx_target_addr.hfi1_unit, + union opx_hfi1_packet_hdr * hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->opx_target_addr.hfi1_unit, params->dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, 0, &rc); - if (OFI_UNLIKELY(tx_hdr == NULL)) { + if (OFI_UNLIKELY(hdr == NULL)) { return rc; } uint64_t niov = params->niov << 48; uint64_t op64 = params->op << 40; uint64_t dt64 = params->dt << 32; assert(FI_OPX_HFI_DPUT_OPCODE_GET == params->opcode); // double check packet type - assert(params->dt == (FI_VOID - 1) || params->dt < OFI_DATATYPE_LAST); - tx_hdr->qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | params->lrh_dlid | (params->lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | params->bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.cts.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | params->opcode | dt64 | op64 | niov; - tx_hdr->qw[5] = (uintptr_t)params->rma_request; - tx_hdr->qw[6] = params->key; + assert(params->dt == (FI_VOID - 1) || params->dt < FI_DATATYPE_LAST); + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov; + hdr->qw_9B[5] = (uintptr_t)params->rma_request; + hdr->qw_9B[6] = params->key; + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(params->lrh_dlid)); + hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)params->lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | params->bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode | dt64 | op64 | niov; + hdr->qw_16B[6] = (uintptr_t)params->rma_request; + hdr->qw_16B[7] = params->key; + } union fi_opx_hfi1_packet_payload *const tx_payload = - (union fi_opx_hfi1_packet_payload *)(tx_hdr + 1); + (union fi_opx_hfi1_packet_payload *)(hdr + 1); tx_payload->cts.iov[0] = params->dput_iov; - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); return FI_SUCCESS; } int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "DO_READV"); struct fi_opx_hfi1_rx_readv_params *params = &work->readv; struct fi_opx_ep *opx_ep = params->opx_ep; + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, 2); if (OFI_UNLIKELY(credits_available < 2)) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "DO_READV"); return -FI_EAGAIN; } @@ -156,76 +160,117 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) const union fi_opx_addr addr = params->opx_target_addr; psn = 
fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, addr.hfi1_rx, - addr.reliability_rx, &psn_ptr, &replay, params->reliability); + addr.reliability_rx, &psn_ptr, &replay, params->reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "DO_READV"); return -FI_EAGAIN; } volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; uint64_t niov = params->niov << 48; uint64_t op64 = params->op << 40; uint64_t dt64 = params->dt << 32; - uint64_t credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return); + uint64_t credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); assert(FI_OPX_HFI_DPUT_OPCODE_GET == params->opcode); // double check packet type - fi_opx_set_scb(scb, tmp, - opx_ep->rx->tx.cts.qw0 | OPX_PBC_LEN(params->pbc_dws) | credit_return | - params->pbc_dlid, - opx_ep->rx->tx.cts.hdr.qw[0] | params->lrh_dlid | (params->lrh_dws << 32), - opx_ep->rx->tx.cts.hdr.qw[1] | params->bth_rx, - opx_ep->rx->tx.cts.hdr.qw[2] | psn, - opx_ep->rx->tx.cts.hdr.qw[3], - opx_ep->rx->tx.cts.hdr.qw[4] | params->opcode | dt64 | op64 | niov, - (uintptr_t)params->rma_request, - params->key); // key - - /* consume one credit for the packet header */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - - FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - - replay->scb.qw0 = tmp[0]; - replay->scb.hdr.qw[0] = tmp[1]; - replay->scb.hdr.qw[1] = tmp[2]; - replay->scb.hdr.qw[2] = tmp[3]; - replay->scb.hdr.qw[3] = tmp[4]; - replay->scb.hdr.qw[4] = tmp[5]; - replay->scb.hdr.qw[5] = tmp[6]; - replay->scb.hdr.qw[6] = tmp[7]; - - /* write the CTS payload "send control block" */ - volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - - fi_opx_set_scb(scb_payload, tmp, - params->dput_iov.qw[0], - params->dput_iov.qw[1], - params->dput_iov.qw[2], - params->dput_iov.qw[3], - params->dput_iov.qw[4], - params->dput_iov.qw[5], - 0, 0); - - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - replay->payload[0] = tmp[0]; - replay->payload[1] = tmp[1]; - replay->payload[2] = tmp[2]; - replay->payload[3] = tmp[3]; - replay->payload[4] = tmp[4]; - replay->payload[5] = tmp[5]; - replay->payload[6] = tmp[6]; - replay->payload[7] = tmp[7]; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | credit_return | + params->pbc_dlid, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32), + opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[3], + opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov, + (uintptr_t)params->rma_request, + params->key); // key + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); + + /* write the CTS payload "send control block" */ + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + uint64_t temp[8]; + fi_opx_store_and_copy_qw(scb_payload, temp, + params->dput_iov.qw[0], + params->dput_iov.qw[1], + params->dput_iov.qw[2], + params->dput_iov.qw[3], + params->dput_iov.qw[4], + params->dput_iov.qw[5], + 0, 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); 
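/*
 * The branch above relies on a store-and-copy idiom: each quadword is written
 * once to the write-combined PIO send block and simultaneously captured in a
 * local scratch array, which then seeds the reliability replay
 * (fi_opx_copy_hdr9B_cacheline() and replay->payload[]). A retransmit can then
 * rebuild the identical packet without ever reading back from MMIO. A minimal
 * sketch of the idiom; it is simplified in that the provider's
 * fi_opx_store_and_copy_qw() takes the eight quadwords as separate arguments.
 */
#include <stdint.h>

static inline void opx_store_and_copy_block(volatile uint64_t *scb, /* PIO block (MMIO) */
					    uint64_t *copy,          /* snapshot for the replay */
					    const uint64_t qw[8])
{
	for (int i = 0; i < 8; ++i) {
		scb[i] = qw[i];  /* post to the send buffer */
		copy[i] = qw[i]; /* retained for possible retransmit */
	}
}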
+ + replay->payload[0] = temp[0]; + replay->payload[1] = temp[1]; + replay->payload[2] = temp[2]; + replay->payload[3] = temp[3]; + replay->payload[4] = temp[4]; + replay->payload[5] = temp[5]; + replay->payload[6] = temp[6]; + replay->payload[7] = temp[7]; + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(params->lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->rx->tx.cts_16B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | + credit_return | params->pbc_dlid, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)params->lrh_dws << 20), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | params->bth_rx, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[4], + opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode | dt64 | op64 | niov, + (uintptr_t)params->rma_request); + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + uint64_t temp[16] = {0}; + fi_opx_store_and_copy_qw(scb_payload, temp, + params->key, + params->dput_iov.qw[0], + params->dput_iov.qw[1], + params->dput_iov.qw[2], + params->dput_iov.qw[3], + params->dput_iov.qw[4], + params->dput_iov.qw[5], + 0UL); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + local_temp[8] = temp[0]; + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_temp); + + replay->payload[0] = temp[1]; + replay->payload[1] = temp[2]; + replay->payload[2] = temp[3]; + replay->payload[3] = temp[4]; + replay->payload[4] = temp[5]; + replay->payload[5] = temp[6]; + replay->payload[6] = temp[7]; + } fi_opx_reliability_client_replay_register_no_update( &opx_ep->reliability->state, - params->opx_target_addr.uid.lid, params->opx_target_addr.reliability_rx, - params->dest_rx, psn_ptr, replay, params->reliability); - + params->dest_rx, psn_ptr, replay, params->reliability, + OPX_HFI1_TYPE); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "DO_READV"); return FI_SUCCESS; } @@ -234,18 +279,23 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "INJECT_WRITE"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "INJECT_WRITE"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "INJECT_WRITE"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -260,7 +310,8 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, 
FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "INJECT_WRITE"); return -FI_EAGAIN; } @@ -277,10 +328,12 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t const uint64_t is_hmem = fi_opx_hmem_iov_init(buf, len, NULL, &iov); fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr_offset, key, - NULL, cc, FI_VOID, FI_NOOP, + cc, FI_VOID, FI_NOOP, opx_ep->tx->op_flags | FI_INJECT, - is_hmem, lock_required, caps, reliability); + is_hmem, lock_required, caps, reliability, + hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "INJECT_WRITE"); return 0; } @@ -288,13 +341,14 @@ inline ssize_t fi_opx_inject_write_generic(struct fid_ep *ep, const void *buf, s fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -303,20 +357,25 @@ inline ssize_t fi_opx_inject_write_generic(struct fid_ep *ep, const void *buf, s __OPX_FORCE_INLINE__ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, - void *context, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, - const enum ofi_reliability_kind reliability) + void *user_context, int lock_required, + const enum fi_av_type av_type, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITE"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -325,27 +384,39 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_dst_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,dst_addr); + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->initial_byte_count = len; cc->byte_counter = len; cc->cntr = opx_ep->write_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if (opx_context && cc->cq) { - opx_context->flags = FI_RMA | FI_WRITE; - } - cc->hit_zero = fi_opx_hit_zero; struct fi_opx_hmem_iov iov; const uint64_t is_hmem = fi_opx_hmem_iov_init(buf, len, desc, &iov); fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr_offset, key, - (union fi_opx_context *)context, cc, FI_VOID, + cc, FI_VOID, FI_NOOP, opx_ep->tx->op_flags, is_hmem, - lock_required, caps, reliability); + lock_required, caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE"); return 0; } @@ -353,12 +424,13 @@ inline ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t l fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -367,21 +439,26 @@ inline ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t l __OPX_FORCE_INLINE__ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dst_addr, uint64_t addr_offset, - uint64_t key, void *context, int lock_required, + uint64_t key, void *user_context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITEV_INTERNAL"); struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -390,7 +467,23 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_dst_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,dst_addr); + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -399,33 +492,33 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void } cc->initial_byte_count = cc->byte_counter; cc->cntr = opx_ep->write_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_WRITE; - cc->hit_zero = fi_opx_hit_zero; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; for (index = 0; index < count; ++index) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } struct fi_opx_hmem_iov hmem_iov; const uint64_t is_hmem = fi_opx_hmem_iov_init(iov[index].iov_base, iov[index].iov_len, mr_ptr, &hmem_iov); fi_opx_write_internal(opx_ep, &hmem_iov, 1, opx_dst_addr, - addr_offset, key, - (union fi_opx_context *)context, cc, - FI_VOID, FI_NOOP, 0, is_hmem, - lock_required, caps, reliability); + addr_offset, key, cc, FI_VOID, FI_NOOP, + 0, is_hmem, lock_required, caps, + reliability, hfi1_type); addr_offset += iov[index].iov_len; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? 
*mr_ptr_array : NULL; } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITEV_INTERNAL"); return 0; } @@ -433,12 +526,13 @@ inline ssize_t fi_opx_writev_generic(struct fid_ep *ep, const struct iovec *iov, size_t count, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_writev_internal(ep, iov, desc, count, dst_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -486,21 +580,26 @@ void fi_opx_get_daos_av_addr_rank(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags, int lock_required, - const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + uint64_t flags, int lock_required, + const enum fi_av_type av_type, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITEMSG_INTERNAL"); struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -510,7 +609,23 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg const union fi_opx_addr opx_dst_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,msg->addr); fi_opx_get_daos_av_addr_rank(opx_ep, opx_dst_addr); + struct fi_opx_cq *cq = (flags & FI_COMPLETION) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -520,11 +635,8 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg cc->initial_byte_count = cc->byte_counter; cc->cntr = opx_ep->write_cntr; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_WRITE; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; size_t rma_iov_index = 0; @@ -539,19 +651,24 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg uintptr_t msg_iov_vaddr = (uintptr_t)msg->msg_iov[msg_iov_index].iov_base; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)msg->desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; struct fi_opx_hmem_iov iov; - uint64_t is_hmem = fi_opx_hmem_iov_init((void *)msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); while (msg_iov_bytes != 0 && rma_iov_bytes != 0) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } + uint64_t is_hmem = fi_opx_hmem_iov_init((void *)msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); size_t len = (msg_iov_bytes <= rma_iov_bytes) ? msg_iov_bytes : rma_iov_bytes; iov.buf = msg_iov_vaddr; iov.len = len; fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, rma_iov_addr, - rma_iov_key, NULL, cc, FI_VOID, FI_NOOP, 0, + rma_iov_key, cc, FI_VOID, FI_NOOP, 0, is_hmem, lock_required, caps, - reliability); + reliability, hfi1_type); msg_iov_bytes -= len; msg_iov_vaddr += len; @@ -560,8 +677,6 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg ++msg_iov_index; msg_iov_bytes = msg->msg_iov[msg_iov_index].iov_len; msg_iov_vaddr = (uintptr_t)msg->msg_iov[msg_iov_index].iov_base; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; is_hmem = fi_opx_hmem_iov_init((void *)msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); } @@ -576,18 +691,20 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg } } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITEMSG_INTERNAL"); return 0; } inline ssize_t fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - av_type, caps, reliability); + av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -596,19 +713,24 @@ inline ssize_t fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rm __OPX_FORCE_INLINE__ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, - void *context, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability) + void *user_context, int lock_required, const enum fi_av_type av_type, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READ"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); return ret; + } #endif if 
(lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -633,37 +755,51 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des const union fi_opx_addr opx_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,src_addr); fi_opx_get_daos_av_addr_rank(opx_ep, opx_addr); + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = len; cc->initial_byte_count = len; cc->cntr = opx_ep->read_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_READ; - cc->hit_zero = fi_opx_hit_zero; fi_opx_readv_internal(opx_ep, &iov, 1, opx_addr, &addr_offset, &key, - (union fi_opx_context *)context, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, - caps, reliability); + caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READ"); return FI_SUCCESS; } inline ssize_t fi_opx_read_generic(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability) + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_read_internal(ep, buf, len, desc, src_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -672,20 +808,25 @@ inline ssize_t fi_opx_read_generic(struct fid_ep *ep, void *buf, size_t len, voi __OPX_FORCE_INLINE__ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr_offset, - uint64_t key, void *context, int lock_required, + uint64_t key, void *user_context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READV"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ 
-694,7 +835,13 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,src_addr); - union fi_opx_context *opx_context = (union fi_opx_context *)context; + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); + return -FI_ENOMEM; + } + const uint64_t tx_op_flags = opx_ep->tx->op_flags; uint64_t addr_v[8] = { addr_offset, addr_offset, addr_offset, addr_offset, @@ -702,6 +849,15 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, uint64_t key_v[8] = { key, key, key, key, key, key, key, key }; struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -710,9 +866,8 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, } cc->initial_byte_count = cc->byte_counter; cc->cntr = opx_ep->read_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_READ; cc->hit_zero = fi_opx_hit_zero; uint64_t hmem_device; @@ -721,41 +876,51 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, /* max 8 descriptors (iovecs) per readv_internal */ struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; const size_t full_count = count >> 3; for (index = 0; index < full_count; index += 8) { for (int i = 0; i < 8; ++i) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } hmem_iface = fi_opx_hmem_get_iface(iov[index + i].iov_base, mr_ptr, &hmem_device); hmem_iovs[i].buf = (uintptr_t) iov[index + i].iov_base; hmem_iovs[i].len = iov[index + i].iov_len; hmem_iovs[i].iface = hmem_iface; hmem_iovs[i].device = hmem_device; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; } fi_opx_readv_internal(opx_ep, hmem_iovs, 8, opx_addr, addr_v, key_v, - NULL, 0, NULL, NULL, cc, FI_VOID, FI_NOOP, + 0, NULL, NULL, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, - caps, reliability); + caps, reliability, hfi1_type); } /* if 'partial_ndesc' is zero, the fi_opx_readv_internal() will fence */ const size_t partial_ndesc = count & 0x07ull; for (int i = 0; i < partial_ndesc; ++i) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } hmem_iface = fi_opx_hmem_get_iface(iov[index + i].iov_base, mr_ptr, &hmem_device); hmem_iovs[i].buf = (uintptr_t) iov[index + i].iov_base; hmem_iovs[i].len = iov[index + i].iov_len; hmem_iovs[i].iface = hmem_iface; hmem_iovs[i].device = hmem_device; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? 
*mr_ptr_array : NULL; } - fi_opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, opx_addr, addr_v, key_v, - opx_context, tx_op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, - FI_VOID, FI_NOOP, - FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability); + fi_opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, opx_addr, addr_v, + key_v, tx_op_flags, opx_ep->rx->cq, + opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, + FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, + caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READV"); return 0; } @@ -763,12 +928,13 @@ inline ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, size_t count, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_readv(ep, iov, desc, count, src_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -776,38 +942,41 @@ inline ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, __OPX_FORCE_INLINE__ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags, int lock_required, - const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + uint64_t flags, int lock_required, + const enum fi_av_type av_type, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READMSG_INTERNAL"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - /* TODO - should this be a different cq than the one used by tsend, etc? */ - struct fi_opx_cq *cq = opx_ep->tx->cq; - if (((cq != NULL) && - ((cq->bflags & FI_SELECTIVE_COMPLETION) && (flags & FI_COMPLETION) == 0))) { - cq = NULL; - } - - union fi_opx_context *opx_context = (union fi_opx_context *)msg->context; - assert(msg->addr != FI_ADDR_UNSPEC); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_src_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,msg->addr); fi_opx_get_daos_av_addr_rank(opx_ep, opx_src_addr); + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); + return -FI_ENOMEM; + } + /* for fi_read*(), the 'src' is the remote data */ size_t src_iov_index = 0; const size_t src_iov_count = msg->rma_iov_count; @@ -828,6 +997,15 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, ssize_t index; struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); + return -FI_ENOMEM; + } + memset(cc, 0, sizeof(*cc)); cc->byte_counter = 0; for(index=0; index < msg->iov_count; index++) { @@ -843,17 +1021,20 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, assert(totsize == cc->byte_counter); #endif cc->cntr = opx_ep->read_cntr; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; - cc->context = msg->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_READ; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)msg->desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; while (src_iov_index < src_iov_count) { for (niov = 0; niov < 8; ++niov) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } const size_t len = (dst_iov_bytes <= src_iov_bytes) ? dst_iov_bytes : src_iov_bytes; fi_opx_hmem_iov_init(dst_iov_vaddr, len, mr_ptr, &iov[niov]); @@ -882,11 +1063,12 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, #endif fi_opx_readv_internal( opx_ep, iov, niov + 1, opx_src_addr, addr, key, - opx_context, flags, cq, + flags, cq, opx_ep->read_cntr, /* enable_cq, enable_cntr */ cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, - reliability); + reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READMSG_INTERNAL"); return 0; } else { @@ -917,8 +1099,6 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, ++dst_iov_index; dst_iov_bytes = msg->msg_iov[dst_iov_index].iov_len; dst_iov_vaddr = msg->msg_iov[dst_iov_index].iov_base; - ++mr_ptr_array; - mr_ptr = (mr_ptr_present) ? 
*mr_ptr_array : NULL; } } else { dst_iov_vaddr = (void *)((uintptr_t)dst_iov_vaddr + len); @@ -932,14 +1112,16 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, assert(totsize_issued <= totsize); #endif fi_opx_readv_internal(opx_ep, iov, 8, opx_src_addr, addr, key, - NULL, 0, NULL, NULL, /* disable_cq, disable_cntr */ + 0, NULL, NULL, /* disable_cq, disable_cntr */ cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, - caps, reliability); + caps, reliability, hfi1_type); } /* end while */ /* should never get here */ + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); abort(); return 0; @@ -948,12 +1130,13 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, inline ssize_t fi_opx_readmsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_readmsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - av_type, caps, reliability); + av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -966,10 +1149,26 @@ static inline ssize_t fi_opx_rma_read(struct fid_ep *ep, void *buf, size_t len, struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -980,9 +1179,22 @@ static inline ssize_t fi_opx_rma_readmsg(struct fid_ep *ep, const struct fi_msg_ struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 
type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -994,10 +1206,25 @@ static inline ssize_t fi_opx_rma_inject_write(struct fid_ep *ep, const void *buf struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1009,10 +1236,25 @@ static inline ssize_t fi_opx_rma_write(struct fid_ep *ep, const void *buf, size_ struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal 
-FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1024,10 +1266,26 @@ static inline ssize_t fi_opx_rma_writev(struct fid_ep *ep, const struct iovec *i struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1038,10 +1296,25 @@ static inline ssize_t fi_opx_rma_writemsg(struct fid_ep *ep, const struct fi_msg struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1073,32 +1346,44 @@ int fi_opx_init_rma_ops(struct fid_ep *ep, struct fi_info *info) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-function" -FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY) -FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR) 
-#define FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B) -#define FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ops_rma_##LOCK##_##AV##_##CAPS##_##RELIABILITY +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR) -#define FI_OPX_RMA_OPS_STRUCT(LOCK, AV, CAPS, RELIABILITY) \ - static struct fi_ops_rma FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY) = { \ +#define FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) + +#define FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_ops_rma_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE + +#define FI_OPX_RMA_OPS_STRUCT(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + static struct fi_ops_rma FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) = { \ .size = sizeof(struct fi_ops_rma), \ - .read = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, LOCK, AV, CAPS, RELIABILITY), \ + .read = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .readv = fi_no_rma_readv, \ - .readmsg = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, LOCK, AV, CAPS, RELIABILITY), \ - .write = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, LOCK, AV, CAPS, RELIABILITY), \ + .readmsg = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .write = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .inject = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, LOCK, AV, CAPS, \ - RELIABILITY), \ - .writev = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writev, LOCK, AV, CAPS, RELIABILITY), \ + RELIABILITY, HFI1_TYPE), \ + .writev = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writev, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .writemsg = \ - FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writemsg, LOCK, AV, CAPS, RELIABILITY), \ + FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writemsg, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .writedata = fi_no_rma_writedata, \ } -FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY); -FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); #pragma GCC diagnostic pop @@ -1129,15 +1414,47 @@ int fi_opx_enable_rma_ops(struct fid_ep *ep) } const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); - if (!lock_required) { - 
opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, - OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (!lock_required) { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (!lock_required) { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (!lock_required) { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, - OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY); - + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } return 0; @@ -1154,31 +1471,92 @@ ssize_t fi_opx_write_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t le fi_addr_t dest_addr, uint64_t addr_offset, uint64_t key, void *context) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY)(ep, buf, len, desc, dest_addr, - addr_offset, key, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, buf, len, desc, dest_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, buf, len, desc, dest_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, buf, len, desc, dest_addr, + addr_offset, key, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_inject_write_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr_offset, uint64_t key) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, OPX_RELIABILITY)( - ep, buf, len, dest_addr, addr_offset, key); -} + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, len, dest_addr, addr_offset, key); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + 
return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, len, dest_addr, addr_offset, key); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, len, dest_addr, addr_offset, key); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM;} ssize_t fi_opx_read_FABRIC_DIRECT(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY)(ep, buf, len, desc, src_addr, - addr_offset, key, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, buf, len, desc, src_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, buf, len, desc, src_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, buf, len, desc, src_addr, + addr_offset, key, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_readmsg_FABRIC_DIRECT(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY)(ep, msg, flags); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, msg, flags); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/fi_opx_service.c b/prov/opx/src/fi_opx_service.c index 84aa5279c17..30c6341bb4e 100644 --- a/prov/opx/src/fi_opx_service.c +++ b/prov/opx/src/fi_opx_service.c @@ -856,3 +856,64 @@ int opx_hfi_get_hfi1_count() { } return hfi1_count; } + +/** + * @brief Reset the HFI context. + * + * This function resets the HFI context by sending a command to the specified file descriptor. + * The command type is set to OPX_HFI_CMD_CTXT_RESET and the command length and address are set to 0. + * If the command write fails, the function retries if the error is ENOLCK. + * If the error is not EINVAL, an informational message is logged.
+ * + * @param fd The file descriptor to send the command to. + * @return 0 on success, -1 on failure. + */ +int opx_hfi_reset_context(int fd) +{ + struct hfi1_cmd cmd; + + cmd.type = OPX_HFI_CMD_CTXT_RESET; + cmd.len = 0; + cmd.addr = 0; + +retry: + if (opx_hfi_cmd_write(fd, &cmd, sizeof(cmd)) == -1) { + if (errno == ENOLCK) + goto retry; + + if (errno != EINVAL) + _HFI_INFO("reset ctxt failed: %s\n", strerror(errno)); + + return -1; + } + return 0; +} + +/** + * @brief Acknowledge events for the HFI. + * + * This function sends an acknowledgment for events to the HFI. + * + * @param fd The file descriptor for the HFI control. + * @param ackbits The bits to be acknowledged. + * @return 0 on success, -1 on failure. + */ +int opx_hfi_ack_events(int fd, uint64_t ackbits) +{ + struct hfi1_cmd cmd; + + cmd.type = OPX_HFI_CMD_ACK_EVENT; + cmd.len = 0; + cmd.addr = ackbits; + +retry: + if (opx_hfi_cmd_write(fd, &cmd, sizeof(cmd)) == -1) { + if (errno == ENOLCK) + goto retry; + + if (errno != EINVAL) + _HFI_INFO("ack event failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} diff --git a/prov/opx/src/fi_opx_tagged.c b/prov/opx/src/fi_opx_tagged.c index cb8a4b95c18..d2b03c056cc 100644 --- a/prov/opx/src/fi_opx_tagged.c +++ b/prov/opx/src/fi_opx_tagged.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -56,41 +56,46 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const enum ofi_reliability_kind reliability, - const enum fi_progress progress) + const enum fi_progress progress, + const enum opx_hfi1_type hfi1_type) { + assert(!lock_required); + assert(!(flags & FI_MULTI_RECV)); /* Multi-receive incompatible with tagged receives */ + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - union fi_opx_context * opx_context = NULL; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST TRECVMSG\n"); - assert(!lock_required); - assert(!(flags & FI_MULTI_RECV)); /* Multi-receive incompatible with tagged receives */ - assert(msg->context); - assert(((uintptr_t)msg->context & 0x07ull) == 0); /* must be 8 byte aligned */ + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->next = NULL; + context->src_addr = msg->addr; + context->flags = flags; + context->err_entry.err = 0; + context->err_entry.op_context = msg->context; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.posted_recv_tag); if (msg->iov_count == 0) { - opx_context = (union fi_opx_context *) msg->context; - opx_context->next = NULL; - opx_context->src_addr = msg->addr; - opx_context->flags = flags; - opx_context->len = 0; - opx_context->buf = NULL; - opx_context->byte_counter = (uint64_t)-1; + context->len = 0; + context->buf = NULL; + context->byte_counter = (uint64_t)-1; + if ((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM) { /* do not overwrite state from a previous "peek|claim" operation */ - opx_context->tag = msg->tag; - opx_context->ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - 
OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } #ifdef OPX_HMEM @@ -114,118 +119,99 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, #endif if (hmem_iface != FI_HMEM_SYSTEM) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.posted_recv_tag); - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - return -FI_ENOMEM; - } - flags |= FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM; + flags |= FI_OPX_CQ_CONTEXT_HMEM; - ext->err_entry.err = 0; - ext->opx_context.next = NULL; - ext->opx_context.src_addr = msg->addr; - ext->opx_context.flags = flags; - ext->opx_context.byte_counter = (uint64_t)-1; - ext->msg.op_context = msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; + context->byte_counter = (uint64_t)-1; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; if (msg->iov_count == 1) { - ext->opx_context.len = msg->msg_iov[0].iov_len; - ext->opx_context.buf = msg->msg_iov[0].iov_base; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; if ((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM) { /* do not overwrite state from a previous "peek|claim" operation */ - ext->opx_context.tag = msg->tag; - ext->opx_context.ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } } else { assert((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM); /* TODO - why not? */ - ext->opx_context.tag = msg->tag; - ext->opx_context.ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, flags, OPX_HMEM_TRUE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } #endif if (msg->iov_count == 1) { - opx_context = (union fi_opx_context *) msg->context; - opx_context->next = NULL; - opx_context->src_addr = msg->addr; - opx_context->flags = flags; - opx_context->len = msg->msg_iov[0].iov_len; - opx_context->buf = msg->msg_iov[0].iov_base; - opx_context->byte_counter = (uint64_t)-1; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; + context->byte_counter = (uint64_t)-1; if ((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM) { /* do not overwrite state from a previous "peek|claim" operation */ - opx_context->tag = msg->tag; - opx_context->ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } assert((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM); /* TODO - why not? 
*/ - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - return -FI_ENOMEM; - } - flags |= FI_OPX_CQ_CONTEXT_EXT; - - ext->err_entry.err = 0; - ext->opx_context.next = NULL; - ext->opx_context.src_addr = msg->addr; - ext->opx_context.flags = flags; - ext->opx_context.byte_counter = (uint64_t)-1; - ext->opx_context.tag = msg->tag; - ext->opx_context.ignore = msg->ignore; - ext->msg.op_context = msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; + context->byte_counter = (uint64_t)-1; + context->tag = msg->tag; + context->ignore = msg->ignore; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } ssize_t fi_opx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, uint64_t flags) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->domain->threading; const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); const enum fi_av_type av_type = opx_ep->av_type; + ssize_t rc = 0; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, - opx_ep->reliability->state.kind, - opx_ep->domain->data_progress); + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + opx_ep->reliability->state.kind, + opx_ep->domain->data_progress, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + opx_ep->reliability->state.kind, + opx_ep->domain->data_progress, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + opx_ep->reliability->state.kind, + opx_ep->domain->data_progress, OPX_HFI1_JKR); + } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -233,8 +219,6 @@ ssize_t fi_opx_trecvmsg(struct fid_ep *ep, ssize_t fi_opx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, uint64_t flags) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->threading; const enum fi_av_type av_type = opx_ep->av_type; @@ -254,8 +238,10 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, msg->addr, msg->tag, msg->data, FI_OPX_LOCK_NOT_REQUIRED, av_type, + flags, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } else { rc = fi_opx_ep_tx_send_internal(ep, 0, 0, msg->desc, msg->addr, msg->tag, msg->context, msg->data, @@ -265,7 +251,8 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } } else if (niov == 1) { rc = 
fi_opx_ep_tx_send_internal(ep, msg->msg_iov->iov_base, @@ -277,7 +264,8 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } else { rc = fi_opx_ep_tx_send_internal(ep, msg->msg_iov, msg->iov_count, msg->desc, msg->addr, msg->tag, msg->context, msg->data, @@ -287,174 +275,166 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) - 
-FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 
0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - - - - - -#define FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY) \ - FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY) - -#define FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY) \ - fi_opx_ops_tagged_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY - -#define FI_OPX_TAGGED_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY) \ -static struct fi_ops_tagged \ - FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY) __attribute__ ((unused)) = { \ - .size = sizeof(struct fi_ops_tagged), \ - .recv = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, LOCK, AV, CAPS, RELIABILITY), \ - .recvv = fi_no_tagged_recvv, \ - .recvmsg = fi_opx_trecvmsg, \ - .send = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, LOCK, AV, CAPS, RELIABILITY), \ - .sendv = fi_no_tagged_sendv, \ - .sendmsg = fi_opx_tsendmsg, \ - .inject = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, LOCK, AV, CAPS, RELIABILITY), \ - .senddata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, LOCK, AV, CAPS, RELIABILITY), \ - .injectdata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, LOCK, AV, CAPS, RELIABILITY), \ +/* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) 
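/*
 * Editorial sketch: the FI_OPX_*_SPECIALIZED_FUNC and *_OPS_STRUCT_NAME
 * macros above rely on a two-level token-pasting idiom: the outer macro
 * exists only so that its arguments are macro-expanded *before* the inner
 * macro pastes them into an identifier.  A minimal illustration follows;
 * MAKE_NAME, DEFINE_VARIANT, kind_t, and ALIAS_FAST are hypothetical names
 * for illustration only, not provider symbols.
 */
#include <stdio.h>

typedef enum { KIND_FAST = 0, KIND_SAFE = 1 } kind_t;

#define ALIAS_FAST KIND_FAST /* stands in for an alias-style macro argument */

/* The inner macro pastes tokens verbatim; the outer indirection expands
 * the arguments first, so MAKE_NAME(op, ALIAS_FAST) yields op_KIND_FAST
 * rather than op_ALIAS_FAST. */
#define MAKE_NAME_(base, kind) base##_##kind
#define MAKE_NAME(base, kind)  MAKE_NAME_(base, kind)

/* Generates one specialized function per 'kind'; since 'kind' is a
 * compile-time constant in each expansion, the ternary folds away. */
#define DEFINE_VARIANT(kind) \
	static int MAKE_NAME(op, kind)(int x) { return (kind == KIND_FAST) ? x : x + 1; }

DEFINE_VARIANT(KIND_FAST) /* defines op_KIND_FAST */
DEFINE_VARIANT(KIND_SAFE) /* defines op_KIND_SAFE */

int main(void)
{
	/* The alias resolves through the indirection to op_KIND_FAST. */
	printf("%d %d\n", MAKE_NAME(op, ALIAS_FAST)(1), op_KIND_SAFE(1)); /* prints "1 2" */
	return 0;
}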
+FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) + +/* FI_LOCAL_COMM = 0x0008000000000000ull */ +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) + +/* FI_REMOTE_COMM = 0x0010000000000000ull */ +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 
0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) + +#define FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) + +#define FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + fi_opx_ops_tagged_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE + +#define FI_OPX_TAGGED_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ +static struct fi_ops_tagged \ + FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) __attribute__ ((unused)) = { \ + .size = sizeof(struct fi_ops_tagged), \ + .recv = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .recvv = fi_no_tagged_recvv, \ + .recvmsg = fi_opx_trecvmsg, \ + .send = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .sendv = fi_no_tagged_sendv, \ + .sendmsg = fi_opx_tsendmsg, \ + .inject = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .senddata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .injectdata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ } -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, 
OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 
0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - - - - +/* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + 
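/* Editorial sketch, not part of the patch: FI_OPX_TAGGED_OPS_STRUCT routes
 * through the second-level FI_OPX_TAGGED_OPS_STRUCT_NAME_() macro so that
 * each argument is macro-expanded before ## pastes it, and every
 * instantiation in this block emits a distinct, statically specialized
 * fi_ops_tagged vtable whose name encodes the (lock, AV type, caps,
 * reliability, HFI1 hardware type) combination. The expand-then-paste
 * idiom in a self-contained form, with hypothetical names: */
#if 0 /* illustration only */
#define NAME_(op, hw) handle_ ## op ## _ ## hw
#define NAME(op, hw)  NAME_(op, hw)  /* indirection expands 'hw' before pasting */
#define HW WFR
static void NAME(send, HW)(void) { } /* defines handle_send_WFR(), not handle_send_HW() */
#endif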
+FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + +/* FI_LOCAL_COMM = 0x0008000000000000ull */ +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + +/* FI_REMOTE_COMM = 0x0010000000000000ull */ +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, 
FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); ssize_t fi_opx_tsearch(struct fid_ep *ep, uint64_t *tag, uint64_t ignore, uint64_t flags, @@ -499,8 +479,8 @@ int fi_opx_enable_tagged_ops(struct fid_ep *ep) struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - if (!opx_ep || !opx_ep->domain) - goto err; + if (!opx_ep || !opx_ep->domain) + goto err; if (!(opx_ep->tx->caps & FI_TAGGED) || !(opx_ep->rx->caps & FI_TAGGED)) { /* Tagged ops not enabled on this endpoint */ @@ -517,11 +497,17 @@ int fi_opx_enable_tagged_ops(struct fid_ep *ep) const enum fi_threading threading = opx_ep->domain->threading; if (OFI_UNLIKELY(fi_opx_threading_unknown(threading))) { - opx_ep->ep_fid.tagged = &fi_opx_no_tagged_ops; - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Unknown thread mode, tagged ops not enabled on EP\n"); + opx_ep->ep_fid.tagged = &fi_opx_no_tagged_ops; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Unknown thread mode, tagged ops not enabled on EP\n"); return 0; - } + } + + if (OFI_UNLIKELY(opx_ep->reliability->state.kind != OFI_RELIABILITY_KIND_ONLOAD)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Invalid reliability kind %u\n", opx_ep->reliability->state.kind); + return -FI_EINVAL; + } uint64_t comm_caps = opx_ep->rx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); if (comm_caps == 0) @@ -529,111 +515,138 @@ int fi_opx_enable_tagged_ops(struct fid_ep *ep) const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); - if (!lock_required) { - if (opx_ep->av->type == FI_AV_TABLE) { - - 
if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - - } else if (opx_ep->av->type == FI_AV_MAP) { - - if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - + } else { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else { - if (opx_ep->av->type == FI_AV_TABLE) { - - if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else if (opx_ep->av->type == FI_AV_MAP) { - - if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - + } else { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } + } + } else { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } } } @@ -657,68 +670,183 @@ ssize_t fi_opx_tinject_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) (ep, buf, len, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, dest_addr, tag); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_tsend_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t tag, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_JKR) (ep, buf, len, desc, dest_addr, tag, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_tinjectdata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t tag) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_WFR) (ep, buf, len, data, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, data, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, data, dest_addr, tag); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal 
-FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_tsenddata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, data, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, data, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_JKR) (ep, buf, len, desc, data, dest_addr, tag, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_trecv_FABRIC_DIRECT(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, src_addr, tag, ignore, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, src_addr, tag, ignore, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_JKR) (ep, buf, len, desc, src_addr, tag, ignore, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/fi_opx_tid_domain.c b/prov/opx/src/fi_opx_tid_domain.c index 03024c82f50..946edbab2d2 100644 --- a/prov/opx/src/fi_opx_tid_domain.c +++ b/prov/opx/src/fi_opx_tid_domain.c @@ -149,7 +149,11 @@ int opx_close_tid_domain(struct opx_tid_domain *tid_domain, int locked) } dlist_remove(&tid_domain->list_entry); - ofi_domain_close(&tid_domain->util_domain); + int ret = ofi_domain_close(&tid_domain->util_domain); + if (ret != 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, "Error closing domain: %d\n", ret); + } + free(tid_domain); return 0; diff --git a/prov/opx/src/opa_proto.c b/prov/opx/src/opa_proto.c index 235d2c37e57..742eb43721f 100644 --- a/prov/opx/src/opa_proto.c +++ b/prov/opx/src/opa_proto.c @@ -149,22 +149,12 @@ static int map_hfi_mem(int fd, struct _hfi_ctrl *ctrl, size_t subctxt_cnt) /* 7. Map RXE per-context CSRs */ /* JKR sz is 8K. WFR sz is 4K. 
*/ - if(OPX_HFI1_WFR == opx_hfi1_check_hwversion(binfo->hw_version)){ + if(OPX_HFI1_WFR == opx_hfi1_check_hwversion(binfo->hw_version)) { sz = HFI_MMAP_PGSIZE; -#ifndef OPX_WFR - fprintf(stderr, "Runtime HFI type (%u) found on non-WFR build\n", - opx_hfi1_check_hwversion(binfo->hw_version)); - abort(); -#endif } else { /* JKR prefers 8K page alignment for possible future work with 8K virtual memory pages */ sz = 2*HFI_MMAP_PGSIZE; -#ifndef OPX_JKR - fprintf(stderr, "Runtime HFI type (%u) found on non-JKR build\n", - opx_hfi1_check_hwversion(binfo->hw_version)); - abort(); -#endif } HFI_MMAP_ERRCHECK(fd, binfo, user_regbase, sz, PROT_WRITE|PROT_READ); arrsz[USER_REGBASE] = sz; diff --git a/prov/psm2/src/psmx2_attr.c b/prov/psm2/src/psmx2_attr.c index f7a1f5496d6..920ed918fe8 100644 --- a/prov/psm2/src/psmx2_attr.c +++ b/prov/psm2/src/psmx2_attr.c @@ -335,6 +335,7 @@ void psmx2_update_prov_info(struct fi_info *info, struct psmx2_ep_name *dest_addr) { struct fi_info *p; + int ret; for (p = info; p; p = p->next) { psmx2_dup_addr(p->addr_format, src_addr, @@ -363,10 +364,17 @@ void psmx2_update_prov_info(struct fi_info *info, } free(p->domain_attr->name); - if (unit == PSMX2_DEFAULT_UNIT) + if (unit == PSMX2_DEFAULT_UNIT) { p->domain_attr->name = strdup(psmx2_hfi_info.default_domain_name); - else - asprintf(&p->domain_attr->name, "hfi1_%d", unit); + } else { + ret = asprintf(&p->domain_attr->name, "hfi1_%d", unit); + if (ret < 0) { + p->domain_attr->name = NULL; + FI_WARN(&psmx2_prov, FI_LOG_CORE, + "Failed to allocate domain name for HFI unit %d\n", + unit); + } + } p->tx_attr->inject_size = psmx2_env.inject_size; } diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index da0781e654d..b05416c5e07 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -401,12 +401,12 @@ static int psmx3_atomic_do_write(void *dest, void *src, break; case FI_LOR: - SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE, + SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE, dest,src,count,PSMX3_LOR); break; case FI_LAND: - SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE, + SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE, dest,src,count,PSMX3_LAND); break; @@ -421,7 +421,7 @@ static int psmx3_atomic_do_write(void *dest, void *src, break; case FI_LXOR: - SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE, + SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE, dest,src,count,PSMX3_LXOR); break; @@ -601,7 +601,8 @@ int psmx3_am_atomic_handler(psm2_am_token_t token, if (!op_error) { addr += mr->offset; - psmx3_atomic_do_write(addr, src, datatype, op, count); + op_error = psmx3_atomic_do_write(addr, src, datatype, + op, count); if (rx->ep->caps & FI_RMA_EVENT) { cntr = rx->ep->remote_write_cntr; @@ -646,8 +647,8 @@ int psmx3_am_atomic_handler(psm2_am_token_t token, addr += mr->offset; tmp_buf = malloc(len); if (tmp_buf) - psmx3_atomic_do_readwrite(addr, src, tmp_buf, - datatype, op, count); + op_error = psmx3_atomic_do_readwrite(addr, src, + tmp_buf, datatype, op, count); else op_error = -FI_ENOMEM; @@ -698,9 +699,10 @@ int psmx3_am_atomic_handler(psm2_am_token_t token, addr += mr->offset; tmp_buf = malloc(len); if (tmp_buf) - psmx3_atomic_do_compwrite(addr, src, (uint8_t *)src + len, - tmp_buf, datatype, - op, count); + op_error = psmx3_atomic_do_compwrite(addr, src, + (uint8_t *)src + len, + tmp_buf, datatype, + op, count); else op_error = -FI_ENOMEM; @@ -2067,6 +2069,11 @@ static int psmx3_atomic_writevalid_internal(size_t chunk_size, switch (op) { case FI_MIN: case FI_MAX: + if (datatype == FI_FLOAT_COMPLEX || 
+ datatype == FI_DOUBLE_COMPLEX || + datatype == FI_LONG_DOUBLE_COMPLEX) + return -FI_EOPNOTSUPP; + /* fall through */ case FI_SUM: case FI_PROD: case FI_LOR: @@ -2098,6 +2105,11 @@ static int psmx3_atomic_readwritevalid_internal(size_t chunk_size, switch (op) { case FI_MIN: case FI_MAX: + if (datatype == FI_FLOAT_COMPLEX || + datatype == FI_DOUBLE_COMPLEX || + datatype == FI_LONG_DOUBLE_COMPLEX) + return -FI_EOPNOTSUPP; + /* fall through */ case FI_SUM: case FI_PROD: case FI_LOR: diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index 16074babeac..fa570b455a4 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -183,9 +183,9 @@ do { \ extern struct fi_provider rxm_prov; extern struct util_prov rxm_util_prov; -extern struct fi_ops_msg rxm_msg_ops; +extern struct fi_ops_msg rxm_msg_ops, rxm_no_recv_msg_ops; extern struct fi_ops_msg rxm_msg_thru_ops; -extern struct fi_ops_tagged rxm_tagged_ops; +extern struct fi_ops_tagged rxm_tagged_ops, rxm_no_recv_tagged_ops; extern struct fi_ops_tagged rxm_tagged_thru_ops; extern struct fi_ops_rma rxm_rma_ops; extern struct fi_ops_rma rxm_rma_thru_ops; @@ -265,6 +265,8 @@ struct rxm_fabric { struct rxm_domain { struct util_domain util_domain; struct fid_domain *msg_domain; + struct fid_ep rx_ep; + struct fid_peer_srx *srx; size_t max_atomic_size; size_t rx_post_size; uint64_t mr_key; @@ -417,13 +419,15 @@ struct rxm_pkt { char data[]; }; +enum rxm_sar_seg_type { + RXM_SAR_SEG_FIRST = 1, + RXM_SAR_SEG_MIDDLE = 2, + RXM_SAR_SEG_LAST = 3, +}; + union rxm_sar_ctrl_data { struct { - enum rxm_sar_seg_type { - RXM_SAR_SEG_FIRST = 1, - RXM_SAR_SEG_MIDDLE = 2, - RXM_SAR_SEG_LAST = 3, - } seg_type : 2; + enum rxm_sar_seg_type seg_type : 2; uint32_t offset; }; uint64_t align; @@ -441,24 +445,29 @@ rxm_sar_set_seg_type(struct ofi_ctrl_hdr *ctrl_hdr, enum rxm_sar_seg_type seg_ty ((union rxm_sar_ctrl_data *)&(ctrl_hdr->ctrl_data))->seg_type = seg_type; } -struct rxm_recv_match_attr { - fi_addr_t addr; - uint64_t tag; - uint64_t ignore; -}; - -struct rxm_unexp_msg { - struct dlist_entry entry; - fi_addr_t addr; - uint64_t tag; -}; - struct rxm_iov { struct iovec iov[RXM_IOV_LIMIT]; void *desc[RXM_IOV_LIMIT]; uint8_t count; }; +struct rxm_proto_info { + /* Used for SAR protocol */ + struct { + struct dlist_entry entry; + struct dlist_entry pkt_list; + struct fi_peer_rx_entry *rx_entry; + size_t total_recv_len; + struct rxm_conn *conn; + uint64_t msg_id; + } sar; + /* Used for Rendezvous protocol */ + struct { + /* This is used to send RNDV ACK */ + struct rxm_tx_buf *tx_buf; + } rndv; +}; + struct rxm_buf { /* Must stay at top */ struct fi_context fi_context; @@ -476,9 +485,10 @@ struct rxm_rx_buf { /* MSG EP / shared context to which bufs would be posted to */ struct fid_ep *rx_ep; struct dlist_entry repost_entry; + struct dlist_entry unexp_entry; struct rxm_conn *conn; /* msg ep data was received on */ - struct rxm_recv_entry *recv_entry; - struct rxm_unexp_msg unexp_msg; + struct fi_peer_rx_entry *peer_entry; + struct rxm_proto_info *proto_info; uint64_t comp_flags; struct fi_recv_context recv_context; bool repost; @@ -606,49 +616,6 @@ struct rxm_deferred_tx_entry { }; }; -struct rxm_recv_entry { - struct dlist_entry entry; - struct rxm_iov rxm_iov; - fi_addr_t addr; - void *context; - uint64_t flags; - uint64_t tag; - uint64_t ignore; - uint64_t comp_flags; - size_t total_len; - struct rxm_recv_queue *recv_queue; - - /* Used for SAR protocol */ - struct { - struct dlist_entry entry; - size_t total_recv_len; - struct rxm_conn *conn; - uint64_t msg_id; - } 
sar; - /* Used for Rendezvous protocol */ - struct { - /* This is used to send RNDV ACK */ - struct rxm_tx_buf *tx_buf; - } rndv; -}; -OFI_DECLARE_FREESTACK(struct rxm_recv_entry, rxm_recv_fs); - -enum rxm_recv_queue_type { - RXM_RECV_QUEUE_UNSPEC, - RXM_RECV_QUEUE_MSG, - RXM_RECV_QUEUE_TAGGED, -}; - -struct rxm_recv_queue { - struct rxm_ep *rxm_ep; - enum rxm_recv_queue_type type; - struct rxm_recv_fs *fs; - struct dlist_entry recv_list; - struct dlist_entry unexp_msg_list; - dlist_func_t *match_recv; - dlist_func_t *match_unexp; -}; - struct rxm_eager_ops { void (*comp_tx)(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_eager_buf); @@ -688,6 +655,8 @@ struct rxm_ep { struct fi_ops_transfer_peer *offload_coll_peer_xfer_ops; uint64_t offload_coll_mask; + struct fid_peer_srx *srx; + struct fid_cq *msg_cq; uint64_t msg_cq_last_poll; size_t comp_per_progress; @@ -701,7 +670,6 @@ struct rxm_ep { bool do_progress; bool enable_direct_send; - size_t min_multi_recv_size; size_t buffered_min; size_t buffered_limit; size_t inject_limit; @@ -709,19 +677,18 @@ struct rxm_ep { size_t eager_limit; size_t sar_limit; size_t tx_credit; + size_t min_multi_recv_size; struct ofi_bufpool *rx_pool; struct ofi_bufpool *tx_pool; struct ofi_bufpool *coll_pool; + struct ofi_bufpool *proto_info_pool; + struct rxm_pkt *inject_pkt; struct dlist_entry deferred_queue; struct dlist_entry rndv_wait_list; - struct rxm_recv_queue recv_queue; - struct rxm_recv_queue trecv_queue; - struct ofi_bufpool *multi_recv_pool; - struct rxm_eager_ops *eager_ops; struct rxm_rndv_ops *rndv_ops; }; @@ -755,11 +722,15 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf); +int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context); + int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); - -void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, - void *op_context, int err); +void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err); +void rxm_cq_write_rx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err); void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err); void rxm_handle_comp_error(struct rxm_ep *rxm_ep); ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp); @@ -878,50 +849,6 @@ int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf, size_t len, uint64_t acs, uint64_t flags, struct fid_mr **mr); -static inline void rxm_cntr_incerr(struct util_cntr *cntr) -{ - if (cntr) - cntr->cntr_fid.ops->adderr(&cntr->cntr_fid, 1); -} - -static inline void -rxm_cq_write(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag) -{ - int ret; - - FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", - fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); - - ret = ofi_cq_write(cq, context, flags, len, buf, data, tag); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to report completion\n"); - assert(0); - } - if (cq->wait) - cq->wait->signal(cq->wait); -} - -static inline void -rxm_cq_write_src(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag, fi_addr_t addr) -{ - int ret; - - FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", - fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); - - ret = ofi_cq_write_src(cq, context, 
flags, len, buf, data, tag, addr); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to report completion\n"); - assert(0); - } - if (cq->wait) - cq->wait->signal(cq->wait); -} - ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr, struct rxm_conn **rxm_conn); @@ -956,17 +883,10 @@ ssize_t rxm_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len); -struct rxm_recv_entry * -rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue); -struct rxm_rx_buf * -rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr, - uint64_t tag, uint64_t ignore); -ssize_t rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry *recv_entry, - struct rxm_rx_buf *rx_buf); +ssize_t rxm_handle_unexp_sar(struct fi_peer_rx_entry *peer_entry); +int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context); + int rxm_post_recv(struct rxm_rx_buf *rx_buf); void rxm_av_remove_handler(struct util_ep *util_ep, struct util_peer_addr *peer); @@ -987,41 +907,6 @@ rxm_free_rx_buf(struct rxm_rx_buf *rx_buf) } } -static inline void -rxm_recv_entry_release(struct rxm_recv_entry *entry) -{ - if (entry->recv_queue) - ofi_freestack_push(entry->recv_queue->fs, entry); - else - ofi_buf_free(entry); -} - -static inline void -rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, uint64_t flags, - size_t len, char *buf) -{ - if (rx_buf->ep->util_coll_peer_xfer_ops && - rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { - struct fi_cq_tagged_entry cqe = { - .tag = rx_buf->pkt.hdr.tag, - .op_context = rx_buf->recv_entry->context, - }; - rx_buf->ep->util_coll_peer_xfer_ops-> - complete(rx_buf->ep->util_coll_ep, &cqe, 0); - return; - } - - if (rx_buf->ep->rxm_info->caps & FI_SOURCE) - rxm_cq_write_src(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, - rx_buf->conn->peer->fi_addr); - else - rxm_cq_write(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag); -} - struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key); struct rxm_recv_entry * diff --git a/prov/rxm/src/rxm_attr.c b/prov/rxm/src/rxm_attr.c index defa7771188..6dc1241329e 100644 --- a/prov/rxm/src/rxm_attr.c +++ b/prov/rxm/src/rxm_attr.c @@ -40,7 +40,8 @@ OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ FI_MULTI_RECV) -#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) +#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID | \ + FI_PEER) /* Since we are a layering provider, the attributes for which we rely on the diff --git a/prov/rxm/src/rxm_conn.c b/prov/rxm/src/rxm_conn.c index afe603234ec..73b26f2a9f3 100644 --- a/prov/rxm/src/rxm_conn.c +++ b/prov/rxm/src/rxm_conn.c @@ -58,7 +58,7 @@ struct rxm_eq_cm_entry { static void rxm_close_conn(struct rxm_conn *conn) { struct rxm_deferred_tx_entry *tx_entry; - struct rxm_recv_entry *rx_entry; + struct fi_peer_rx_entry *rx_entry; struct rxm_rx_buf *buf; FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing conn %p\n", conn); @@ -74,16 +74,13 @@ static void rxm_close_conn(struct rxm_conn *conn) while (!dlist_empty(&conn->deferred_sar_segments)) { buf = container_of(conn->deferred_sar_segments.next, - struct rxm_rx_buf, unexp_msg.entry); - dlist_remove(&buf->unexp_msg.entry); - rxm_free_rx_buf(buf); + struct 
diff --git a/prov/rxm/src/rxm_conn.c b/prov/rxm/src/rxm_conn.c
index afe603234ec..73b26f2a9f3 100644
--- a/prov/rxm/src/rxm_conn.c
+++ b/prov/rxm/src/rxm_conn.c
@@ -58,7 +58,7 @@ struct rxm_eq_cm_entry {
 static void rxm_close_conn(struct rxm_conn *conn)
 {
 	struct rxm_deferred_tx_entry *tx_entry;
-	struct rxm_recv_entry *rx_entry;
+	struct fi_peer_rx_entry *rx_entry;
 	struct rxm_rx_buf *buf;

 	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing conn %p\n", conn);
@@ -74,16 +74,14 @@ static void rxm_close_conn(struct rxm_conn *conn)
 	while (!dlist_empty(&conn->deferred_sar_segments)) {
 		buf = container_of(conn->deferred_sar_segments.next,
-				   struct rxm_rx_buf, unexp_msg.entry);
-		dlist_remove(&buf->unexp_msg.entry);
+				   struct rxm_rx_buf, unexp_entry);
+		dlist_remove(&buf->unexp_entry);
 		rxm_free_rx_buf(buf);
 	}

 	while (!dlist_empty(&conn->deferred_sar_msgs)) {
-		rx_entry = container_of(conn->deferred_sar_msgs.next,
-					struct rxm_recv_entry, sar.entry);
-		dlist_remove(&rx_entry->entry);
-		rxm_recv_entry_release(rx_entry);
+		rx_entry = (struct fi_peer_rx_entry *) conn->deferred_sar_msgs.next;
+		rx_entry->srx->owner_ops->free_entry(rx_entry);
 	}
 	fi_close(&conn->msg_ep->fid);
 	rxm_flush_msg_cq(conn->ep);
diff --git a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c
index 27c8cc6f1c0..51206ddde04 100644
--- a/prov/rxm/src/rxm_cq.c
+++ b/prov/rxm/src/rxm_cq.c
@@ -101,6 +101,36 @@ static void rxm_replace_rx_buf(struct rxm_rx_buf *rx_buf)
 		ofi_buf_free(new_rx_buf);
 }
+static void rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context,
+				   uint64_t flags, size_t len, char *buf)
+{
+	int ret;
+
+	flags &= ~FI_COMPLETION;
+	if (rx_buf->ep->util_coll_peer_xfer_ops &&
+	    rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) {
+		struct fi_cq_tagged_entry cqe = {
+			.tag = rx_buf->pkt.hdr.tag,
+			.op_context = rx_buf->peer_entry->context,
+		};
+		rx_buf->ep->util_coll_peer_xfer_ops->
+			complete(rx_buf->ep->util_coll_ep, &cqe, 0);
+		return;
+	}
+	if (rx_buf->ep->rxm_info->caps & FI_SOURCE)
+		ret = ofi_peer_cq_write(rx_buf->ep->util_ep.rx_cq, context,
+					flags, len, buf, rx_buf->pkt.hdr.data,
+					rx_buf->pkt.hdr.tag,
+					rx_buf->conn->peer->fi_addr);
+	else
+		ret = ofi_peer_cq_write(rx_buf->ep->util_ep.rx_cq, context,
+					flags, len, buf, rx_buf->pkt.hdr.data,
+					rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL);
+	if (ret)
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Unable to write rx completion\n");
+}
+
 static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
 {
 	uint64_t flags;
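With the peer CQ model introduced in rxm_cq_write_recv_comp() above, rxm no longer writes its own CQ entries and signals the wait object itself; every receive completion funnels through ofi_peer_cq_write(), with FI_ADDR_NOTAVAIL standing in for the source address whenever FI_SOURCE is off. Distilled into a hypothetical helper (the ofi_peer_cq_write() signature is the one used throughout this patch; the wrapper itself is illustrative only and assumes the libfabric-internal ofi_util.h definitions that rxm uses):

    #include <stdbool.h>

    static int write_rx_completion(struct util_cq *cq, void *context,
                                   uint64_t flags, size_t len, void *buf,
                                   uint64_t data, uint64_t tag,
                                   bool have_src, fi_addr_t src_addr)
    {
        /* The owner CQ performs the actual writeback (and any wait-set
         * signaling); the peer only reports the event. */
        return ofi_peer_cq_write(cq, context, flags, len, buf, data, tag,
                                 have_src ? src_addr : FI_ADDR_NOTAVAIL);
    }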
@@ -108,7 +138,7 @@ static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
 	if ((rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) &&
 	    rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) != RXM_SAR_SEG_FIRST) {
-		dlist_insert_tail(&rx_buf->unexp_msg.entry,
+		dlist_insert_tail(&rx_buf->unexp_entry,
 				  &rx_buf->conn->deferred_sar_segments);
 		rxm_replace_rx_buf(rx_buf);
 	}
@@ -136,19 +166,20 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len)
 	int ret;

 	if (rx_buf->ep->util_ep.flags & OFI_CNTR_ENABLED)
-		rxm_cntr_incerr(rx_buf->ep->util_ep.cntrs[CNTR_RX]);
+		ofi_ep_peer_rx_cntr_incerr(&rx_buf->ep->util_ep, ofi_op_msg);

 	FI_WARN(&rxm_prov, FI_LOG_CQ, "Message truncated: "
 		"recv buf length: %zu message length: %" PRIu64 "\n",
 		done_len, rx_buf->pkt.hdr.size);
-	ret = ofi_cq_write_error_trunc(rx_buf->ep->util_ep.rx_cq,
-				       rx_buf->recv_entry->context,
-				       rx_buf->recv_entry->comp_flags |
-				       rx_buf->pkt.hdr.flags,
-				       rx_buf->pkt.hdr.size,
-				       rx_buf->recv_entry->rxm_iov.iov[0].iov_base,
-				       rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag,
-				       rx_buf->pkt.hdr.size - done_len);
+	ret = ofi_peer_cq_write_error_trunc(
+				rx_buf->ep->util_ep.rx_cq,
+				rx_buf->peer_entry->context,
+				rx_buf->peer_entry->flags |
+				rx_buf->pkt.hdr.flags,
+				rx_buf->pkt.hdr.size,
+				rx_buf->peer_entry->iov[0].iov_base,
+				rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag,
+				rx_buf->pkt.hdr.size - done_len);
 	if (ret) {
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to write recv error CQ\n");
 		assert(0);
@@ -157,27 +188,22 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len)
 static void rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len)
 {
-	struct rxm_recv_entry *recv_entry = rx_buf->recv_entry;
-
 	if (done_len < rx_buf->pkt.hdr.size) {
 		rxm_cq_write_error_trunc(rx_buf, done_len);
 		goto release;
 	}

-	if (rx_buf->recv_entry->flags & FI_COMPLETION ||
+	if (rx_buf->peer_entry->flags & FI_COMPLETION ||
 	    rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) {
-		rxm_cq_write_recv_comp(rx_buf, rx_buf->recv_entry->context,
-				       rx_buf->recv_entry->comp_flags |
-				       rx_buf->pkt.hdr.flags |
-				       (rx_buf->recv_entry->flags & FI_MULTI_RECV),
+		rxm_cq_write_recv_comp(rx_buf, rx_buf->peer_entry->context,
+				       rx_buf->peer_entry->flags |
+				       rx_buf->pkt.hdr.flags,
 				       rx_buf->pkt.hdr.size,
-				       rx_buf->recv_entry->rxm_iov.
-					       iov[0].iov_base);
+				       rx_buf->peer_entry->iov[0].iov_base);
 	}
-	ofi_ep_cntr_inc(&rx_buf->ep->util_ep, CNTR_RX);
-
+	ofi_ep_peer_rx_cntr_inc(&rx_buf->ep->util_ep, ofi_op_msg);
 release:
-	rxm_recv_entry_release(recv_entry);
+	rx_buf->ep->srx->owner_ops->free_entry(rx_buf->peer_entry);
 	rxm_free_rx_buf(rx_buf);
 }
@@ -186,8 +212,9 @@ rxm_cq_write_tx_comp(struct rxm_ep *rxm_ep, uint64_t comp_flags,
 		     void *app_context, uint64_t flags)
 {
 	if (flags & FI_COMPLETION) {
-		rxm_cq_write(rxm_ep->util_ep.tx_cq, app_context,
-			     comp_flags, 0, NULL, 0, 0);
+		(void) ofi_peer_cq_write(rxm_ep->util_ep.tx_cq, app_context,
+					 comp_flags, 0, NULL, 0, 0,
+					 FI_ADDR_NOTAVAIL);
 	}
 }
@@ -201,9 +228,9 @@ static void rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_tx_buf *rma_buf,
 			    rma_buf->flags);

 	if (comp_flags & FI_WRITE)
-		ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_WR);
+		ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_write);
 	else
-		ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_RD);
+		ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_read_req);

 	if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local &&
 	    rxm_ep->msg_mr_local) {
@@ -219,7 +246,7 @@ void rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_buf)
 	rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
 			     tx_buf->app_context, tx_buf->flags);
-	ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX);
+	ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg);
 }

 static bool rxm_complete_sar(struct rxm_ep *rxm_ep,
@@ -259,23 +286,25 @@ static void rxm_handle_sar_comp(struct rxm_ep *rxm_ep,
 		return;

 	rxm_cq_write_tx_comp(rxm_ep, comp_flags, app_context, tx_flags);
-	ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX);
+	ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg);
 }

 static void rxm_rndv_rx_finish(struct rxm_rx_buf *rx_buf)
 {
+	struct rxm_proto_info *proto_info;
+
 	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_FINISH);

-	if (rx_buf->recv_entry->rndv.tx_buf) {
-		ofi_buf_free(rx_buf->recv_entry->rndv.tx_buf);
-		rx_buf->recv_entry->rndv.tx_buf = NULL;
+	proto_info = rx_buf->proto_info;
+	if (proto_info->rndv.tx_buf) {
+		ofi_buf_free(proto_info->rndv.tx_buf);
+		ofi_buf_free(proto_info);
 	}

 	if (!rx_buf->ep->rdm_mr_local)
-		rxm_msg_mr_closev(rx_buf->mr,
-				  rx_buf->recv_entry->rxm_iov.count);
+		rxm_msg_mr_closev(rx_buf->mr, rx_buf->peer_entry->count);

-	rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len);
+	rxm_finish_recv(rx_buf, rx_buf->peer_entry->msg_size);
 }

 static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep,
@@ -295,7 +324,7 @@ static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep,
 		ofi_buf_free(tx_buf->write_rndv.done_buf);
 		tx_buf->write_rndv.done_buf = NULL;
 	}
-	ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX);
+	ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg);

 	rxm_free_tx_buf(rxm_ep, tx_buf);
 }
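The ordering in rxm_rndv_rx_finish() above matters: the rendezvous ACK buffer must be released before the rxm_proto_info that owns the pointer, because freeing the owner first would read rndv.tx_buf through freed memory. The same ownership rule in a standalone toy model (plain malloc/free, not rxm code):

    #include <stdlib.h>

    struct ack_buf { char pkt[64]; };
    struct proto_info { struct ack_buf *tx_buf; };

    static void proto_info_release(struct proto_info *pi)
    {
        if (pi->tx_buf)
            free(pi->tx_buf);  /* contained buffer first... */
        free(pi);              /* ...then the structure that owns the pointer */
    }

    int main(void)
    {
        struct proto_info *pi = calloc(1, sizeof(*pi));
        if (!pi)
            return 1;
        pi->tx_buf = calloc(1, sizeof(*pi->tx_buf));
        proto_info_release(pi);
        return 0;
    }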
@@ -368,96 +397,134 @@ static int rxm_rx_buf_match_msg_id(struct dlist_entry *item, const void *arg)
 	uint64_t msg_id = *((uint64_t *) arg);
 	struct rxm_rx_buf *rx_buf;

-	rx_buf = container_of(item, struct rxm_rx_buf, unexp_msg.entry);
+	rx_buf = container_of(item, struct rxm_rx_buf, unexp_entry);
 	return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id);
 }

-static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
+static void rxm_init_sar_proto(struct rxm_rx_buf *rx_buf)
+{
+	struct rxm_proto_info *proto_info;
+
+	proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool);
+	if (!proto_info) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Failed to allocate proto info buffer\n");
+		return;
+	}
+	if (!rx_buf->conn) {
+		rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
+					  (int) rx_buf->pkt.ctrl_hdr.conn_id);
+	}
+
+	proto_info->sar.conn = rx_buf->conn;
+	proto_info->sar.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
+	proto_info->sar.total_recv_len = 0;
+	proto_info->sar.rx_entry = rx_buf->peer_entry;
+
+	dlist_insert_tail(&proto_info->sar.entry,
+			  &rx_buf->conn->deferred_sar_msgs);
+
+	dlist_init(&proto_info->sar.pkt_list);
+	if (rx_buf->peer_entry->peer_context)
+		dlist_insert_tail(&rx_buf->unexp_entry,
+				  &proto_info->sar.pkt_list);
+
+	rx_buf->proto_info = proto_info;
+}
+
+int rxm_process_seg_data(struct rxm_rx_buf *rx_buf)
 {
 	enum fi_hmem_iface iface;
+	struct rxm_proto_info *proto_info;
 	uint64_t device;
 	ssize_t done_len;
+	int done = 0;

-	iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov,
-					       rx_buf->recv_entry->rxm_iov.desc,
-					       rx_buf->recv_entry->rxm_iov.count,
+	proto_info = rx_buf->proto_info;
+	iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->peer_entry->iov,
					       rx_buf->peer_entry->desc,
+					       rx_buf->peer_entry->count,
 					       &device);

 	done_len = ofi_copy_to_hmem_iov(iface, device,
-					rx_buf->recv_entry->rxm_iov.iov,
-					rx_buf->recv_entry->rxm_iov.count,
-					rx_buf->recv_entry->sar.total_recv_len,
+					rx_buf->peer_entry->iov,
+					rx_buf->peer_entry->count,
+					proto_info->sar.total_recv_len,
 					rx_buf->pkt.data,
 					rx_buf->pkt.ctrl_hdr.seg_size);
 	assert(done_len == rx_buf->pkt.ctrl_hdr.seg_size);

-	rx_buf->recv_entry->sar.total_recv_len += done_len;
+	proto_info->sar.total_recv_len += done_len;

 	if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST) ||
 	    (done_len != rx_buf->pkt.ctrl_hdr.seg_size)) {
-
-		dlist_remove(&rx_buf->recv_entry->sar.entry);
-
-		/* Mark rxm_recv_entry::msg_id as unknown for futher re-use */
-		rx_buf->recv_entry->sar.msg_id = RXM_SAR_RX_INIT;
-
-		done_len = rx_buf->recv_entry->sar.total_recv_len;
-		rx_buf->recv_entry->sar.total_recv_len = 0;
-
-		*done = 1;
+		if (!rx_buf->peer_entry->peer_context)
+			dlist_remove(&proto_info->sar.entry);
+		done_len = proto_info->sar.total_recv_len;
+		done = 1;
+		ofi_buf_free(rx_buf->proto_info);
 		rxm_finish_recv(rx_buf, done_len);
 	} else {
-		if (rx_buf->recv_entry->sar.msg_id == RXM_SAR_RX_INIT) {
-			if (!rx_buf->conn) {
-				rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
-						(int) rx_buf->pkt.ctrl_hdr.conn_id);
-			}
-
-			rx_buf->recv_entry->sar.conn = rx_buf->conn;
-			rx_buf->recv_entry->sar.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
-
-			dlist_insert_tail(&rx_buf->recv_entry->sar.entry,
-					  &rx_buf->conn->deferred_sar_msgs);
-		}
 		/* The RX buffer can be reposted for further re-use */
-		rx_buf->recv_entry = NULL;
+		rx_buf->peer_entry = NULL;
 		rxm_free_rx_buf(rx_buf);
-
-		*done = 0;
 	}
+	return done;
 }

 static void rxm_handle_seg_data(struct rxm_rx_buf *rx_buf)
 {
-	struct rxm_recv_entry *recv_entry;
+	struct rxm_proto_info *proto_info;
+	struct fi_peer_rx_entry *rx_entry;
 	struct rxm_conn *conn;
 	uint64_t msg_id;
 	struct dlist_entry *entry;
-	int done;

-	rxm_process_seg_data(rx_buf, &done);
-	if (done || !(rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV))
+	if (dlist_empty(&rx_buf->proto_info->sar.pkt_list)) {
+		rxm_process_seg_data(rx_buf);
 		return;
+	}
+
+	proto_info
= rx_buf->proto_info; + dlist_insert_tail(&rx_buf->unexp_entry, &proto_info->sar.pkt_list); - recv_entry = rx_buf->recv_entry; + if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST)) + dlist_remove(&proto_info->sar.entry); + + rx_entry = rx_buf->peer_entry; conn = rx_buf->conn; msg_id = rx_buf->pkt.ctrl_hdr.msg_id; dlist_foreach_container_safe(&conn->deferred_sar_segments, struct rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id)) + unexp_entry, entry) { + if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_entry, &msg_id)) continue; - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - rxm_process_seg_data(rx_buf, &done); - if (done) + dlist_remove(&rx_buf->unexp_entry); + rx_buf->peer_entry = rx_entry; + if (rxm_process_seg_data(rx_buf)) break; } } +ssize_t rxm_handle_unexp_sar(struct fi_peer_rx_entry *peer_entry) +{ + struct rxm_proto_info *proto_info; + struct rxm_rx_buf *rx_buf; + + rx_buf = (struct rxm_rx_buf *) peer_entry->peer_context; + proto_info = rx_buf->proto_info; + + while (!dlist_empty(&proto_info->sar.pkt_list)) { + dlist_pop_front(&proto_info->sar.pkt_list, + struct rxm_rx_buf, rx_buf, unexp_entry); + rxm_process_seg_data(rx_buf); + } + peer_entry->peer_context = NULL; + return FI_SUCCESS; +} + static ssize_t rxm_rndv_xfer(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep, struct rxm_rndv_hdr *remote_hdr, struct iovec *local_iov, void **local_desc, size_t local_count, size_t total_len, @@ -508,18 +576,19 @@ ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf) ssize_t ret; size_t total_len; - total_len = MIN(rx_buf->recv_entry->total_len, rx_buf->pkt.hdr.size); + total_len = MIN(rx_buf->peer_entry->msg_size, rx_buf->pkt.hdr.size); + rx_buf->peer_entry->msg_size = total_len; RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ); ret = rxm_rndv_xfer(rx_buf->ep, rx_buf->conn->msg_ep, rx_buf->remote_rndv_hdr, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, total_len, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, total_len, rx_buf); if (ret) { - rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, - rx_buf->ep->util_ep.cntrs[CNTR_RX], rx_buf, (int) ret); + rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, rx_buf, + (int) ret); } return ret; } @@ -561,9 +630,8 @@ static ssize_t rxm_rndv_handle_wr_data(struct rxm_rx_buf *rx_buf) tx_buf->rma.count, total_len, tx_buf); if (ret) - rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, - rx_buf->ep->util_ep.cntrs[CNTR_RX], - tx_buf, (int) ret); + rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, tx_buf, (int) ret); + rxm_free_rx_buf(rx_buf); return ret; } @@ -592,28 +660,26 @@ static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf) rx_buf->rndv_rma_index = 0; if (!rx_buf->ep->rdm_mr_local) { - total_recv_len = MIN(rx_buf->recv_entry->total_len, + total_recv_len = MIN(rx_buf->peer_entry->msg_size, rx_buf->pkt.hdr.size); - ret = rxm_msg_mr_regv(rx_buf->ep, rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, + ret = rxm_msg_mr_regv(rx_buf->ep, rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, total_recv_len, rx_buf->ep->rndv_ops->rx_mr_access, rx_buf->mr); if (ret) return ret; - for (i = 0; (i < rx_buf->recv_entry->rxm_iov.count && + for (i = 0; (i < rx_buf->peer_entry->count && rx_buf->mr[i]); i++) { - rx_buf->recv_entry->rxm_iov.desc[i] = - fi_mr_desc(rx_buf->mr[i]); + rx_buf->peer_entry->desc[i] = fi_mr_desc(rx_buf->mr[i]); } } else { struct rxm_mr *mr; 
- for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++) { - mr = rx_buf->recv_entry->rxm_iov.desc[i]; - rx_buf->recv_entry->rxm_iov.desc[i] = - fi_mr_desc(mr->msg_mr); + for (i = 0; i < rx_buf->peer_entry->count; i++) { + mr = rx_buf->peer_entry->desc[i]; + rx_buf->peer_entry->desc[i] = fi_mr_desc(mr->msg_mr); rx_buf->mr[i] = mr->msg_mr; } } @@ -627,9 +693,9 @@ static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf) void rxm_handle_eager(struct rxm_rx_buf *rx_buf) { ssize_t done_len = rxm_copy_to_hmem_iov( - rx_buf->recv_entry->rxm_iov.desc, rx_buf->data, - rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, 0); + rx_buf->peer_entry->desc, rx_buf->data, + rx_buf->pkt.hdr.size, rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, 0); assert((size_t) done_len == rx_buf->pkt.hdr.size); @@ -642,14 +708,14 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) uint64_t device; ssize_t done_len; - iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, + iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, &device); done_len = ofi_copy_to_hmem_iov(iface, device, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, 0, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, 0, rx_buf->data, rx_buf->pkt.hdr.size); assert((size_t) done_len == rx_buf->pkt.hdr.size); @@ -657,11 +723,11 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { struct fi_cq_tagged_entry cqe = { .tag = rx_buf->pkt.hdr.tag, - .op_context = rx_buf->recv_entry->context, + .op_context = rx_buf->peer_entry->context, }; rx_buf->ep->util_coll_peer_xfer_ops-> complete(rx_buf->ep->util_coll_ep, &cqe, 0); - rxm_recv_entry_release(rx_buf->recv_entry); + rx_buf->ep->srx->owner_ops->free_entry(rx_buf->peer_entry); rxm_free_rx_buf(rx_buf); } else { rxm_finish_recv(rx_buf, done_len); @@ -686,73 +752,26 @@ ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf) } } -static void rxm_adjust_multi_recv(struct rxm_rx_buf *rx_buf) +static inline void rxm_entry_prep_for_queue(struct fi_peer_rx_entry *rx_entry, + struct rxm_rx_buf *rx_buf) { - struct rxm_recv_entry *recv_entry; - struct iovec new_iov; - size_t recv_size; - - recv_size = rx_buf->pkt.hdr.size; - - if (rx_buf->recv_entry->rxm_iov.iov[0].iov_len < recv_size || - rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size < - rx_buf->ep->min_multi_recv_size) - return; - - new_iov.iov_base = (uint8_t *) - rx_buf->recv_entry->rxm_iov.iov[0].iov_base + recv_size; - new_iov.iov_len = rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size;; - - rx_buf->recv_entry->rxm_iov.iov[0].iov_len = recv_size; - - recv_entry = rxm_multi_recv_entry_get(rx_buf->ep, &new_iov, - rx_buf->recv_entry->rxm_iov.desc, 1, - rx_buf->recv_entry->addr, - rx_buf->recv_entry->tag, - rx_buf->recv_entry->ignore, - rx_buf->recv_entry->context, - rx_buf->recv_entry->flags); - - rx_buf->recv_entry->flags &= ~FI_MULTI_RECV; - - dlist_insert_head(&recv_entry->entry, &rx_buf->ep->recv_queue.recv_list); -} - -static ssize_t -rxm_match_rx_buf(struct rxm_rx_buf *rx_buf, - struct rxm_recv_queue *recv_queue, - struct rxm_recv_match_attr *match_attr) -{ - struct dlist_entry *entry; - - entry = dlist_remove_first_match(&recv_queue->recv_list, - recv_queue->match_recv, match_attr); - if (entry) { - rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry); - 
- if (rx_buf->recv_entry->flags & FI_MULTI_RECV) - rxm_adjust_multi_recv(rx_buf); - - return rxm_handle_rx_buf(rx_buf); - } - - RXM_DBG_ADDR_TAG(FI_LOG_CQ, "No matching recv found for incoming msg", - match_attr->addr, match_attr->tag); - FI_DBG(&rxm_prov, FI_LOG_CQ, "Enqueueing msg to unexpected msg queue\n"); - rx_buf->unexp_msg.addr = match_attr->addr; - rx_buf->unexp_msg.tag = match_attr->tag; - - dlist_insert_tail(&rx_buf->unexp_msg.entry, - &recv_queue->unexp_msg_list); + rx_entry->peer_context = rx_buf; + rx_buf->peer_entry = rx_entry; + if (rx_buf->pkt.hdr.flags & FI_REMOTE_CQ_DATA) { + rx_entry->flags |= FI_REMOTE_CQ_DATA; + rx_entry->cq_data = rx_buf->pkt.hdr.data; + } + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) + rxm_init_sar_proto(rx_buf); rxm_replace_rx_buf(rx_buf); - return 0; } static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) { - struct rxm_recv_match_attr match_attr = { - .addr = FI_ADDR_UNSPEC, - }; + struct fid_peer_srx *srx = rx_buf->ep->srx; + struct fi_peer_rx_entry *rx_entry; + struct fi_peer_match_attr match = {0}; + int ret; if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) { if (rx_buf->ep->msg_srx) @@ -760,7 +779,9 @@ static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) (int) rx_buf->pkt.ctrl_hdr.conn_id); if (!rx_buf->conn) return -FI_EOTHER; - match_attr.addr = rx_buf->conn->peer->fi_addr; + match.addr = rx_buf->conn->peer->fi_addr; + } else { + match.addr = FI_ADDR_UNSPEC; } if (rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) { @@ -770,33 +791,52 @@ static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) switch(rx_buf->pkt.hdr.op) { case ofi_op_msg: + match.msg_size = rx_buf->pkt.hdr.size; FI_DBG(&rxm_prov, FI_LOG_CQ, "Got MSG op\n"); - return rxm_match_rx_buf(rx_buf, &rx_buf->ep->recv_queue, - &match_attr); + ret = srx->owner_ops->get_msg(srx, &match, &rx_entry); + if (ret == -FI_ENOENT) { + rxm_entry_prep_for_queue(rx_entry, rx_buf); + return srx->owner_ops->queue_msg(rx_entry); + } + rx_entry->peer_context = NULL; + break; case ofi_op_tagged: + match.tag = rx_buf->pkt.hdr.tag; + match.msg_size = rx_buf->pkt.hdr.size; FI_DBG(&rxm_prov, FI_LOG_CQ, "Got TAGGED op\n"); - match_attr.tag = rx_buf->pkt.hdr.tag; - return rxm_match_rx_buf(rx_buf, &rx_buf->ep->trecv_queue, - &match_attr); + ret = srx->owner_ops->get_tag(srx, &match, &rx_entry); + if (ret == -FI_ENOENT) { + rxm_entry_prep_for_queue(rx_entry, rx_buf); + return srx->owner_ops->queue_tag(rx_entry); + } + rx_entry->peer_context = NULL; + break; default: FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown op!\n"); assert(0); return -FI_EINVAL; } + rx_buf->peer_entry = rx_entry; + + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) + rxm_init_sar_proto(rx_buf); + + return rxm_handle_rx_buf(rx_buf); } static int rxm_sar_match_msg_id(struct dlist_entry *item, const void *arg) { uint64_t msg_id = *((uint64_t *) arg); - struct rxm_recv_entry *recv_entry; + struct rxm_proto_info *proto_info; - recv_entry = container_of(item, struct rxm_recv_entry, sar.entry); - return (msg_id == recv_entry->sar.msg_id); + proto_info = container_of(item, struct rxm_proto_info, sar.entry); + return (msg_id == proto_info->sar.msg_id); } static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) { struct dlist_entry *sar_entry; + struct rxm_proto_info *proto_info; rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, (int) rx_buf->pkt.ctrl_hdr.conn_id); @@ -812,8 +852,9 @@ static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) if (!sar_entry) return rxm_handle_recv_comp(rx_buf); - 
rx_buf->recv_entry = container_of(sar_entry, struct rxm_recv_entry,
-					  sar.entry);
+	proto_info = container_of(sar_entry, struct rxm_proto_info, sar.entry);
+	rx_buf->peer_entry = proto_info->sar.rx_entry;
+	rx_buf->proto_info = proto_info;
 	rxm_handle_seg_data(rx_buf);
 	return 0;
 }
@@ -831,8 +872,15 @@ static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf)
 		ret = -FI_ENOMEM;
 		goto err;
 	}
+	rx_buf->proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool);
+	if (!rx_buf->proto_info) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Failed to allocate proto info buf\n");
+		assert(0);
+		return;
+	}

-	rx_buf->recv_entry->rndv.tx_buf = buf;
+	rx_buf->proto_info->rndv.tx_buf = buf;

 	buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_rd_done;
 	buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index;
@@ -859,8 +907,9 @@ static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf)
 	return;

 free:
+	rx_buf->proto_info->rndv.tx_buf = NULL;
+	ofi_buf_free(rx_buf->proto_info);
 	ofi_buf_free(buf);
-	rx_buf->recv_entry->rndv.tx_buf = NULL;
 err:
 	FI_WARN(&rxm_prov, FI_LOG_CQ,
 		"unable to allocate/send rd rndv ack: %s\n",
@@ -939,14 +988,22 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf)
 		goto err;
 	}

-	rx_buf->recv_entry->rndv.tx_buf = buf;
+	rx_buf->proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool);
+	if (!rx_buf->proto_info) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Failed to allocate proto info buf\n");
+		return -FI_ENOMEM;
+	}
+
+	rx_buf->proto_info->rndv.tx_buf = buf;
+
 	buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_wr_data;
 	buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index;
 	buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
 	rxm_rndv_hdr_init(rx_buf->ep, buf->pkt.data,
-			  rx_buf->recv_entry->rxm_iov.iov,
-			  rx_buf->recv_entry->rxm_iov.count, rx_buf->mr);
+			  rx_buf->peer_entry->iov,
+			  rx_buf->peer_entry->count, rx_buf->mr);

 	ret = fi_send(rx_buf->conn->msg_ep, &buf->pkt, sizeof(buf->pkt)
 		      + sizeof(struct rxm_rndv_hdr), buf->hdr.desc, 0, rx_buf);
@@ -970,8 +1027,9 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf)
 	return 0;

 free:
+	rx_buf->proto_info->rndv.tx_buf = NULL;
+	ofi_buf_free(rx_buf->proto_info);
 	ofi_buf_free(buf);
-	rx_buf->recv_entry->rndv.tx_buf = NULL;
 err:
 	FI_WARN(&rxm_prov, FI_LOG_CQ,
 		"unable to allocate/send wr rndv ready: %s\n",
@@ -986,9 +1044,9 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf)
 static void rxm_handle_remote_write(struct rxm_ep *rxm_ep,
 				    struct fi_cq_data_entry *comp)
 {
-	rxm_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, comp->len, NULL,
-		     comp->data, 0);
-	ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_WR);
+	ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, comp->len,
+			  NULL, comp->data, 0, FI_ADDR_NOTAVAIL);
+	ofi_ep_peer_rx_cntr_inc(&rxm_ep->util_ep, ofi_op_write);
 	if (comp->op_context)
 		rxm_free_rx_buf(comp->op_context);
 }
@@ -1222,10 +1280,7 @@ static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 	}
 	result_len = op == ofi_op_atomic ?
0 : offset; - if (op == ofi_op_atomic) - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_WR); - else - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_RD); + ofi_ep_peer_rx_cntr_inc(&rxm_ep->util_ep, op); return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, result_len, FI_SUCCESS); @@ -1236,7 +1291,6 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, { struct rxm_tx_buf *tx_buf; struct rxm_atomic_resp_hdr *resp_hdr; - struct util_cntr *cntr = NULL; uint64_t len; ssize_t copy_len; ssize_t ret = 0; @@ -1286,33 +1340,15 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), tx_buf->app_context, tx_buf->flags); - if (tx_buf->pkt.hdr.op == ofi_op_atomic) { - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_WR); - } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || - tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_RD); - } else { - ret = -FI_EOPNOTSUPP; - goto write_err; - } + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, tx_buf->pkt.hdr.op); free: rxm_free_rx_buf(rx_buf); rxm_free_tx_buf(rxm_ep, tx_buf); return ret; write_err: - if (tx_buf->pkt.hdr.op == ofi_op_atomic) { - cntr = rxm_ep->util_ep.cntrs[CNTR_WR]; - } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || - tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { - cntr = rxm_ep->util_ep.cntrs[CNTR_RD]; - } else { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "unknown atomic request op!\n"); - assert(0); - } - rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr, - tx_buf->app_context, (int) ret); + rxm_cq_write_tx_error(rxm_ep, tx_buf->pkt.hdr.op, tx_buf->app_context, + (int) ret); goto free; } @@ -1480,23 +1516,38 @@ ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp) } } -void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, - void *op_context, int err) +void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err) { struct fi_cq_err_entry err_entry = {0}; err_entry.op_context = op_context; err_entry.prov_errno = err; err_entry.err = -err; - if (cntr) - rxm_cntr_incerr(cntr); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, op); - if (ofi_cq_write_error(cq, &err_entry)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + if (ofi_peer_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry)) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } +void rxm_cq_write_rx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err) +{ + struct fi_cq_err_entry err_entry = {0}; + err_entry.op_context = op_context; + err_entry.prov_errno = err; + err_entry.err = -err; + + ofi_ep_peer_rx_cntr_incerr(&rxm_ep->util_ep, op); + + if (ofi_peer_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry)) + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); +} + void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) { struct fi_cq_err_entry err_entry = {0}; @@ -1505,32 +1556,26 @@ void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) err_entry.prov_errno = err; err_entry.err = -err; if (rxm_ep->util_ep.tx_cq) { - ret = ofi_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry); + ret = ofi_peer_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to ofi_cq_write_error\n"); + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } if (rxm_ep->util_ep.rx_cq) { - ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); + ret = ofi_peer_cq_write_error(rxm_ep->util_ep.rx_cq, 
&err_entry); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to ofi_cq_write_error\n"); + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } - if (rxm_ep->util_ep.cntrs[CNTR_TX]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_TX]); - - if (rxm_ep->util_ep.cntrs[CNTR_RX]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_RX]); - if (rxm_ep->util_ep.cntrs[CNTR_WR]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_WR]); - - if (rxm_ep->util_ep.cntrs[CNTR_RD]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_RD]); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_msg); + ofi_ep_peer_rx_cntr_incerr(&rxm_ep->util_ep, ofi_op_msg); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_write); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_read_req); } void rxm_handle_comp_error(struct rxm_ep *rxm_ep) @@ -1583,7 +1628,7 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) case RXM_INJECT_TX: rxm_free_tx_buf(rxm_ep, err_entry.op_context); if (cntr) - rxm_cntr_incerr(cntr); + cntr->peer_cntr->owner_ops->incerr(cntr->peer_cntr); return; case RXM_CREDIT_TX: case RXM_ATOMIC_RESP_SENT: /* BUG: should have consumed tx credit */ @@ -1622,7 +1667,7 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) * the event yet. */ rx_buf = (struct rxm_rx_buf *) err_entry.op_context; - if (!rx_buf->recv_entry) { + if (!rx_buf->peer_entry) { ofi_buf_free((struct rxm_rx_buf *)err_entry.op_context); return; } @@ -1631,9 +1676,9 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) case RXM_RNDV_WRITE_DATA_SENT: /* BUG: should fail initial send */ case RXM_RNDV_READ: rx_buf = (struct rxm_rx_buf *) err_entry.op_context; - assert(rx_buf->recv_entry); - err_entry.op_context = rx_buf->recv_entry->context; - err_entry.flags = rx_buf->recv_entry->comp_flags; + assert(rx_buf->peer_entry); + err_entry.op_context = rx_buf->peer_entry->context; + err_entry.flags = rx_buf->peer_entry->flags; cq = rx_buf->ep->util_ep.rx_cq; cntr = rx_buf->ep->util_ep.cntrs[CNTR_RX]; @@ -1647,12 +1692,13 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) } if (cntr) - rxm_cntr_incerr(cntr); + cntr->peer_cntr->owner_ops->incerr(cntr->peer_cntr); assert(cq); - ret = ofi_cq_write_error(cq, &err_entry); + ret = ofi_peer_cq_write_error(cq, &err_entry); if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } @@ -1665,8 +1711,8 @@ ssize_t rxm_thru_comp(struct rxm_ep *ep, struct fi_cq_data_entry *comp) cq = (comp->flags & (FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ)) ? ep->util_ep.rx_cq : ep->util_ep.tx_cq; - ret = ofi_cq_write(cq, comp->op_context, comp->flags, comp->len, - comp->buf, comp->data, 0); + ret = ofi_peer_cq_write(cq, comp->op_context, comp->flags, comp->len, + comp->buf, comp->data, 0, FI_ADDR_NOTAVAIL); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to report completion\n"); assert(0); @@ -1692,9 +1738,10 @@ void rxm_thru_comp_error(struct rxm_ep *ep) } cq = (err_entry.flags & FI_RECV) ? 
ep->util_ep.rx_cq : ep->util_ep.tx_cq; - ret = ofi_cq_write_error(cq, &err_entry); + ret = ofi_peer_cq_write_error(cq, &err_entry); if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } @@ -1730,8 +1777,8 @@ ssize_t rxm_cq_owner_write(struct fid_peer_cq *peer_cq, void *context, } rxm_cq = container_of(peer_cq, struct rxm_cq, peer_cq); - return ofi_cq_write(&rxm_cq->util_cq, req->app_context, req->flags, len, - buf, data, tag); + return ofi_peer_cq_write(&rxm_cq->util_cq, req->app_context, req->flags, + len, buf, data, tag, FI_ADDR_NOTAVAIL); } ssize_t rxm_cq_owner_writeerr(struct fid_peer_cq *peer_cq, @@ -1751,7 +1798,7 @@ ssize_t rxm_cq_owner_writeerr(struct fid_peer_cq *peer_cq, } rxm_cq = container_of(peer_cq, struct rxm_cq, peer_cq); - return ofi_cq_write_error(&rxm_cq->util_cq, &cqe_err); + return ofi_peer_cq_write_error(&rxm_cq->util_cq, &cqe_err); } int rxm_post_recv(struct rxm_rx_buf *rx_buf) @@ -1762,7 +1809,8 @@ int rxm_post_recv(struct rxm_rx_buf *rx_buf) if (rx_buf->ep->msg_srx) rx_buf->conn = NULL; rx_buf->hdr.state = RXM_RX; - rx_buf->recv_entry = NULL; + rx_buf->peer_entry = NULL; + rx_buf->proto_info = NULL; domain = container_of(rx_buf->ep->util_ep.domain, struct rxm_domain, util_domain); @@ -1840,7 +1888,7 @@ void rxm_ep_do_progress(struct util_ep *util_ep) rxm_conn_progress(rxm_ep); } } else { - rxm_conn_progress(rxm_ep); + rxm_conn_progress(rxm_ep); } } } while ((ret > 0) && (comp_read < rxm_ep->comp_per_progress)); @@ -1957,6 +2005,9 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto err1; + if (attr->flags & FI_PEER) + goto out; + rxm_domain = container_of(domain, struct rxm_domain, util_domain.domain_fid); @@ -1978,11 +2029,12 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto err2; } + rxm_cq->util_cq.cq_fid.ops = &rxm_cq_ops; +out: *cq_fid = &rxm_cq->util_cq.cq_fid; /* Override util_cq_fi_ops */ (*cq_fid)->fid.ops = &rxm_cq_fi_ops; - (*cq_fid)->ops = &rxm_cq_ops; return 0; err2: diff --git a/prov/rxm/src/rxm_domain.c b/prov/rxm/src/rxm_domain.c index 055fca16bea..9fcadf56763 100644 --- a/prov/rxm/src/rxm_domain.c +++ b/prov/rxm/src/rxm_domain.c @@ -221,6 +221,25 @@ static struct fi_ops_av_owner rxm_av_owner_ops = { .ep_addr = rxm_peer_av_ep_addr, }; +static fi_addr_t rxm_get_addr(struct fi_peer_rx_entry *rx_entry) +{ + struct rxm_rx_buf *rx_buf = rx_entry->peer_context; + + return rx_buf->conn->peer->fi_addr; +} + +static void rxm_foreach_ep(struct util_av *av, struct util_ep *ep) +{ + struct rxm_ep *rxm_ep; + struct fid_peer_srx *peer_srx; + + rxm_ep = container_of(ep, struct rxm_ep, util_ep); + peer_srx = container_of(rxm_ep->srx, struct fid_peer_srx, ep_fid); + if (peer_srx) + peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &rxm_get_addr); +} + + static int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context) @@ -236,7 +255,8 @@ rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, ret = rxm_util_av_open(domain_fid, attr, &fid_av_new, context, sizeof(struct rxm_conn), - ofi_av_remove_cleanup ? rxm_av_remove_handler : NULL); + ofi_av_remove_cleanup ? 
rxm_av_remove_handler : NULL, + &rxm_foreach_ep); if (ret) return ret; @@ -346,7 +366,7 @@ static struct fi_ops_domain rxm_domain_ops = { .cntr_open = rxm_cntr_open, .poll_open = fi_poll_create, .stx_ctx = fi_no_stx_context, - .srx_ctx = fi_no_srx_context, + .srx_ctx = rxm_srx_context, .query_atomic = rxm_ep_query_atomic, .query_collective = rxm_query_collective, }; diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index ba6a949122e..de375cc010d 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -42,79 +42,6 @@ #include "rxm.h" -static int rxm_match_noop(struct dlist_entry *item, const void *arg) -{ - OFI_UNUSED(item); - OFI_UNUSED(arg); - return 1; -} - -static int rxm_match_recv_entry(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_addr(recv_entry->addr, attr->addr); -} - -static int rxm_match_recv_entry_tag(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); -} - -static int rxm_match_recv_entry_tag_addr(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_addr(recv_entry->addr, attr->addr) && - ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); -} - -static int rxm_match_recv_entry_context(struct dlist_entry *item, const void *context) -{ - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return recv_entry->context == context; -} - -static fi_addr_t rxm_get_unexp_addr(struct rxm_unexp_msg *unexp_msg) -{ - struct rxm_rx_buf *rx_buf; - - rx_buf = container_of(unexp_msg, struct rxm_rx_buf, unexp_msg); - return (unexp_msg->addr != FI_ADDR_UNSPEC) ? 
- unexp_msg->addr : rx_buf->conn->peer->fi_addr; -} - -static int rxm_match_unexp_msg(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_addr(attr->addr, rxm_get_unexp_addr(unexp_msg)); -} - -static int rxm_match_unexp_msg_tag(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); -} - -static int rxm_match_unexp_msg_tag_addr(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_addr(attr->addr, rxm_get_unexp_addr(unexp_msg)) && - ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); -} - static int rxm_buf_reg(struct ofi_bufpool_region *region) { struct rxm_ep *rxm_ep = region->pool->attr.context; @@ -158,6 +85,7 @@ static void rxm_init_rx_buf(struct ofi_bufpool_region *region, void *buf) fi_mr_desc((struct fid_mr *) region->context) : NULL; rx_buf->ep = ep; rx_buf->data = &rx_buf->pkt.data; + dlist_init(&rx_buf->unexp_entry); } static void rxm_init_tx_buf(struct ofi_bufpool_region *region, void *buf) @@ -186,69 +114,6 @@ static void rxm_buf_close(struct ofi_bufpool_region *region) } } -static void rxm_recv_entry_init(struct rxm_recv_entry *entry, void *arg) -{ - struct rxm_recv_queue *recv_queue = arg; - - assert(recv_queue->type != RXM_RECV_QUEUE_UNSPEC); - - entry->recv_queue = recv_queue; - entry->sar.msg_id = RXM_SAR_RX_INIT; - entry->sar.total_recv_len = 0; - /* set it to NULL to differentiate between regular ACKs and those - * sent with FI_INJECT */ - entry->rndv.tx_buf = NULL; - entry->comp_flags = FI_RECV; - - if (recv_queue->type == RXM_RECV_QUEUE_MSG) - entry->comp_flags |= FI_MSG; - else - entry->comp_flags |= FI_TAGGED; -} - -static int rxm_recv_queue_init(struct rxm_ep *rxm_ep, struct rxm_recv_queue *recv_queue, - size_t size, enum rxm_recv_queue_type type) -{ - recv_queue->rxm_ep = rxm_ep; - recv_queue->type = type; - recv_queue->fs = rxm_recv_fs_create(size, rxm_recv_entry_init, - recv_queue); - if (!recv_queue->fs) - return -FI_ENOMEM; - - dlist_init(&recv_queue->recv_list); - dlist_init(&recv_queue->unexp_msg_list); - if (type == RXM_RECV_QUEUE_MSG) { - if (rxm_ep->rxm_info->caps & FI_DIRECTED_RECV) { - recv_queue->match_recv = rxm_match_recv_entry; - recv_queue->match_unexp = rxm_match_unexp_msg; - } else { - recv_queue->match_recv = rxm_match_noop; - recv_queue->match_unexp = rxm_match_noop; - } - } else { - if (rxm_ep->rxm_info->caps & FI_DIRECTED_RECV) { - recv_queue->match_recv = rxm_match_recv_entry_tag_addr; - recv_queue->match_unexp = rxm_match_unexp_msg_tag_addr; - } else { - recv_queue->match_recv = rxm_match_recv_entry_tag; - recv_queue->match_unexp = rxm_match_unexp_msg_tag; - } - } - - return 0; -} - -static void rxm_recv_queue_close(struct rxm_recv_queue *recv_queue) -{ - /* It indicates that the recv_queue were allocated */ - if (recv_queue->fs) { - rxm_recv_fs_free(recv_queue->fs); - recv_queue->fs = NULL; - } - // TODO cleanup recv_list and unexp msg list -} - static int rxm_ep_create_pools(struct rxm_ep *rxm_ep) { struct ofi_bufpool_attr attr = {0}; @@ -287,8 +152,18 @@ static int 
rxm_ep_create_pools(struct rxm_ep *rxm_ep) "Unable to create peer xfer context pool\n"); goto free_tx_pool; } - return 0; + attr.size = sizeof(struct rxm_proto_info); + attr.alloc_fn = NULL; + attr.free_fn = NULL; + attr.init_fn = NULL; + ret = ofi_bufpool_create_attr(&attr, &rxm_ep->proto_info_pool); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "Unable to create proto info pool\n"); + goto free_tx_pool; + } + return 0; free_tx_pool: ofi_bufpool_destroy(rxm_ep->tx_pool); @@ -298,62 +173,13 @@ static int rxm_ep_create_pools(struct rxm_ep *rxm_ep) return ret; } -static int rxm_multi_recv_pool_init(struct rxm_ep *rxm_ep) -{ - struct ofi_bufpool_attr attr = { - .size = sizeof(struct rxm_recv_entry), - .alignment = 16, - .max_cnt = 0, - .chunk_cnt = 16, - .alloc_fn = NULL, - .init_fn = NULL, - .context = rxm_ep, - .flags = OFI_BUFPOOL_NO_TRACK, - }; - - return ofi_bufpool_create_attr(&attr, &rxm_ep->multi_recv_pool); -} - -static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep) -{ - int ret; - - ret = rxm_recv_queue_init(rxm_ep, &rxm_ep->recv_queue, - rxm_ep->rxm_info->rx_attr->size, - RXM_RECV_QUEUE_MSG); - if (ret) - return ret; - - ret = rxm_recv_queue_init(rxm_ep, &rxm_ep->trecv_queue, - rxm_ep->rxm_info->rx_attr->size, - RXM_RECV_QUEUE_TAGGED); - if (ret) - goto err_recv_tag; - - ret = rxm_multi_recv_pool_init(rxm_ep); - if (ret) - goto err_multi; - - return FI_SUCCESS; - -err_multi: - rxm_recv_queue_close(&rxm_ep->trecv_queue); -err_recv_tag: - rxm_recv_queue_close(&rxm_ep->recv_queue); - return ret; -} - /* It is safe to call this function, even if `rxm_ep_txrx_res_open` * has not yet been called */ static void rxm_ep_txrx_res_close(struct rxm_ep *ep) { - rxm_recv_queue_close(&ep->trecv_queue); - rxm_recv_queue_close(&ep->recv_queue); + if (ep->srx && ep->util_ep.ep_fid.msg != &rxm_no_recv_msg_ops) + (void) util_srx_close(&ep->srx->ep_fid.fid); - if (ep->multi_recv_pool) { - ofi_bufpool_destroy(ep->multi_recv_pool); - ep->multi_recv_pool = NULL; - } if (ep->rx_pool) { ofi_bufpool_destroy(ep->rx_pool); ep->rx_pool = NULL; @@ -362,6 +188,10 @@ static void rxm_ep_txrx_res_close(struct rxm_ep *ep) ofi_bufpool_destroy(ep->tx_pool); ep->tx_pool = NULL; } + if (ep->proto_info_pool) { + ofi_bufpool_destroy(ep->proto_info_pool); + ep->proto_info_pool = NULL; + } if (ep->coll_pool) { ofi_bufpool_destroy(ep->coll_pool); ep->coll_pool = NULL; @@ -420,53 +250,13 @@ static struct rxm_eager_ops coll_eager_ops = { .handle_rx = rxm_handle_coll_eager, }; -static bool rxm_ep_cancel_recv(struct rxm_ep *rxm_ep, - struct rxm_recv_queue *recv_queue, void *context) -{ - struct fi_cq_err_entry err_entry; - struct rxm_recv_entry *recv_entry; - struct dlist_entry *entry; - int ret; - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - entry = dlist_remove_first_match(&recv_queue->recv_list, - rxm_match_recv_entry_context, - context); - if (!entry) - goto unlock; - - recv_entry = container_of(entry, struct rxm_recv_entry, entry); - memset(&err_entry, 0, sizeof(err_entry)); - err_entry.op_context = recv_entry->context; - err_entry.flags |= recv_entry->comp_flags; - err_entry.tag = recv_entry->tag; - err_entry.err = FI_ECANCELED; - err_entry.prov_errno = -FI_ECANCELED; - rxm_recv_entry_release(recv_entry); - ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); - assert(0); - } - -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return entry != NULL; -} - static ssize_t rxm_ep_cancel(fid_t fid_ep, void *context) { struct rxm_ep *ep; ep 
= container_of(fid_ep, struct rxm_ep, util_ep.ep_fid); - if (rxm_passthru_info(ep->rxm_info)) - return fi_cancel(&ep->msg_srx->fid, context); - - if (!rxm_ep_cancel_recv(ep, &ep->trecv_queue, context)) - rxm_ep_cancel_recv(ep, &ep->recv_queue, context); - - return 0; + return ep->srx->ep_fid.ops->cancel(&ep->srx->ep_fid.fid, context); } static int rxm_ep_getopt(fid_t fid, int level, int optname, void *optval, @@ -480,10 +270,8 @@ static int rxm_ep_getopt(fid_t fid, int level, int optname, void *optval, switch (optname) { case FI_OPT_MIN_MULTI_RECV: - assert(sizeof(rxm_ep->min_multi_recv_size) == sizeof(size_t)); - *(size_t *)optval = rxm_ep->min_multi_recv_size; - *optlen = sizeof(size_t); - break; + return rxm_ep->srx->ep_fid.ops->getopt(&rxm_ep->srx->ep_fid.fid, + level, optname, optval, optlen); case FI_OPT_BUFFERED_MIN: assert(sizeof(rxm_ep->buffered_min) == sizeof(size_t)); *(size_t *)optval = rxm_ep->buffered_min; @@ -508,10 +296,7 @@ static int rxm_ep_setopt(fid_t fid, int level, int optname, switch (optname) { case FI_OPT_MIN_MULTI_RECV: rxm_ep->min_multi_recv_size = *(size_t *)optval; - FI_INFO(&rxm_prov, FI_LOG_CORE, - "FI_OPT_MIN_MULTI_RECV set to %zu\n", - rxm_ep->min_multi_recv_size); - break; + return ret; case FI_OPT_BUFFERED_MIN: if (rxm_ep->rx_pool) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, @@ -564,99 +349,6 @@ static struct fi_ops_ep rxm_ops_ep = { .tx_size_left = fi_no_tx_size_left, }; - -/* Caller must hold recv_queue->lock -- TODO which lock? */ -struct rxm_rx_buf * -rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr, - uint64_t tag, uint64_t ignore) -{ - struct rxm_recv_match_attr match_attr; - struct dlist_entry *entry; - - if (dlist_empty(&recv_queue->unexp_msg_list)) - return NULL; - - match_attr.addr = addr; - match_attr.tag = tag; - match_attr.ignore = ignore; - - entry = dlist_find_first_match(&recv_queue->unexp_msg_list, - recv_queue->match_unexp, &match_attr); - if (!entry) - return NULL; - - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Match for posted recv found in unexp" - " msg list\n", match_attr.addr, match_attr.tag); - - return container_of(entry, struct rxm_rx_buf, unexp_msg.entry); -} - -static void rxm_recv_entry_init_common(struct rxm_recv_entry *recv_entry, - const struct iovec *iov, void **desc, size_t count, - fi_addr_t src_addr, uint64_t tag, uint64_t ignore, - void *context, uint64_t flags, - struct rxm_recv_queue *recv_queue) -{ - size_t i; - - assert(!recv_entry->rndv.tx_buf); - recv_entry->rxm_iov.count = (uint8_t) count; - recv_entry->addr = src_addr; - recv_entry->context = context; - recv_entry->flags = flags; - recv_entry->ignore = ignore; - recv_entry->tag = tag; - - recv_entry->sar.msg_id = RXM_SAR_RX_INIT; - recv_entry->sar.total_recv_len = 0; - recv_entry->total_len = 0; - - for (i = 0; i < count; i++) { - recv_entry->rxm_iov.iov[i] = iov[i]; - recv_entry->total_len += iov[i].iov_len; - if (desc && desc[i]) - recv_entry->rxm_iov.desc[i] = desc[i]; - else - recv_entry->rxm_iov.desc[i] = NULL; - } -} - -struct rxm_recv_entry * -rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue) -{ - struct rxm_recv_entry *recv_entry; - - if (ofi_freestack_isempty(recv_queue->fs)) - return NULL; - - recv_entry = ofi_freestack_pop(recv_queue->fs); - - rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag, - ignore, context, flags, recv_queue); - - return recv_entry; -} - 
-struct rxm_recv_entry * -rxm_multi_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags) -{ - struct rxm_recv_entry *recv_entry; - - recv_entry = ofi_buf_alloc(rxm_ep->multi_recv_pool); - - rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag, - ignore, context, flags, NULL); - - recv_entry->comp_flags = FI_MSG | FI_RECV; - return recv_entry; -} - struct rxm_tx_buf *rxm_get_tx_buf(struct rxm_ep *ep) { struct rxm_tx_buf *buf; @@ -746,9 +438,8 @@ rxm_ep_sar_handle_segment_failure(struct rxm_deferred_tx_entry *def_tx_entry, { rxm_ep_sar_tx_cleanup(def_tx_entry->rxm_ep, def_tx_entry->rxm_conn, def_tx_entry->sar_seg.cur_seg_tx_buf); - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_TX], - def_tx_entry->sar_seg.app_context, (int) ret); + rxm_cq_write_tx_error(def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->sar_seg.app_context, (int) ret); } /* Returns FI_SUCCESS if the SAR deferred TX queue is empty, @@ -821,6 +512,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn) { struct rxm_deferred_tx_entry *def_tx_entry; + struct rxm_proto_info *proto_info; struct iovec iov; struct fi_msg msg; ssize_t ret = 0; @@ -833,24 +525,22 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, struct rxm_deferred_tx_entry, entry); switch (def_tx_entry->type) { case RXM_DEFERRED_TX_RNDV_ACK: + proto_info = def_tx_entry->rndv_ack.rx_buf->proto_info; ret = fi_send(def_tx_entry->rxm_conn->msg_ep, - &def_tx_entry->rndv_ack.rx_buf-> - recv_entry->rndv.tx_buf->pkt, + &proto_info->rndv.tx_buf->pkt, def_tx_entry->rndv_ack.pkt_size, - def_tx_entry->rndv_ack.rx_buf->recv_entry-> - rndv.tx_buf->hdr.desc, + proto_info->rndv.tx_buf->hdr.desc, 0, def_tx_entry->rndv_ack.rx_buf); if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_ack.rx_buf-> - recv_entry->context, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_ack.rx_buf-> + peer_entry->context, (int) ret); } - if (def_tx_entry->rndv_ack.rx_buf->recv_entry->rndv - .tx_buf->pkt.ctrl_hdr - .type == rxm_ctrl_rndv_rd_done) + if (proto_info->rndv.tx_buf->pkt.ctrl_hdr.type == + rxm_ctrl_rndv_rd_done) RXM_UPDATE_STATE(FI_LOG_EP_DATA, def_tx_entry->rndv_ack.rx_buf, RXM_RNDV_READ_DONE_SENT); @@ -868,9 +558,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_TX], - def_tx_entry->rndv_done.tx_buf, (int) ret); + rxm_cq_write_tx_error(def_tx_entry->rxm_ep, + ofi_op_msg, + def_tx_entry->rndv_done.tx_buf, + (int) ret); } RXM_UPDATE_STATE(FI_LOG_EP_DATA, def_tx_entry->rndv_done.tx_buf, @@ -888,10 +579,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_read.rx_buf-> + peer_entry->context, (int) ret); } break; case RXM_DEFERRED_TX_RNDV_WRITE: @@ -906,9 +597,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret 
== -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_write.tx_buf, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_write.tx_buf, + (int) ret); } break; case RXM_DEFERRED_TX_SAR_SEG: @@ -939,11 +631,12 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, OFI_PRIORITY); if (ret) { if (ret != -FI_EAGAIN) { - rxm_cq_write_error( - def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, + ofi_op_msg, def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + peer_entry->context, + (int) ret); } return; } @@ -1449,11 +1142,9 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) sizeof(struct rxm_rndv_hdr))), rxm_buffer_size); - assert(!rxm_ep->min_multi_recv_size); - rxm_ep->min_multi_recv_size = rxm_buffer_size; - assert(!rxm_ep->buffered_limit); rxm_ep->buffered_limit = rxm_buffer_size; + rxm_ep->min_multi_recv_size = rxm_buffer_size; rxm_config_direct_send(rxm_ep); rxm_ep_init_proto(rxm_ep); @@ -1463,13 +1154,11 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) "\t\t MR local: MSG - %d, RxM - %d\n" "\t\t Completions per progress: MSG - %zu\n" "\t\t Buffered min: %zu\n" - "\t\t Min multi recv size: %zu\n" "\t\t inject size: %zu\n" "\t\t Protocol limits: Eager: %zu, SAR: %zu\n", rxm_ep->msg_mr_local, rxm_ep->rdm_mr_local, rxm_ep->comp_per_progress, rxm_ep->buffered_min, - rxm_ep->min_multi_recv_size, rxm_ep->inject_limit, - rxm_ep->eager_limit, rxm_ep->sar_limit); + rxm_ep->inject_limit, rxm_ep->eager_limit, rxm_ep->sar_limit); } static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep) @@ -1482,19 +1171,7 @@ static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep) dlist_init(&rxm_ep->deferred_queue); - ret = rxm_ep_rx_queue_init(rxm_ep); - if (ret) - goto err; - return FI_SUCCESS; -err: - ofi_bufpool_destroy(rxm_ep->coll_pool); - ofi_bufpool_destroy(rxm_ep->rx_pool); - ofi_bufpool_destroy(rxm_ep->tx_pool); - rxm_ep->coll_pool = NULL; - rxm_ep->rx_pool = NULL; - rxm_ep->tx_pool = NULL; - return ret; } static int rxm_ep_enable_check(struct rxm_ep *rxm_ep) @@ -1524,9 +1201,129 @@ static int rxm_ep_enable_check(struct rxm_ep *rxm_ep) return 0; } +static int rxm_unexp_start(struct fi_peer_rx_entry *rx_entry) +{ + struct rxm_rx_buf *rx_buf = rx_entry->peer_context; + + return rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg ? 
+		rxm_handle_unexp_sar(rx_entry) :
+		rxm_handle_rx_buf(rx_buf);
+}
+
+static int rxm_discard(struct fi_peer_rx_entry *rx_entry)
+{
+	struct rxm_rx_buf *rx_buf, *seg_rx;
+
+	rx_buf = rx_entry->peer_context;
+
+	if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) {
+		while (!dlist_empty(&rx_buf->proto_info->sar.pkt_list)) {
+			dlist_pop_front(&rx_buf->proto_info->sar.pkt_list,
+					struct rxm_rx_buf, seg_rx, unexp_entry);
+			rxm_free_rx_buf(seg_rx);
+		}
+		ofi_buf_free(rx_buf->proto_info);
+	}
+
+	rxm_free_rx_buf(rx_buf);
+	return FI_SUCCESS;
+}
+
+struct fi_ops_srx_peer rxm_srx_peer_ops = {
+	.size = sizeof(struct fi_ops_srx_peer),
+	.start_msg = rxm_unexp_start,
+	.start_tag = rxm_unexp_start,
+	.discard_msg = rxm_discard,
+	.discard_tag = rxm_discard,
+};
+
+static int rxm_srx_close(struct fid *fid)
+{
+	struct rxm_domain *domain = container_of(fid, struct rxm_domain,
+						 rx_ep.fid);
+
+	ofi_atomic_dec32(&domain->util_domain.ref);
+
+	return FI_SUCCESS;
+}
+
+static struct fi_ops rxm_srx_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = rxm_srx_close,
+	.bind = fi_no_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_msg rxm_srx_msg_ops = {
+	.size = sizeof(struct fi_ops_msg),
+	.recv = fi_no_msg_recv,
+	.recvv = fi_no_msg_recvv,
+	.recvmsg = fi_no_msg_recvmsg,
+	.send = fi_no_msg_send,
+	.sendv = fi_no_msg_sendv,
+	.sendmsg = fi_no_msg_sendmsg,
+	.inject = fi_no_msg_inject,
+	.senddata = fi_no_msg_senddata,
+	.injectdata = fi_no_msg_injectdata,
+};
+
+static struct fi_ops_tagged rxm_srx_tagged_ops = {
+	.size = sizeof(struct fi_ops_tagged),
+	.recv = fi_no_tagged_recv,
+	.recvv = fi_no_tagged_recvv,
+	.recvmsg = fi_no_tagged_recvmsg,
+	.send = fi_no_tagged_send,
+	.sendv = fi_no_tagged_sendv,
+	.sendmsg = fi_no_tagged_sendmsg,
+	.inject = fi_no_tagged_inject,
+	.senddata = fi_no_tagged_senddata,
+	.injectdata = fi_no_tagged_injectdata,
+};
+
+int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr,
+		    struct fid_ep **rx_ep, void *context)
+{
+	struct rxm_domain *rxm_domain;
+
+	if (!(attr->op_flags & FI_PEER)) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+			"shared srx only supported with FI_PEER flag\n");
+		return -FI_EINVAL;
+	}
+
+	rxm_domain = container_of(domain, struct rxm_domain,
+				  util_domain.domain_fid);
+
+	if (rxm_domain->srx) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+			"Peer SRX context already imported\n");
+		return -FI_EINVAL;
+	}
+
+	rxm_domain->srx = ((struct fi_peer_srx_context *)
+			   (context))->srx;
+	rxm_domain->srx->peer_ops = &rxm_srx_peer_ops;
+	rxm_domain->rx_ep.msg = &rxm_srx_msg_ops;
+	rxm_domain->rx_ep.tagged = &rxm_srx_tagged_ops;
+	rxm_domain->rx_ep.fid.ops = &rxm_srx_fi_ops;
+	rxm_domain->rx_ep.fid.fclass = FI_CLASS_SRX_CTX;
+	*rx_ep = &rxm_domain->rx_ep;
+	ofi_atomic_inc32(&rxm_domain->util_domain.ref);
+
+	return FI_SUCCESS;
+}
+
+static void rxm_update(struct util_srx_ctx *srx, struct util_rx_entry *rx_entry)
+{
+	// no update needed
+}
+
 static int rxm_ep_ctrl(struct fid *fid, int command, void *arg)
 {
 	struct rxm_ep *ep;
+	struct rxm_domain *domain;
+	struct fid_ep *srx;
 	int ret;

 	ep = container_of(fid, struct rxm_ep, util_ep.ep_fid.fid);
@@ -1562,6 +1359,32 @@ static int rxm_ep_ctrl(struct fid *fid, int command, void *arg)
 		if (ret)
 			return ret;

+		if (!ep->srx) {
+			domain = container_of(ep->util_ep.domain,
+					      struct rxm_domain,
+					      util_domain.domain_fid);
+			ret = util_ep_srx_context(&domain->util_domain,
+					ep->rxm_info->rx_attr->size,
+					RXM_IOV_LIMIT, ep->min_multi_recv_size,
+					&rxm_update, &ep->util_ep.lock,
+					&srx);
+			if (ret)
+ return ret; + + ep->srx = container_of(srx, struct fid_peer_srx, + ep_fid.fid); + ep->srx->peer_ops = &rxm_srx_peer_ops; + + ret = util_srx_bind(&ep->srx->ep_fid.fid, + &ep->util_ep.rx_cq->cq_fid.fid, + FI_RECV); + if (ret) + return ret; + } else { + ep->util_ep.ep_fid.msg = &rxm_no_recv_msg_ops; + ep->util_ep.ep_fid.tagged = &rxm_no_recv_tagged_ops; + } + if (ep->msg_srx && !rxm_passthru_info(ep->rxm_info)) { ret = rxm_prepost_recv(ep, ep->msg_srx); if (ret) @@ -1590,10 +1413,21 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) struct rxm_av *rxm_av; struct rxm_cq *rxm_cq; struct rxm_eq *rxm_eq; - int ret, retv = 0; + int ret; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + if (bfid->fclass == FI_CLASS_SRX_CTX) { + if (rxm_ep->srx) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "SRX context already bound to EP\n"); + return -FI_EINVAL; + } + rxm_ep->srx = + (container_of(bfid, struct rxm_domain, rx_ep.fid))->srx; + return FI_SUCCESS; + } + ret = ofi_ep_bind(&rxm_ep->util_ep, bfid, flags); if (ret) return ret; @@ -1606,14 +1440,14 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_av->util_coll_av->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_av->offload_coll_av) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_av->offload_coll_av->fid, flags); if (ret) - retv = ret; + return ret; } break; @@ -1624,14 +1458,14 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_cq->util_coll_cq->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_cq->offload_coll_cq) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_cq->offload_coll_cq->fid, flags); if (ret) - retv = ret; + return ret; } break; @@ -1642,19 +1476,18 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_eq->util_coll_eq->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_eq->offload_coll_eq) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_eq->offload_coll_eq->fid, flags); if (ret) - retv = ret; + return ret; } - break; } - return retv; + return FI_SUCCESS; } static struct fi_ops rxm_ep_fi_ops = { diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c index 78610bc5f04..10a7ae535d7 100644 --- a/prov/rxm/src/rxm_init.c +++ b/prov/rxm/src/rxm_init.c @@ -262,6 +262,9 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, core_info->rx_attr->op_flags &= ~FI_MULTI_RECV; + core_info->domain_attr->caps &= ~(FI_AV_USER_ID | FI_PEER); + core_info->caps &= ~(FI_AV_USER_ID | FI_PEER); + return 0; } diff --git a/prov/rxm/src/rxm_msg.c b/prov/rxm/src/rxm_msg.c index 46cd1cfe285..5d48e88e53a 100644 --- a/prov/rxm/src/rxm_msg.c +++ b/prov/rxm/src/rxm_msg.c @@ -40,214 +40,16 @@ #include "rxm.h" - -ssize_t rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry *recv_entry, - struct rxm_rx_buf *rx_buf) -{ - struct rxm_recv_match_attr match_attr; - struct dlist_entry *entry; - bool last; - ssize_t ret; - - ret = rxm_handle_rx_buf(rx_buf); - last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST; - if (ret || last) - return ret; - - match_attr.addr = recv_entry->addr; - match_attr.tag = recv_entry->tag; - match_attr.ignore = recv_entry->ignore; - - dlist_foreach_container_safe(&recv_queue->unexp_msg_list, - struct rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if 
(!recv_queue->match_unexp(&rx_buf->unexp_msg.entry, - &match_attr)) - continue; - /* Handle unordered completions from MSG provider */ - if ((rx_buf->pkt.ctrl_hdr.msg_id != recv_entry->sar.msg_id) || - ((rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg))) - continue; - - if (!rx_buf->conn) { - rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, - (int) rx_buf->pkt.ctrl_hdr.conn_id); - } - if (recv_entry->sar.conn != rx_buf->conn) - continue; - rx_buf->recv_entry = recv_entry; - dlist_remove(&rx_buf->unexp_msg.entry); - last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == - RXM_SAR_SEG_LAST; - ret = rxm_handle_rx_buf(rx_buf); - if (ret || last) - break; - } - return ret; -} - -/* - * We don't expect to have unexpected messages when the app is using - * multi-recv buffers. Optimize for that case. - * - * If there are unexpected messages waiting when we post a mult-recv buffer, - * we trim off the start of the buffer, treat it as a normal buffer, and pair - * it with an unexpected message. We continue doing this until either no - * unexpected messages are left or the multi-recv buffer has been consumed. - */ -static ssize_t -rxm_post_mrecv(struct rxm_ep *ep, const struct iovec *iov, - void **desc, void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - struct iovec cur_iov = *iov; - ssize_t ret; - - do { - recv_entry = rxm_recv_entry_get(ep, &cur_iov, desc, 1, - FI_ADDR_UNSPEC, 0, 0, context, - op_flags, &ep->recv_queue); - if (!recv_entry) { - ret = -FI_ENOMEM; - break; - } - - rx_buf = rxm_get_unexp_msg(&ep->recv_queue, recv_entry->addr, 0, 0); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &ep->recv_queue.recv_list); - return 0; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - recv_entry->flags &= ~FI_MULTI_RECV; - recv_entry->total_len = MIN(cur_iov.iov_len, rx_buf->pkt.hdr.size); - recv_entry->rxm_iov.iov[0].iov_len = recv_entry->total_len; - - cur_iov.iov_base = (uint8_t *) cur_iov.iov_base + recv_entry->total_len; - cur_iov.iov_len -= recv_entry->total_len; - - if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) - ret = rxm_handle_rx_buf(rx_buf); - else - ret = rxm_handle_unexp_sar(&ep->recv_queue, recv_entry, - rx_buf); - - } while (!ret && cur_iov.iov_len >= ep->min_multi_recv_size); - - if ((cur_iov.iov_len < ep->min_multi_recv_size) || - (ret && cur_iov.iov_len != iov->iov_len)) { - rxm_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, - 0, NULL, 0, 0); - } - - return ret; -} - -static ssize_t -rxm_recv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - ssize_t ret; - - assert(rxm_ep->util_ep.rx_cq); - assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (op_flags & FI_MULTI_RECV) { - ret = rxm_post_mrecv(rxm_ep, iov, desc, context, op_flags); - goto release; - } - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, src_addr, - 0, 0, context, op_flags, - &rxm_ep->recv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto release; - } - - rx_buf = rxm_get_unexp_msg(&rxm_ep->recv_queue, recv_entry->addr, 0, 0); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &rxm_ep->recv_queue.recv_list); - ret = FI_SUCCESS; - goto release; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - - ret = (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) ? 
- rxm_handle_rx_buf(rx_buf) : - rxm_handle_unexp_sar(&rxm_ep->recv_queue, recv_entry, rx_buf); - -release: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - -static ssize_t -rxm_buf_recv(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context, uint64_t flags) -{ - struct rxm_recv_entry *recv_entry; - struct fi_recv_context *recv_ctx = context; - struct rxm_rx_buf *rx_buf; - ssize_t ret = 0; - - context = recv_ctx->context; - rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Claiming buffered receive\n"); - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, - src_addr, 0, 0, context, - flags, &rxm_ep->recv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto unlock; - } - - recv_entry->comp_flags |= FI_CLAIM; - - rx_buf->recv_entry = recv_entry; - ret = rxm_handle_rx_buf(rx_buf); - } else { - assert(flags & FI_DISCARD); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Discarding buffered receive\n"); - rxm_free_rx_buf(rx_buf); - } -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - static ssize_t rxm_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) - return rxm_buf_recv(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, - flags | rxm_ep->util_ep.rx_msg_flags); - - return rxm_recv_common(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, - flags | rxm_ep->util_ep.rx_msg_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, msg->msg_iov, + msg->desc, msg->iov_count, msg->addr, + msg->context, + flags | rxm_ep->util_ep.rx_msg_flags); } @@ -262,8 +64,9 @@ rxm_recv(struct fid_ep *ep_fid, void *buf, size_t len, .iov_len = len, }; - return rxm_recv_common(rxm_ep, &iov, &desc, 1, src_addr, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -273,8 +76,9 @@ rxm_recvv(struct fid_ep *ep_fid, const struct iovec *iov, struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_recv_common(rxm_ep, iov, desc, count, src_addr, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, iov, desc, count, + src_addr, context, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -661,15 +465,13 @@ rxm_send_eager(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, eager_buf->app_context = context; eager_buf->flags = flags; + rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, + flags, &eager_buf->pkt); if (rxm_use_direct_send(rxm_ep, count, flags)) { - rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, - flags, &eager_buf->pkt); ret = rxm_direct_send(rxm_ep, rxm_conn, eager_buf, iov, desc, count); } else { - rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, - flags, &eager_buf->pkt); ret = rxm_copy_from_hmem_iov(desc, eager_buf->pkt.data, eager_buf->pkt.hdr.size, iov, count, 0); @@ -891,6 +693,19 @@ struct fi_ops_msg rxm_msg_ops = { .injectdata = rxm_injectdata, }; +struct fi_ops_msg rxm_no_recv_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = rxm_send, + .sendv = 
rxm_sendv, + .sendmsg = rxm_sendmsg, + .inject = rxm_inject, + .senddata = rxm_senddata, + .injectdata = rxm_injectdata, +}; + static ssize_t rxm_recv_thru(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) diff --git a/prov/rxm/src/rxm_tagged.c b/prov/rxm/src/rxm_tagged.c index 78e3d3ff0e9..1276bac0ba3 100644 --- a/prov/rxm/src/rxm_tagged.c +++ b/prov/rxm/src/rxm_tagged.c @@ -43,188 +43,21 @@ #include "rxm.h" -static void -rxm_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf, - void *context) -{ - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message", - rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag); - - rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - 0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); - rxm_free_rx_buf(rx_buf); -} - -static void -rxm_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, - uint64_t ignore, void *context, uint64_t flags, - struct rxm_recv_queue *recv_queue) -{ - struct rxm_rx_buf *rx_buf; - int ret; - - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Peeking message", addr, tag); - - /* peek doesn't support peer transfer at this moment */ - assert(!(flags & FI_PEER_TRANSFER)); - - rxm_ep_do_progress(&rxm_ep->util_ep); - - rx_buf = rxm_get_unexp_msg(recv_queue, addr, tag, ignore); - if (!rx_buf) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message not found\n"); - ret = ofi_cq_write_error_peek(rxm_ep->util_ep.rx_cq, tag, - context); - if (ret) - FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); - return; - } - - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message found\n"); - - if (flags & FI_DISCARD) { - dlist_remove(&rx_buf->unexp_msg.entry); - rxm_discard_recv(rxm_ep, rx_buf, context); - return; - } - - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Marking message for Claim\n"); - ((struct fi_context *)context)->internal[0] = rx_buf; - dlist_remove(&rx_buf->unexp_msg.entry); - } - - rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - rx_buf->pkt.hdr.size, NULL, - rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); -} - -static ssize_t -rxm_post_trecv(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - - assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags, - &rxm_ep->trecv_queue); - if (!recv_entry) - return -FI_EAGAIN; - - rx_buf = rxm_get_unexp_msg(&rxm_ep->trecv_queue, recv_entry->addr, - recv_entry->tag, recv_entry->ignore); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &rxm_ep->trecv_queue.recv_list); - return FI_SUCCESS; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - - if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) - return rxm_handle_rx_buf(rx_buf); - else - return rxm_handle_unexp_sar(&rxm_ep->trecv_queue, recv_entry, - rx_buf); -} - -static ssize_t -rxm_trecv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t op_flags) -{ - ssize_t ret; - - if (op_flags & FI_PEER_TRANSFER) - tag |= RXM_PEER_XFER_TAG_FLAG; - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - ret = rxm_post_trecv(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags); - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - static 
ssize_t rxm_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, uint64_t flags) { - struct rxm_ep *rxm_ep; - struct rxm_recv_entry *recv_entry; - struct fi_recv_context *recv_ctx; - struct rxm_rx_buf *rx_buf; - void *context = msg->context; - ssize_t ret = 0; - - rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - flags |= rxm_ep->util_ep.rx_msg_flags; - - if (!(flags & (FI_CLAIM | FI_PEEK)) && - !(rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV)) { - return rxm_trecv_common(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, - msg->tag, msg->ignore, context, flags); - } - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) { - recv_ctx = msg->context; - context = recv_ctx->context; - rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); - - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Claiming buffered receive\n"); - goto claim; - } - - assert(flags & FI_DISCARD); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Discarding buffered receive\n"); - rxm_free_rx_buf(rx_buf); - goto unlock; - } - - if (flags & FI_PEEK) { - rxm_peek_recv(rxm_ep, msg->addr, msg->tag, msg->ignore, - context, flags, &rxm_ep->trecv_queue); - goto unlock; - } - - rx_buf = ((struct fi_context *) context)->internal[0]; - assert(rx_buf); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Claim message\n"); - - if (flags & FI_DISCARD) { - rxm_discard_recv(rxm_ep, rx_buf, context); - goto unlock; - } - -claim: - assert (flags & FI_CLAIM); - recv_entry = rxm_recv_entry_get(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, - msg->tag, msg->ignore, context, flags, - &rxm_ep->trecv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto unlock; - } - - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) - recv_entry->comp_flags |= FI_CLAIM; + uint64_t tag = msg->tag; + struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, + util_ep.ep_fid.fid); - rx_buf->recv_entry = recv_entry; - ret = rxm_handle_rx_buf(rx_buf); + if (flags & FI_PEER_TRANSFER) + tag |= RXM_PEER_XFER_TAG_FLAG; -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, msg->msg_iov, + msg->desc, msg->iov_count, msg->addr, + msg->context, tag, msg->ignore, + flags | rxm_ep->util_ep.rx_msg_flags); } static ssize_t @@ -239,8 +72,9 @@ rxm_trecv(struct fid_ep *ep_fid, void *buf, size_t len, }; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_trecv_common(rxm_ep, &iov, &desc, 1, src_addr, tag, ignore, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, tag, ignore, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -251,8 +85,9 @@ rxm_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, struct rxm_ep *rxm_ep; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_trecv_common(rxm_ep, iov, desc, count, src_addr, tag, - ignore, context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, iov, desc, count, + src_addr, context, tag, ignore, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -371,7 +206,7 @@ rxm_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len, ret = rxm_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data, rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA, - tag, ofi_op_tagged); + tag, ofi_op_tagged); unlock: ofi_genlock_unlock(&rxm_ep->util_ep.lock); return ret; @@ -415,6 +250,18 @@ struct fi_ops_tagged 
rxm_tagged_ops = { .injectdata = rxm_tinjectdata, }; +struct fi_ops_tagged rxm_no_recv_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = rxm_tsend, + .sendv = rxm_tsendv, + .sendmsg = rxm_tsendmsg, + .inject = rxm_tinject, + .senddata = rxm_tsenddata, + .injectdata = rxm_tinjectdata, +}; static ssize_t rxm_trecv_thru(struct fid_ep *ep_fid, void *buf, size_t len, diff --git a/prov/shm/src/smr.h b/prov/shm/src/smr.h index d71f3903f12..d4287fdba88 100644 --- a/prov/shm/src/smr.h +++ b/prov/shm/src/smr.h @@ -159,6 +159,7 @@ struct smr_domain { int fast_rma; /* cache for use with hmem ipc */ struct ofi_mr_cache *ipc_cache; + struct fid_ep rx_ep; struct fid_peer_srx *srx; }; @@ -220,7 +221,7 @@ struct smr_ep { const char *name; uint64_t msg_id; struct smr_region *volatile region; - struct fid_ep *srx; + struct fid_peer_srx *srx; struct ofi_bufpool *cmd_ctx_pool; struct ofi_bufpool *unexp_buf_pool; struct ofi_bufpool *pend_buf_pool; @@ -228,6 +229,7 @@ struct smr_ep { struct smr_tx_fs *tx_fs; struct dlist_entry sar_list; struct dlist_entry ipc_cpy_pend_list; + size_t min_multi_recv_size; int ep_idx; enum ofi_shm_p2p_type p2p_type; @@ -236,11 +238,6 @@ struct smr_ep { void (*smr_progress_ipc_list)(struct smr_ep *ep); }; -static inline struct fid_peer_srx *smr_get_peer_srx(struct smr_ep *ep) -{ - return container_of(ep->srx, struct fid_peer_srx, ep_fid); -} - #define smr_ep_rx_flags(smr_ep) ((smr_ep)->util_ep.rx_op_flags) #define smr_ep_tx_flags(smr_ep) ((smr_ep)->util_ep.tx_op_flags) @@ -251,9 +248,6 @@ static inline int smr_mmap_name(char *shm_name, const char *ep_name, ep_name, msg_id); } -int smr_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, - struct fid_ep **rx_ep, void *context); - int smr_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id); @@ -297,10 +291,8 @@ int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, uint64_t flags, size_t len, void *buf, int64_t id, uint64_t tag, uint64_t data); -static inline uint64_t smr_rx_cq_flags(uint32_t op, uint64_t rx_flags, - uint16_t op_flags) +static inline uint64_t smr_rx_cq_flags(uint64_t rx_flags, uint16_t op_flags) { - rx_flags |= ofi_rx_cq_flags(op); if (op_flags & SMR_REMOTE_CQ_DATA) rx_flags |= FI_REMOTE_CQ_DATA; return rx_flags; diff --git a/prov/shm/src/smr_attr.c b/prov/shm/src/smr_attr.c index 7b300f118bb..c1e987ec50d 100644 --- a/prov/shm/src/smr_attr.c +++ b/prov/shm/src/smr_attr.c @@ -149,8 +149,7 @@ struct fi_info smr_hmem_info = { }; struct fi_info smr_info = { - .caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_MULTI_RECV | FI_LOCAL_COMM, - SMR_DOMAIN_CAPS, + .caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_MULTI_RECV | SMR_DOMAIN_CAPS, .addr_format = FI_ADDR_STR, .tx_attr = &smr_tx_attr, .rx_attr = &smr_rx_attr, diff --git a/prov/shm/src/smr_av.c b/prov/shm/src/smr_av.c index 355d3bcad64..61e4344bde5 100644 --- a/prov/shm/src/smr_av.c +++ b/prov/shm/src/smr_av.c @@ -69,9 +69,12 @@ static void smr_map_cleanup(struct smr_map *map) { int64_t i; - for (i = 0; i < SMR_MAX_PEERS; i++) - smr_map_del(map, i); + for (i = 0; i < SMR_MAX_PEERS; i++) { + if (map->peers[i].peer.id < 0) + continue; + smr_map_del(map, i); + } ofi_rbmap_cleanup(&map->rbmap); } @@ -113,7 +116,6 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, struct util_ep *util_ep; struct smr_av *smr_av; struct smr_ep *smr_ep; - 
struct fid_peer_srx *srx; struct dlist_entry *av_entry; fi_addr_t util_addr; int64_t shm_id = -1; @@ -173,8 +175,8 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, smr_ep = container_of(util_ep, struct smr_ep, util_ep); smr_ep->region->max_sar_buf_per_peer = SMR_MAX_PEERS / smr_av->smr_map.num_peers; - srx = smr_get_peer_srx(smr_ep); - srx->owner_ops->foreach_unspec_addr(srx, &smr_get_addr); + smr_ep->srx->owner_ops->foreach_unspec_addr(smr_ep->srx, + &smr_get_addr); } } @@ -211,7 +213,6 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count dlist_foreach(&util_av->ep_list, av_entry) { util_ep = container_of(av_entry, struct util_ep, av_entry); smr_ep = container_of(util_ep, struct smr_ep, util_ep); - smr_unmap_from_endpoint(smr_ep->region, id); if (smr_av->smr_map.num_peers > 0) smr_ep->region->max_sar_buf_per_peer = SMR_MAX_PEERS / diff --git a/prov/shm/src/smr_domain.c b/prov/shm/src/smr_domain.c index 188cea31410..909298fbc2d 100644 --- a/prov/shm/src/smr_domain.c +++ b/prov/shm/src/smr_domain.c @@ -35,6 +35,78 @@ #include "smr.h" +extern struct fi_ops_srx_peer smr_srx_peer_ops; + +static int smr_srx_close(struct fid *fid) +{ + struct smr_domain *domain = container_of(fid, struct smr_domain, + rx_ep.fid); + + ofi_atomic_dec32(&domain->util_domain.ref); + + return FI_SUCCESS; +} + +static struct fi_ops smr_srx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = smr_srx_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_msg smr_srx_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +static struct fi_ops_tagged smr_srx_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + .inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +static int smr_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + struct smr_domain *smr_domain; + + smr_domain = container_of(domain, struct smr_domain, + util_domain.domain_fid); + + if (attr->op_flags & FI_PEER) { + smr_domain->srx = ((struct fi_peer_srx_context *) + (context))->srx; + smr_domain->srx->peer_ops = &smr_srx_peer_ops; + smr_domain->rx_ep.msg = &smr_srx_msg_ops; + smr_domain->rx_ep.tagged = &smr_srx_tagged_ops; + smr_domain->rx_ep.fid.ops = &smr_srx_fi_ops; + smr_domain->rx_ep.fid.fclass = FI_CLASS_SRX_CTX; + *rx_ep = &smr_domain->rx_ep; + ofi_atomic_inc32(&smr_domain->util_domain.ref); + return FI_SUCCESS; + } + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "shared srx only supported with FI_PEER flag\n"); + return -FI_EINVAL; +} + + static struct fi_ops_domain smr_domain_ops = { .size = sizeof(struct fi_ops_domain), .av_open = smr_av_open, diff --git a/prov/shm/src/smr_ep.c b/prov/shm/src/smr_ep.c index 8803495e382..82bc95200b1 100644 --- a/prov/shm/src/smr_ep.c +++ b/prov/shm/src/smr_ep.c @@ -119,8 +119,8 @@ int smr_ep_getopt(fid_t fid, int level, int optname, void *optval, struct smr_ep *smr_ep = container_of(fid, struct smr_ep, util_ep.ep_fid); -
return smr_ep->srx->ops->getopt(&smr_ep->srx->fid, level, optname, - optval, optlen); + return smr_ep->srx->ep_fid.ops->getopt(&smr_ep->srx->ep_fid.fid, level, + optname, optval, optlen); } int smr_ep_setopt(fid_t fid, int level, int optname, const void *optval, @@ -128,14 +128,12 @@ int smr_ep_setopt(fid_t fid, int level, int optname, const void *optval, { struct smr_ep *smr_ep = container_of(fid, struct smr_ep, util_ep.ep_fid); - struct util_srx_ctx *srx; if (level != FI_OPT_ENDPOINT) return -FI_ENOPROTOOPT; if (optname == FI_OPT_MIN_MULTI_RECV) { - srx = util_get_peer_srx(smr_ep->srx)->ep_fid.fid.context; - srx->min_multi_recv_size = *(size_t *)optval; + smr_ep->min_multi_recv_size = *(size_t *)optval; return FI_SUCCESS; } @@ -159,7 +157,7 @@ static ssize_t smr_ep_cancel(fid_t ep_fid, void *context) struct smr_ep *ep; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid); - return ep->srx->ops->cancel(&ep->srx->fid, context); + return ep->srx->ep_fid.ops->cancel(&ep->srx->ep_fid.fid, context); } static struct fi_ops_ep smr_ep_ops = { @@ -223,7 +221,9 @@ int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr) return id; if (!ep->region->map->peers[id].region) { + ofi_spin_lock(&ep->region->map->lock); ret = smr_map_to_region(&smr_prov, ep->region->map, id); + ofi_spin_unlock(&ep->region->map->lock); if (ret) return -1; } @@ -808,9 +808,7 @@ static int smr_ep_close(struct fid *fid) if (ep->srx) { /* shm is an owner provider */ if (ep->util_ep.ep_fid.msg != &smr_no_recv_msg_ops) - (void) util_srx_close(&ep->srx->fid); - else /* shm is a peer provider */ - free(ep->srx); + (void) util_srx_close(&ep->srx->ep_fid.fid); } ofi_endpoint_close(&ep->util_ep); @@ -1062,30 +1060,11 @@ static void smr_update(struct util_srx_ctx *srx, struct util_rx_entry *rx_entry) //by another provider } -int smr_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, - struct fid_ep **rx_ep, void *context) -{ - struct smr_domain *smr_domain; - - smr_domain = container_of(domain, struct smr_domain, - util_domain.domain_fid); - - if (attr->op_flags & FI_PEER) { - smr_domain->srx = ((struct fi_peer_srx_context *) - (context))->srx; - return FI_SUCCESS; - } - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "shared srx only supported with FI_PEER flag\n"); - return -FI_EINVAL; -} - static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) { struct smr_ep *ep; struct util_av *av; int ret = 0; - struct fid_peer_srx *srx, *srx_b; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); switch (bfid->fclass) { @@ -1109,16 +1088,10 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) struct util_cntr, cntr_fid.fid), flags); break; case FI_CLASS_SRX_CTX: - srx = calloc(1, sizeof(*srx)); - srx_b = container_of(bfid, struct fid_peer_srx, ep_fid.fid); - srx->peer_ops = &smr_srx_peer_ops; - srx->owner_ops = srx_b->owner_ops; - srx->ep_fid.fid.context = srx_b->ep_fid.fid.context; - ep->srx = &srx->ep_fid; + ep->srx = (container_of(bfid, struct smr_domain, rx_ep.fid))->srx; break; default: - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "invalid fid class\n"); + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "invalid fid class\n"); ret = -FI_EINVAL; break; } @@ -1131,6 +1104,7 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) struct smr_domain *domain; struct smr_ep *ep; struct smr_av *av; + struct fid_ep *srx; int ret; ep = container_of(fid, struct smr_ep, util_ep.ep_fid.fid); @@ -1170,16 +1144,18 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) 
util_domain.domain_fid); ret = util_ep_srx_context(&domain->util_domain, ep->rx_size, SMR_IOV_LIMIT, - SMR_INJECT_SIZE, &smr_update, - &ep->util_ep.lock, &ep->srx); + ep->min_multi_recv_size, &smr_update, + &ep->util_ep.lock, &srx); if (ret) return ret; - util_get_peer_srx(ep->srx)->peer_ops = - &smr_srx_peer_ops; - ret = util_srx_bind(&ep->srx->fid, - &ep->util_ep.rx_cq->cq_fid.fid, - FI_RECV); + ep->srx = container_of(srx, struct fid_peer_srx, + ep_fid.fid); + ep->srx->peer_ops = &smr_srx_peer_ops; + + ret = util_srx_bind(&ep->srx->ep_fid.fid, + &ep->util_ep.rx_cq->cq_fid.fid, + FI_RECV); if (ret) return ret; } else { @@ -1330,6 +1306,8 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info, dlist_init(&ep->sar_list); dlist_init(&ep->ipc_cpy_pend_list); + ep->min_multi_recv_size = SMR_INJECT_SIZE; + ep->util_ep.ep_fid.fid.ops = &smr_ep_fi_ops; ep->util_ep.ep_fid.ops = &smr_ep_ops; ep->util_ep.ep_fid.cm = &smr_cm_ops; diff --git a/prov/shm/src/smr_msg.c b/prov/shm/src/smr_msg.c index 641645ea5d3..0a34ae637ff 100644 --- a/prov/shm/src/smr_msg.c +++ b/prov/shm/src/smr_msg.c @@ -45,7 +45,7 @@ static ssize_t smr_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_recv(ep->srx, msg->msg_iov, msg->desc, + return util_srx_generic_recv(&ep->srx->ep_fid, msg->msg_iov, msg->desc, msg->iov_count, msg->addr, msg->context, flags | ep->util_ep.rx_msg_flags); } @@ -58,8 +58,8 @@ static ssize_t smr_recvv(struct fid_ep *ep_fid, const struct iovec *iov, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_recv(ep->srx, iov, desc, count, src_addr, - context, smr_ep_rx_flags(ep)); + return util_srx_generic_recv(&ep->srx->ep_fid, iov, desc, count, + src_addr, context, smr_ep_rx_flags(ep)); } static ssize_t smr_recv(struct fid_ep *ep_fid, void *buf, size_t len, @@ -73,8 +73,8 @@ static ssize_t smr_recv(struct fid_ep *ep_fid, void *buf, size_t len, iov.iov_base = buf; iov.iov_len = len; - return util_srx_generic_recv(ep->srx, &iov, &desc, 1, src_addr, context, - smr_ep_rx_flags(ep)); + return util_srx_generic_recv(&ep->srx->ep_fid, &iov, &desc, 1, src_addr, + context, smr_ep_rx_flags(ep)); } static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov, @@ -293,8 +293,9 @@ static ssize_t smr_trecv(struct fid_ep *ep_fid, void *buf, size_t len, iov.iov_base = buf; iov.iov_len = len; - return util_srx_generic_trecv(ep->srx, &iov, &desc, 1, src_addr, context, - tag, ignore, smr_ep_rx_flags(ep)); + return util_srx_generic_trecv(&ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, tag, ignore, + smr_ep_rx_flags(ep)); } static ssize_t smr_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, @@ -305,8 +306,9 @@ static ssize_t smr_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_trecv(ep->srx, iov, desc, count, src_addr, - context, tag, ignore, smr_ep_rx_flags(ep)); + return util_srx_generic_trecv(&ep->srx->ep_fid, iov, desc, count, + src_addr, context, tag, ignore, + smr_ep_rx_flags(ep)); } static ssize_t smr_trecvmsg(struct fid_ep *ep_fid, @@ -316,10 +318,10 @@ static ssize_t smr_trecvmsg(struct fid_ep *ep_fid, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_trecv(ep->srx, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, - msg->tag, msg->ignore, - flags | ep->util_ep.rx_msg_flags); + return 
util_srx_generic_trecv(&ep->srx->ep_fid, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, msg->context, + msg->tag, msg->ignore, + flags | ep->util_ep.rx_msg_flags); } static ssize_t smr_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, diff --git a/prov/shm/src/smr_progress.c b/prov/shm/src/smr_progress.c index 141826b9bba..3932e404c15 100644 --- a/prov/shm/src/smr_progress.c +++ b/prov/shm/src/smr_progress.c @@ -762,8 +762,8 @@ static int smr_start_common(struct smr_ep *ep, struct smr_cmd *cmd, if (!pend) { comp_buf = rx_entry->iov[0].iov_base; - comp_flags = smr_rx_cq_flags(cmd->msg.hdr.op, rx_entry->flags, - cmd->msg.hdr.op_flags); + comp_flags = smr_rx_cq_flags(rx_entry->flags, + cmd->msg.hdr.op_flags); if (err) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing op\n"); @@ -781,7 +781,7 @@ static int smr_start_common(struct smr_ep *ep, struct smr_cmd *cmd, FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "unable to process rx completion\n"); } - smr_get_peer_srx(ep)->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); } return 0; @@ -822,8 +822,8 @@ static int smr_copy_saved(struct smr_cmd_ctx *cmd_ctx, } assert(!cmd_ctx->sar_entry); - comp_flags = smr_rx_cq_flags(cmd_ctx->cmd.msg.hdr.op, - rx_entry->flags, cmd_ctx->cmd.msg.hdr.op_flags); + comp_flags = smr_rx_cq_flags(rx_entry->flags, + cmd_ctx->cmd.msg.hdr.op_flags); ret = smr_complete_rx(cmd_ctx->ep, rx_entry->context, cmd_ctx->cmd.msg.hdr.op, comp_flags, @@ -836,7 +836,7 @@ static int smr_copy_saved(struct smr_cmd_ctx *cmd_ctx, "unable to process rx completion\n"); return ret; } - smr_get_peer_srx(cmd_ctx->ep)->owner_ops->free_entry(rx_entry); + cmd_ctx->ep->srx->owner_ops->free_entry(rx_entry); return FI_SUCCESS; } @@ -878,7 +878,9 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd) peer_smr = smr_peer_region(ep->region, idx); if (!peer_smr) { + ofi_spin_lock(&ep->region->map->lock); ret = smr_map_to_region(&smr_prov, ep->region->map, idx); + ofi_spin_unlock(&ep->region->map->lock); if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "Could not map peer region\n"); @@ -891,14 +893,11 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd) if (peer_smr->pid != (int) cmd->msg.hdr.data) { /* TODO track and update/complete in error any transfers * to or from old mapping - * - * TODO create smr_unmap_region - * this needs to close peer_smr->map->peers[idx].pid_fd - * This case will also return an unmapped region because the idx - * is valid but the region was unmapped */ - munmap(peer_smr, peer_smr->total_size); + ofi_spin_lock(&ep->region->map->lock); + smr_unmap_region(&smr_prov, ep->region->map, idx, false); smr_map_to_region(&smr_prov, ep->region->map, idx); + ofi_spin_unlock(&ep->region->map->lock); peer_smr = smr_peer_region(ep->region, idx); } @@ -983,7 +982,6 @@ static int smr_alloc_cmd_ctx(struct smr_ep *ep, static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) { - struct fid_peer_srx *peer_srx = smr_get_peer_srx(ep); struct fi_peer_match_attr attr; struct fi_peer_rx_entry *rx_entry; int ret; @@ -992,33 +990,33 @@ static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) attr.msg_size = cmd->msg.hdr.size; attr.tag = cmd->msg.hdr.tag; if (cmd->msg.hdr.op == ofi_op_tagged) { - ret = peer_srx->owner_ops->get_tag(peer_srx, &attr, &rx_entry); + ret = ep->srx->owner_ops->get_tag(ep->srx, &attr, &rx_entry); if (ret == -FI_ENOENT) { ret = smr_alloc_cmd_ctx(ep, rx_entry, cmd); if (ret) { - peer_srx->owner_ops->free_entry(rx_entry); + 
ep->srx->owner_ops->free_entry(rx_entry); return ret; } - ret = peer_srx->owner_ops->queue_tag(rx_entry); + ret = ep->srx->owner_ops->queue_tag(rx_entry); if (ret) { - peer_srx->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); return ret; } goto out; } } else { - ret = peer_srx->owner_ops->get_msg(peer_srx, &attr, &rx_entry); + ret = ep->srx->owner_ops->get_msg(ep->srx, &attr, &rx_entry); if (ret == -FI_ENOENT) { ret = smr_alloc_cmd_ctx(ep, rx_entry, cmd); if (ret) { - peer_srx->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); return ret; } - ret = peer_srx->owner_ops->queue_msg(rx_entry); + ret = ep->srx->owner_ops->queue_msg(rx_entry); if (ret) { - peer_srx->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); return ret; } goto out; @@ -1108,14 +1106,14 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd, FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing rma op\n"); ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, - smr_rx_cq_flags(cmd->msg.hdr.op, 0, - cmd->msg.hdr.op_flags), 0, -err); + smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), + 0, -err); } else { ret = smr_complete_rx(ep, (void *) cmd->msg.hdr.msg_id, - cmd->msg.hdr.op, smr_rx_cq_flags(cmd->msg.hdr.op, - 0, cmd->msg.hdr.op_flags), total_len, - iov_count ? iov[0].iov_base : NULL, - cmd->msg.hdr.id, 0, cmd->msg.hdr.data); + cmd->msg.hdr.op, smr_rx_cq_flags(0, + cmd->msg.hdr.op_flags), total_len, + iov_count ? iov[0].iov_base : NULL, + cmd->msg.hdr.id, 0, cmd->msg.hdr.data); } if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -1193,13 +1191,12 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd, FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing atomic op\n"); ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, - smr_rx_cq_flags(cmd->msg.hdr.op, 0, - cmd->msg.hdr.op_flags), 0, err); + smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), + 0, err); } else { ret = smr_complete_rx(ep, NULL, cmd->msg.hdr.op, - smr_rx_cq_flags(cmd->msg.hdr.op, 0, - cmd->msg.hdr.op_flags), total_len, - ioc_count ? ioc[0].addr : NULL, + smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), + total_len, ioc_count ? 
ioc[0].addr : NULL, cmd->msg.hdr.id, 0, cmd->msg.hdr.data); } if (ret) { @@ -1306,13 +1303,11 @@ void smr_progress_ipc_list(struct smr_ep *ep) if (ipc_entry->rx_entry) { context = ipc_entry->rx_entry->context; - flags = smr_rx_cq_flags(ipc_entry->cmd.msg.hdr.op, - ipc_entry->rx_entry->flags, + flags = smr_rx_cq_flags(ipc_entry->rx_entry->flags, ipc_entry->cmd.msg.hdr.op_flags); } else { context = NULL; - flags = smr_rx_cq_flags(ipc_entry->cmd.msg.hdr.op, - 0, ipc_entry->cmd.msg.hdr.op_flags); + flags = smr_rx_cq_flags(0, ipc_entry->cmd.msg.hdr.op_flags); } ret = smr_complete_rx(ep, context, ipc_entry->cmd.msg.hdr.op, @@ -1338,7 +1333,7 @@ void smr_progress_ipc_list(struct smr_ep *ep) ipc_entry->async_event); dlist_remove(&ipc_entry->entry); if (ipc_entry->rx_entry) - smr_get_peer_srx(ep)->owner_ops->free_entry(ipc_entry->rx_entry); + ep->srx->owner_ops->free_entry(ipc_entry->rx_entry); ofi_buf_free(ipc_entry); } } @@ -1424,13 +1419,13 @@ static void smr_progress_sar_list(struct smr_ep *ep) if (sar_entry->rx_entry) { comp_ctx = sar_entry->rx_entry->context; - comp_flags = smr_rx_cq_flags(sar_entry->cmd.msg.hdr.op, + comp_flags = smr_rx_cq_flags( sar_entry->rx_entry->flags, sar_entry->cmd.msg.hdr.op_flags); } else { comp_ctx = NULL; - comp_flags = smr_rx_cq_flags(sar_entry->cmd.msg.hdr.op, - 0, sar_entry->cmd.msg.hdr.op_flags); + comp_flags = smr_rx_cq_flags(0, + sar_entry->cmd.msg.hdr.op_flags); } ret = smr_complete_rx(ep, comp_ctx, sar_entry->cmd.msg.hdr.op, comp_flags, @@ -1444,7 +1439,7 @@ static void smr_progress_sar_list(struct smr_ep *ep) "unable to process rx completion\n"); } if (sar_entry->rx_entry) - smr_get_peer_srx(ep)->owner_ops->free_entry(sar_entry->rx_entry); + ep->srx->owner_ops->free_entry(sar_entry->rx_entry); dlist_remove(&sar_entry->entry); ofi_buf_free(sar_entry); diff --git a/prov/shm/src/smr_util.c b/prov/shm/src/smr_util.c index 2924ddaa6f2..0372c7e0597 100644 --- a/prov/shm/src/smr_util.c +++ b/prov/shm/src/smr_util.c @@ -367,16 +367,15 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, } pthread_mutex_unlock(&ep_list_lock); - ofi_spin_lock(&map->lock); if (peer_buf->region) - goto unlock; + return FI_SUCCESS; + assert(ofi_spin_held(&map->lock)); fd = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); if (fd < 0) { - ret = -errno; FI_WARN_ONCE(prov, FI_LOG_AV, "shm_open error: name %s errno %d\n", name, errno); - goto unlock; + return -errno; } memset(tmp, 0, sizeof(tmp)); @@ -437,8 +436,6 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, out: close(fd); -unlock: - ofi_spin_unlock(&map->lock); return ret; } @@ -448,6 +445,7 @@ void smr_map_to_endpoint(struct smr_region *region, int64_t id) struct smr_region *peer_smr; struct smr_peer_data *local_peers; + assert(ofi_spin_held(®ion->map->lock)); peer_smr = smr_peer_region(region, id); if (region->map->peers[id].peer.id < 0 || !peer_smr) return; @@ -479,32 +477,81 @@ void smr_map_to_endpoint(struct smr_region *region, int64_t id) return; } +void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map, + int64_t peer_id, bool local) +{ + struct smr_region *peer_region; + struct smr_peer *peer; + struct util_ep *util_ep; + struct smr_ep *smr_ep; + struct smr_av *av; + int ret = 0; + + assert(ofi_spin_held(&map->lock)); + peer_region = map->peers[peer_id].region; + if (!peer_region) + return; + + peer = &map->peers[peer_id]; + av = container_of(map, struct smr_av, smr_map); + dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep, + av_entry) { + 
smr_ep = container_of(util_ep, struct smr_ep, util_ep); + smr_unmap_from_endpoint(smr_ep->region, peer_id); + } + + /* Don't unmap memory owned by this pid because the endpoint it belongs + * to might still be active. + */ + if (local) + return; + + if (map->flags & SMR_FLAG_HMEM_ENABLED) { + ret = ofi_hmem_host_unregister(peer_region); + if (ret) + FI_WARN(prov, FI_LOG_EP_CTRL, + "unable to unregister shm with iface\n"); + + if (peer->pid_fd != -1) { + close(peer->pid_fd); + peer->pid_fd = -1; + } + } + + munmap(peer_region, peer_region->total_size); + peer->region = NULL; +} + void smr_unmap_from_endpoint(struct smr_region *region, int64_t id) { struct smr_region *peer_smr; struct smr_peer_data *local_peers, *peer_peers; int64_t peer_id; - local_peers = smr_peer_data(region); if (region->map->peers[id].peer.id < 0) return; peer_smr = smr_peer_region(region, id); - peer_id = smr_peer_data(region)[id].addr.id; - + assert(peer_smr); peer_peers = smr_peer_data(peer_smr); + peer_id = smr_peer_data(region)[id].addr.id; peer_peers[peer_id].addr.id = -1; peer_peers[peer_id].name_sent = 0; + local_peers = smr_peer_data(region); ofi_xpmem_release(&local_peers[peer_id].xpmem); } void smr_exchange_all_peers(struct smr_region *region) { int64_t i; + + ofi_spin_lock(&region->map->lock); for (i = 0; i < SMR_MAX_PEERS; i++) smr_map_to_endpoint(region, i); + + ofi_spin_unlock(&region->map->lock); } int smr_map_add(const struct fi_provider *prov, struct smr_map *map, @@ -546,37 +593,24 @@ int smr_map_add(const struct fi_provider *prov, struct smr_map *map, void smr_map_del(struct smr_map *map, int64_t id) { - struct dlist_entry *entry; + struct smr_ep_name *name; + bool local = false; assert(id >= 0 && id < SMR_MAX_PEERS); - pthread_mutex_lock(&ep_list_lock); - entry = dlist_find_first_match(&ep_name_list, smr_match_name, - smr_no_prefix(map->peers[id].peer.name)); + dlist_foreach_container(&ep_name_list, struct smr_ep_name, name, entry) { + if (!strcmp(name->name, map->peers[id].peer.name)) { + local = true; + break; + } + } pthread_mutex_unlock(&ep_list_lock); - ofi_spin_lock(&map->lock); (void) ofi_rbmap_find_delete(&map->rbmap, - (void *) map->peers[id].peer.name); - + smr_unmap_region(&smr_prov, map, id, local); map->peers[id].fiaddr = FI_ADDR_NOTAVAIL; map->peers[id].peer.id = -1; map->num_peers--; - - if (!map->peers[id].region) - goto unlock; - - if (!entry) { - if (map->flags & SMR_FLAG_HMEM_ENABLED) { - if (map->peers[id].pid_fd != -1) - close(map->peers[id].pid_fd); - - (void) ofi_hmem_host_unregister(map->peers[id].region); - } - munmap(map->peers[id].region, map->peers[id].region->total_size); - map->peers[id].region = NULL; - } -unlock: + ofi_rbmap_find_delete(&map->rbmap, map->peers[id].peer.name); ofi_spin_unlock(&map->lock); } diff --git a/prov/shm/src/smr_util.h b/prov/shm/src/smr_util.h index c5bf8124873..7ed4e1e426f 100644 --- a/prov/shm/src/smr_util.h +++ b/prov/shm/src/smr_util.h @@ -356,6 +356,8 @@ void smr_cleanup(void); int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, int64_t id); void smr_map_to_endpoint(struct smr_region *region, int64_t id); +void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map, + int64_t id, bool local); void smr_unmap_from_endpoint(struct smr_region *region, int64_t id); void smr_exchange_all_peers(struct smr_region *region); int smr_map_add(const struct fi_provider *prov, struct smr_map *map,
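/*
 * Locking sketch (illustrative, not part of this patch): both
 * smr_map_to_region() and smr_unmap_region() now assert that the caller
 * holds map->lock, so remapping a stale peer follows the pattern used in
 * smr_progress_connreq() above:
 *
 *	ofi_spin_lock(&map->lock);
 *	smr_unmap_region(&smr_prov, map, id, false);
 *	ret = smr_map_to_region(&smr_prov, map, id);
 *	ofi_spin_unlock(&map->lock);
 */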
diff --git a/prov/sockets/src/sock_av.c b/prov/sockets/src/sock_av.c index 6e799a2a1d8..71003e2d8d6 100644 --- a/prov/sockets/src/sock_av.c +++ b/prov/sockets/src/sock_av.c @@ -123,8 +123,11 @@ static inline void sock_av_report_success(struct sock_av *av, void *context, eq_entry.fid = &av->av_fid.fid; eq_entry.context = context; eq_entry.data = num_done; - sock_eq_report_event(av->eq, FI_AV_COMPLETE, - &eq_entry, sizeof(eq_entry), flags); + if (sock_eq_report_event(av->eq, FI_AV_COMPLETE, + &eq_entry, sizeof(eq_entry), flags)) + SOCK_LOG_ERROR("Error in writing to EQ\n"); + + } static void sock_av_report_error(struct sock_av *av, fi_addr_t *fi_addr, diff --git a/prov/tcp/src/xnet_av.c b/prov/tcp/src/xnet_av.c index 7cf77604a58..80b18f2a568 100644 --- a/prov/tcp/src/xnet_av.c +++ b/prov/tcp/src/xnet_av.c @@ -38,7 +38,7 @@ int xnet_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context) { return rxm_util_av_open(domain_fid, attr, fid_av, context, - sizeof(struct xnet_conn), NULL); + sizeof(struct xnet_conn), NULL, NULL); } static int xnet_mplex_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, @@ -69,23 +69,27 @@ static int xnet_mplex_av_insert(struct fid_av *av_fid, const void *addr, size_t int ret; struct fid_list_entry *item; struct fid_av *subav_fid; - fi_addr_t sub_fi_addr; + fi_addr_t *sub_fi_addr; struct xnet_mplex_av *av = container_of(av_fid, struct xnet_mplex_av, - util_av.av_fid.fid); - + util_av.av_fid); + sub_fi_addr = calloc(count, sizeof(fi_addr_t)); + if (!sub_fi_addr) + return -FI_ENOMEM; ofi_genlock_lock(&av->lock); ret = ofi_ip_av_insert(&av->util_av.av_fid, addr, count, fi_addr, flags, context); if (ret < count) goto out; - dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { + dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { subav_fid = container_of(item->fid, struct fid_av, fid); - ret = fi_av_insert(subav_fid, addr, count, &sub_fi_addr, flags, context); + ret = fi_av_insert(subav_fid, addr, count, sub_fi_addr, flags, context); if (ret < count) break; - assert(*fi_addr == sub_fi_addr); + assert(!fi_addr || memcmp(fi_addr, sub_fi_addr, + sizeof(fi_addr_t) * count) == 0); } out: ofi_genlock_unlock(&av->lock); + free(sub_fi_addr); return ret; } @@ -97,26 +101,29 @@ static int xnet_mplex_av_insertsym(struct fid_av *av_fid, const char *node, int ret; struct fid_list_entry *item; struct fid_av *subav_fid; - fi_addr_t sub_fi_addr; + fi_addr_t *sub_fi_addr; struct xnet_mplex_av *av = container_of(av_fid, struct xnet_mplex_av, util_av.av_fid.fid); - + sub_fi_addr = calloc(nodecnt * svccnt, sizeof(fi_addr_t)); + if (!sub_fi_addr) + return -FI_ENOMEM; ofi_genlock_lock(&av->lock); ret = ofi_ip_av_insertsym(&av->util_av.av_fid, node, nodecnt, service, svccnt, fi_addr, flags, context); - if (ret) + if (ret < nodecnt * svccnt) goto out; dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { subav_fid = container_of(item->fid, struct fid_av, fid); ret = fi_av_insertsym(subav_fid, node, nodecnt, service, svccnt, - &sub_fi_addr, flags, context); - if (ret) + sub_fi_addr, flags, context); + if (ret < nodecnt * svccnt) break; - assert(*fi_addr == sub_fi_addr); + assert(!fi_addr || memcmp(fi_addr, sub_fi_addr, + sizeof(fi_addr_t) * nodecnt * svccnt) == 0); } out: ofi_genlock_unlock(&av->lock); - + free(sub_fi_addr); return ret; } @@ -130,19 +137,18 @@ static int xnet_mplex_av_insertsvc(struct fid_av *av_fid, const char *node, fi_addr_t sub_fi_addr; struct xnet_mplex_av *av = container_of(av_fid, struct xnet_mplex_av, util_av.av_fid.fid); - ofi_genlock_lock(&av->lock);
ret = ofi_ip_av_insertsvc(&av->util_av.av_fid, node, service, fi_addr, flags, context); - if (ret) + if (ret <= 0) goto out; dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { subav_fid = container_of(item->fid, struct fid_av, fid); ret = fi_av_insertsvc(subav_fid, node, service, &sub_fi_addr, flags, context); - if (ret) + if (ret <= 0) break; - assert(*fi_addr == sub_fi_addr); + assert(!fi_addr || *fi_addr == sub_fi_addr); } out: ofi_genlock_unlock(&av->lock); diff --git a/prov/tcp/src/xnet_cq.c b/prov/tcp/src/xnet_cq.c index 03ea975371d..2090bdf7170 100644 --- a/prov/tcp/src/xnet_cq.c +++ b/prov/tcp/src/xnet_cq.c @@ -202,13 +202,15 @@ void xnet_report_error(struct xnet_xfer_entry *xfer_entry, int err) err_entry.flags = xfer_entry->cq_flags & ~FI_COMPLETION; if (err_entry.flags & FI_RECV) { - if (xfer_entry->ctrl_flags & XNET_MULTI_RECV && - xfer_entry->mrecv) { - xfer_entry->mrecv->ref_cnt--; - if (!xfer_entry->mrecv->ref_cnt) { + if (xfer_entry->ctrl_flags & XNET_MULTI_RECV) { + if (xfer_entry->mrecv) { + xfer_entry->mrecv->ref_cnt--; + if (!xfer_entry->mrecv->ref_cnt) { + err_entry.flags |= FI_MULTI_RECV; + free(xfer_entry->mrecv); + } + } else err_entry.flags |= FI_MULTI_RECV; - free(xfer_entry->mrecv); - } } xnet_get_cq_info(xfer_entry, &err_entry.flags, &err_entry.data, &err_entry.tag); diff --git a/prov/tcp/src/xnet_ep.c b/prov/tcp/src/xnet_ep.c index 64772fef0aa..0ff5723d9d2 100644 --- a/prov/tcp/src/xnet_ep.c +++ b/prov/tcp/src/xnet_ep.c @@ -677,7 +677,7 @@ static struct fi_ops xnet_ep_fi_ops = { .close = xnet_ep_close, .bind = xnet_ep_bind, .control = xnet_ep_ctrl, - .ops_open = fi_no_ops_open, + .ops_open = xnet_ep_ops_open, }; static int xnet_ep_getopt(fid_t fid, int level, int optname, @@ -828,7 +828,6 @@ int xnet_endpoint(struct fid_domain *domain, struct fi_info *info, (*ep_fid)->msg = &xnet_msg_ops; (*ep_fid)->rma = &xnet_rma_ops; (*ep_fid)->tagged = &xnet_tagged_ops; - (*ep_fid)->fid.ops->ops_open = xnet_ep_ops_open; return 0; err3: diff --git a/prov/tcp/src/xnet_progress.c b/prov/tcp/src/xnet_progress.c index aa76968e175..41eaf8ef944 100644 --- a/prov/tcp/src/xnet_progress.c +++ b/prov/tcp/src/xnet_progress.c @@ -1236,7 +1236,8 @@ static void xnet_uring_rx_done(struct xnet_ep *ep, int res) else xnet_complete_rx(ep, FI_SUCCESS); } - xnet_progress_rx(ep); + if (ep->state == XNET_CONNECTED) + xnet_progress_rx(ep); return; disable_ep: diff --git a/prov/tcp/src/xnet_rdm.c b/prov/tcp/src/xnet_rdm.c index 420f606f658..68f320cef4e 100644 --- a/prov/tcp/src/xnet_rdm.c +++ b/prov/tcp/src/xnet_rdm.c @@ -700,7 +700,7 @@ static int xnet_mplex_av_dup(struct util_ep *ep, struct xnet_mplex_av *mplex_av, { int ret, i; struct util_av *subav; - size_t addr_size; + size_t addr_size = sizeof(struct sockaddr_in6); char addr[sizeof(struct sockaddr_in6)]; struct fi_av_attr av_attr = { .type = ep->domain->av_type, diff --git a/prov/ucx/src/ucx_core.c b/prov/ucx/src/ucx_core.c index d1ffc947296..058770b29b9 100644 --- a/prov/ucx/src/ucx_core.c +++ b/prov/ucx/src/ucx_core.c @@ -94,6 +94,25 @@ ssize_t ucx_do_sendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, return ucx_translate_errcode(*(ucs_status_t*)status); } + if (UCS_PTR_STATUS(status) != UCS_OK) { + struct ucx_request *req = (struct ucx_request *)status; + + /* + * Set up the req fields before the callback function is called + * (in ucp_worker_progress or ucp_worker_flush). 
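+	 * Initializing them here matters because the transfer can complete
+	 * inside the ucp_worker_progress()/ucp_worker_flush() calls made
+	 * later in this function (e.g. the FI_INJECT wait loop), at which
+	 * point the callback reads these fields.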
+ */ + req->ep = u_ep; + if (!no_completion) { + req->completion.op_context = msg->context; + req->completion.flags = FI_SEND | + (mode == UCX_MSG ? FI_MSG : FI_TAGGED); + req->completion.len = msg->msg_iov[0].iov_len; + req->completion.buf = msg->msg_iov[0].iov_base; + req->completion.tag = msg->tag; + req->cq = cq; + } + } + if (flags & FI_INJECT) { if(UCS_PTR_STATUS(status) != UCS_OK) { while ((cstatus = ucp_request_check_status(status)) @@ -110,13 +129,6 @@ ssize_t ucx_do_sendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, goto done; } - if (no_completion) { - if (UCS_PTR_STATUS(status) != UCS_OK) - goto fence; - - goto done; - } - if (msg->context) { struct fi_context *ctx = ((struct fi_context*)(msg->context)); @@ -129,16 +141,6 @@ ssize_t ucx_do_sendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, * Not done yet. completion will be handled by the callback * function. */ - struct ucx_request *req = (struct ucx_request *)status; - - req->completion.op_context = msg->context; - req->completion.flags = FI_SEND | - (mode == UCX_MSG ? FI_MSG : FI_TAGGED); - req->completion.len = msg->msg_iov[0].iov_len; - req->completion.buf = msg->msg_iov[0].iov_base; - req->completion.tag = msg->tag; - req->ep = u_ep; - req->cq = cq; goto fence; } diff --git a/prov/ucx/src/ucx_domain.c b/prov/ucx/src/ucx_domain.c index f608ba66574..2e03bd4b8dc 100644 --- a/prov/ucx/src/ucx_domain.c +++ b/prov/ucx/src/ucx_domain.c @@ -368,7 +368,7 @@ int ucx_domain_open(struct fid_fabric *fabric, struct fi_info *info, return -ENOMEM; ofi_status = fi_param_get_size_t(NULL, "universe_size", &univ_size); - if (ofi_status) { + if (ofi_status == FI_SUCCESS) { params.estimated_num_eps = univ_size; params.field_mask |= UCP_PARAM_FIELD_ESTIMATED_NUM_EPS; } diff --git a/prov/ucx/src/ucx_fabric.c b/prov/ucx/src/ucx_fabric.c index 106c7127db1..eba1263a9b7 100644 --- a/prov/ucx/src/ucx_fabric.c +++ b/prov/ucx/src/ucx_fabric.c @@ -87,7 +87,7 @@ static char* ucx_local_host_resolve() char *result = NULL; status = fi_param_get(&ucx_prov, "ns_iface", &iface); - if (!status) + if (status != FI_SUCCESS) iface = NULL; if (getifaddrs(&ifaddr) == -1) { diff --git a/prov/util/src/import_mem_monitor.c b/prov/util/src/import_mem_monitor.c new file mode 100644 index 00000000000..e7be581526f --- /dev/null +++ b/prov/util/src/import_mem_monitor.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2017 Cray Inc. All rights reserved. + * Copyright (c) 2017-2021 Intel Inc. All rights reserved. + * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP + * Copyright (C) 2024 Cornelis Networks. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <ofi_enosys.h> +#include <ofi_mr.h> +#include <rdma/fi_ext.h> + +static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor); +static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor); +static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor); +static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor); +static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); +static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); +static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry); + +struct ofi_import_monitor { + struct ofi_mem_monitor monitor; + struct fid_mem_monitor *impfid; +}; + +static struct ofi_import_monitor impmon = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_import_monitor_init, + .monitor.cleanup = ofi_import_monitor_cleanup, + .monitor.start = ofi_import_monitor_start, + .monitor.stop = ofi_import_monitor_stop, + .monitor.subscribe = ofi_import_monitor_subscribe, + .monitor.unsubscribe = ofi_import_monitor_unsubscribe, + .monitor.valid = ofi_import_monitor_valid, + .monitor.name = "import", +}; + +struct ofi_mem_monitor *import_monitor = &impmon.monitor; + +static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor) +{ + ofi_monitor_init(monitor); +} + +static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor) +{ + assert(!impmon.impfid); + ofi_monitor_cleanup(monitor); +} + +static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor) +{ + if (!impmon.impfid) + return -FI_ENOSYS; + + return impmon.impfid->export_ops->start(impmon.impfid); +} + +static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor) +{ + assert(impmon.impfid); + impmon.impfid->export_ops->stop(impmon.impfid); +} + +static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + assert(impmon.impfid); + return impmon.impfid->export_ops->subscribe(impmon.impfid, addr, len); +} + +static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + assert(impmon.impfid); + impmon.impfid->export_ops->unsubscribe(impmon.impfid, addr, len); +} + +static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + assert(impmon.impfid); + return impmon.impfid->export_ops->valid(impmon.impfid, + entry->info.iov.iov_base, + entry->info.iov.iov_len); +} +
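/*
 * Notification sketch (illustrative, not part of this patch): the
 * external monitor reports invalidated ranges back through the
 * import_ops->notify hook installed by ofi_monitor_import() below:
 *
 *	monitor_fid->import_ops->notify(monitor_fid, addr, len);
 *
 * which lands in ofi_import_monitor_notify() and forwards to
 * ofi_monitor_notify() while holding mm_list_rwlock and mm_lock.
 */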
+static void ofi_import_monitor_notify(struct fid_mem_monitor *monitor, + const void *addr, size_t len) +{ + assert(monitor->fid.context == &impmon); + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + ofi_monitor_notify(&impmon.monitor, addr, len); + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); +} + +static int ofi_close_import(struct fid *fid) +{ + pthread_mutex_lock(&mm_state_lock); + impmon.monitor.state = FI_MM_STATE_IDLE; + pthread_mutex_unlock(&mm_state_lock); + impmon.impfid = NULL; + return 0; +} + +static struct fi_ops_mem_notify import_ops = { + .size = sizeof(struct fi_ops_mem_notify), + .notify = ofi_import_monitor_notify, +}; + +static struct fi_ops impfid_ops = { + .size = sizeof(struct fi_ops), + .close = ofi_close_import, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, + .tostr = fi_no_tostr, + .ops_set = fi_no_ops_set, +}; + +int ofi_monitor_import(struct fid *fid) +{ + struct fid_mem_monitor *impfid; + + if (fid->fclass != FI_CLASS_MEM_MONITOR) + return -FI_ENOSYS; + + if (impmon.impfid) { + FI_WARN(&core_prov, FI_LOG_MR, + "imported monitor already exists\n"); + return -FI_EBUSY; + } + + if (default_monitor && !dlist_empty(&default_monitor->list)) { + FI_WARN(&core_prov, FI_LOG_MR, + "cannot replace active monitor\n"); + return -FI_EBUSY; + } + + impfid = container_of(fid, struct fid_mem_monitor, fid); + if (impfid->export_ops->size < sizeof(struct fi_ops_mem_monitor)) + return -FI_EINVAL; + + impmon.impfid = impfid; + impfid->fid.context = &impmon; + impfid->fid.ops = &impfid_ops; + impfid->import_ops = &import_ops; + + FI_INFO(&core_prov, FI_LOG_MR, + "setting imported memory monitor as default\n"); + default_monitor = &impmon.monitor; + return 0; +} diff --git a/prov/util/src/kdreg2_mem_monitor.c b/prov/util/src/kdreg2_mem_monitor.c new file mode 100644 index 00000000000..ba7c2a21d31 --- /dev/null +++ b/prov/util/src/kdreg2_mem_monitor.c @@ -0,0 +1,367 @@ +/* + * (C) Copyright 2022-2023 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "ofi_mr.h" + +#if HAVE_KDREG2_MONITOR + +#include "ofi_hmem.h" + +#define EVICTOR_THREAD_ATTR NULL +#define INFINITE_TIMEOUT -1 + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + uint64_t cookie = ofi_atomic_inc64(&kdreg2->next_cookie); + struct kdreg2_ioctl_monitor ioctl_monitor = { + .addr = addr, + .length = len, + .cookie = (kdreg2_cookie_t) cookie, + }; + int ret; + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_MONITOR, &ioctl_monitor); + if (ret) + return ret; + + hmem_info->kdreg2.cookie = ioctl_monitor.cookie; + hmem_info->kdreg2.monitoring_params = ioctl_monitor.monitoring_params; + + return 0; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_ioctl_unmonitor ioctl_unmonitor = { + .cookie = hmem_info->kdreg2.cookie, + .monitoring_params = hmem_info->kdreg2.monitoring_params, + }; + + ioctl(kdreg2->fd, KDREG2_IOCTL_UNMONITOR, &ioctl_unmonitor); +} + +static bool kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_monitoring_params *params = + &entry->hmem_info.kdreg2.monitoring_params; + + return !kdreg2_mapping_changed(kdreg2->status_data, params); +} + +static int kdreg2_read_evictions(struct ofi_kdreg2 *kdreg2) +{ + struct kdreg2_event event; + ssize_t bytes; + int err; + + while (kdreg2_read_counter(&kdreg2->status_data->pending_events) > 0) { + + /* The read should return a multiple of sizeof(event) or + * an error. There should be no partial reads. + */ + + bytes = read(kdreg2->fd, &event, sizeof(event)); + if (bytes < 0) { + err = errno; + + /* EINTR means we caught a signal. */ + if (err == EINTR) + continue; + + /* Nothing left */ + if ((err == EAGAIN) || + (err == EWOULDBLOCK)) + return 0; + + /* All other errors */ + return err; + } + + switch (event.type) { + case KDREG2_EVENT_MAPPING_CHANGE: + + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + + ofi_monitor_notify(&kdreg2->monitor, + event.u.mapping_change.addr, + event.u.mapping_change.len); + + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + + break; + + default: + + return -ENOMSG; + } + } + + return 0; +} + +static void kdreg2_close_pipe(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->exit_pipe[0]); + close(kdreg2->exit_pipe[1]); + kdreg2->exit_pipe[0] = -1; + kdreg2->exit_pipe[1] = -1; +} + +static void kdreg2_close_fd(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->fd); + kdreg2->fd = -1; + kdreg2->status_data = NULL; +} + +static void *kdreg2_evictor(void *arg) +{ + struct ofi_kdreg2 *kdreg2 = (struct ofi_kdreg2 *) arg; + int ret; + struct pollfd pollfd[2] = { + { + .fd = kdreg2->fd, + .events = POLLIN, + }, + { .fd = kdreg2->exit_pipe[0], + .events = POLLIN, + }, + }; + int n; + + while (1) { + + /* wait until there are events to read */ + n = poll(pollfd, 2, INFINITE_TIMEOUT); + if (n == 0) /* timeout(?) 
*/ + continue; + + if (n < 0) { + switch (errno) { + case EINTR: /* interrupted */ + continue; + default: + ret = -errno; + goto error_ret; + } + } + + /* look for exit message on second fd */ + if (pollfd[1].revents) { + ret = 0; + goto error_ret; + } + + ret = kdreg2_read_evictions(kdreg2); + if (ret) + goto error_ret; + } + +error_ret: + + return (void *) (intptr_t) ret; +} + + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + int ret = 0; + struct kdreg2_config_data config_data; + + /* see if already started */ + if (kdreg2->fd >= 0) + return 0; + + ofi_atomic_initialize64(&kdreg2->next_cookie, 1); + + ret = pipe(kdreg2->exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to create pipe for kdreg2: %s\n", + strerror(errno)); + return -errno; + } + + kdreg2->fd = open(KDREG2_DEVICE_NAME, O_RDWR); + if (kdreg2->fd < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to open %s for monitor kdreg2: %s.\n", + KDREG2_DEVICE_NAME, strerror(errno)); + ret = -errno; + goto close_pipe; + } + + /* configure the monitor with the maximum number of entries */ + + config_data.max_regions = cache_params.max_cnt; + if (!config_data.max_regions) { + ret = -FI_ENOSPC; + goto close_fd; + } + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_CONFIG_DATA, &config_data); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to get module config data for kdreg2 monitor: %d.\n", + errno); + ret = -errno; + goto close_fd; + } + + /* Configuring the monitor allocates the status data. Save the address. */ + + kdreg2->status_data = config_data.status_data; + + ret = pthread_create(&kdreg2->thread, EVICTOR_THREAD_ATTR, + kdreg2_evictor, kdreg2); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to start thread for kdreg2 monitor: %d.\n", + ret); + goto close_fd; + } + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor started.\n"); + + return 0; + +close_fd: + + kdreg2_close_fd(kdreg2); + +close_pipe: + + kdreg2_close_pipe(kdreg2); + + FI_WARN(&core_prov, FI_LOG_MR, + "Kdreg2 memory monitor failed to start: %i.\n", ret); + + return ret; +} + +static void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + ssize_t num_written; + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + + /* see if it's really running */ + if (kdreg2->fd < 0) + return; + + num_written = write(kdreg2->exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "Unable to write to kdreg2 exit pipe: %s\n", + strerror(errno)); + /* We could call pthread cancel here. The thread + * has probably already exited. Cancelling would be + * benign. But calling join on an exited thread is + * also legal. 
+ */ + } + + pthread_join(kdreg2->thread, NULL); + + kdreg2_close_fd(kdreg2); + kdreg2_close_pipe(kdreg2); + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor stopped.\n"); +} + +#else /* !HAVE_KDREG2_MONITOR */ + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, + size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ +} + +static bool kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + return false; +} + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +static void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + /* no-op */ +} + +#endif /* HAVE_KDREG2_MONITOR */ + +static struct ofi_kdreg2 kdreg2_mm = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = kdreg2_monitor_start, + .monitor.stop = kdreg2_monitor_stop, + .monitor.subscribe = kdreg2_monitor_subscribe, + .monitor.unsubscribe = kdreg2_monitor_unsubscribe, + .monitor.valid = kdreg2_monitor_valid, + .monitor.name = "kdreg2", + .fd = -1, + .exit_pipe = { -1, -1 }, + .status_data = NULL, +}; + +struct ofi_mem_monitor *kdreg2_monitor = &kdreg2_mm.monitor; diff --git a/prov/util/src/rocr_mem_monitor.c b/prov/util/src/rocr_mem_monitor.c index c194814c640..1e78c1ac3c3 100644 --- a/prov/util/src/rocr_mem_monitor.c +++ b/prov/util/src/rocr_mem_monitor.c @@ -44,6 +44,7 @@ struct rocr_mm_entry { struct iovec iov; struct ofi_rbnode *node; + struct dlist_entry entry; }; struct rocr_mm { @@ -57,11 +58,17 @@ static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, union ofi_mr_hmem_info *hmem_info); static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info); + union ofi_mr_hmem_info *hmem_info, + struct dlist_entry *free_list); static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const struct ofi_mr_info *info, struct ofi_mr_entry *entry); +/* Since ROCR may have many MR cache entries for the same VA range and + * ofi_monitor_unsubscribe() is called for every MR cache entry being freed, + * ROCR unsubscribe needs to be a noop. Else, MR cache entries may no longer + * be monitored.
+ */ static struct rocr_mm rocr_mm = { .mm = { .iface = FI_HMEM_ROCR, @@ -70,7 +77,7 @@ static struct rocr_mm rocr_mm = { .start = rocr_mm_start, .stop = rocr_mm_stop, .subscribe = rocr_mm_subscribe, - .unsubscribe = rocr_mm_unsubscribe, + .unsubscribe = ofi_monitor_unsubscribe_no_op, .valid = rocr_mm_valid, .name = "rocr", }, @@ -133,15 +140,22 @@ static struct rocr_mm_entry *rocr_mm_entry_find(const void *addr) static void rocr_mm_dealloc_cb(void *addr, void *user_data) { size_t len = (size_t) user_data; + DEFINE_LIST(free_list); + struct rocr_mm_entry *entry; pthread_rwlock_rdlock(&mm_list_rwlock); pthread_mutex_lock(&mm_lock); - ofi_monitor_unsubscribe(rocr_monitor, addr, len, NULL); + rocr_mm_unsubscribe(rocr_monitor, addr, len, NULL, &free_list); pthread_mutex_unlock(&mm_lock); pthread_rwlock_unlock(&mm_list_rwlock); + + while (!dlist_empty(&free_list)) { + dlist_pop_front(&free_list, struct rocr_mm_entry, entry, entry); + free(entry); + } } -static void rocr_mm_entry_free(struct rocr_mm_entry *entry) +static void rocr_mm_entry_delete(struct rocr_mm_entry *entry) { hsa_status_t hsa_ret __attribute__((unused)); @@ -161,6 +175,11 @@ static void rocr_mm_entry_free(struct rocr_mm_entry *entry) hsa_ret == HSA_STATUS_ERROR_INVALID_ARGUMENT); ofi_rbmap_delete(rocr_mm.dev_region_tree, entry->node); +} + +static void rocr_mm_entry_free(struct rocr_mm_entry *entry) +{ + rocr_mm_entry_delete(entry); free(entry); } @@ -262,7 +281,8 @@ static void rocr_mm_stop(struct ofi_mem_monitor *monitor) static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) + union ofi_mr_hmem_info *hmem_info, + struct dlist_entry *free_list) { struct rocr_mm_entry *entry; size_t cur_len = len; @@ -286,7 +306,14 @@ static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, next_addr = (void *) ((uintptr_t) ofi_iov_end(&entry->iov) + 1); - rocr_mm_entry_free(entry); + /* Since unsubscribe is called with mm_lock held, calling free + * may result in deadlocks if memhooks is used. To prevent this, + * entries are placed on a list to be freed later. + * + * Entry still needs to be deleted. + */ + rocr_mm_entry_delete(entry); + dlist_insert_tail(&entry->entry, free_list); cur_len -= MIN((uintptr_t) next_addr - (uintptr_t) cur_addr, cur_len); @@ -381,12 +408,6 @@ static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, return -FI_ENOSYS; } -static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ -} - static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const struct ofi_mr_info *info, struct ofi_mr_entry *entry) @@ -401,7 +422,7 @@ static struct ofi_mem_monitor rocr_mm = { .start = rocr_mm_start, .stop = rocr_mm_stop, .subscribe = rocr_mm_subscribe, - .unsubscribe = rocr_mm_unsubscribe, + .unsubscribe = ofi_monitor_unsubscribe_no_op, .valid = rocr_mm_valid, .name = "rocr", }; diff --git a/prov/util/src/rxm_av.c b/prov/util/src/rxm_av.c index 69a68a884db..beb11d0620c 100644 --- a/prov/util/src/rxm_av.c +++ b/prov/util/src/rxm_av.c @@ -165,7 +165,7 @@ rxm_put_peer_addr(struct rxm_av *av, fi_addr_t fi_addr) static int rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count, - fi_addr_t *fi_addr) + fi_addr_t *fi_addr, fi_addr_t *user_ids) { struct util_peer_addr *peer; const void *cur_addr; @@ -178,8 +178,12 @@ rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count, if (!peer) goto err; - peer->fi_addr = fi_addr ?
fi_addr[i] : + if (user_ids) { + peer->fi_addr = user_ids[i]; + } else { + peer->fi_addr = fi_addr ? fi_addr[i] : ofi_av_lookup_fi_addr(&av->util_av, cur_addr); + } /* lookup can fail if prior AV insertion failed */ if (peer->fi_addr != FI_ADDR_NOTAVAIL) @@ -276,21 +280,43 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { struct rxm_av *av; + fi_addr_t *user_ids = NULL; + struct dlist_entry *av_entry; + struct util_ep *util_ep; int ret; + if (flags & FI_AV_USER_ID) { + assert(fi_addr); + user_ids = calloc(count, sizeof(*user_ids)); + assert(user_ids); + memcpy(user_ids, fi_addr, sizeof(*fi_addr) * count); + } + av = container_of(av_fid, struct rxm_av, util_av.av_fid.fid); ret = ofi_ip_av_insert(av_fid, addr, count, fi_addr, flags, context); if (ret < 0) - return ret; + goto out; count = ret; - ret = rxm_av_add_peers(av, addr, count, fi_addr); + ret = rxm_av_add_peers(av, addr, count, fi_addr, user_ids); if (ret) { rxm_av_remove(av_fid, fi_addr, count, flags); - return ret; + goto out; } + if (!av->foreach_ep) + goto out; + + dlist_foreach(&av->util_av.ep_list, av_entry) { + util_ep = container_of(av_entry, struct util_ep, av_entry); + av->foreach_ep(&av->util_av, util_ep); + } + +out: + free(user_ids); + if (ret) + return ret; return (int) count; } @@ -319,7 +345,7 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node, if (ret > 0 && ret < count) count = ret; - ret = rxm_av_add_peers(av, addr, count, fi_addr); + ret = rxm_av_add_peers(av, addr, count, fi_addr, NULL); if (ret) { rxm_av_remove(av_fid, fi_addr, count, flags); return ret; @@ -404,7 +430,9 @@ static struct fi_ops_av rxm_av_ops = { int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context, size_t conn_size, void (*remove_handler)(struct util_ep *util_ep, - struct util_peer_addr *peer)) + struct util_peer_addr *peer), + void (*foreach_ep)(struct util_av *av, struct util_ep *ep)) + { struct util_domain *domain; struct util_av_attr util_attr; @@ -441,6 +469,7 @@ int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, av->util_av.av_fid.fid.ops = &rxm_av_fi_ops; av->util_av.av_fid.ops = &rxm_av_ops; av->util_av.remove_handler = remove_handler; + av->foreach_ep = foreach_ep; *fid_av = &av->util_av.av_fid; return 0; diff --git a/prov/util/src/uffd_mem_monitor.c b/prov/util/src/uffd_mem_monitor.c new file mode 100644 index 00000000000..7172e1e7441 --- /dev/null +++ b/prov/util/src/uffd_mem_monitor.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2017 Cray Inc. All rights reserved. + * Copyright (c) 2017-2021 Intel Inc. All rights reserved. + * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP + * Copyright (C) 2024 Cornelis Networks. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include + +#ifndef UFFD_USER_MODE_ONLY +#define UFFD_USER_MODE_ONLY 0 +#endif + +static int ofi_uffd_start(struct ofi_mem_monitor *monitor); +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor); +static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); +static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry); + +static struct ofi_uffd uffd = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = ofi_uffd_start, + .monitor.stop = ofi_uffd_stop, + .monitor.subscribe = ofi_uffd_subscribe, + + /* Since UFFD may have many MR cache entries for the same VA range and + * ofi_monitor_unsubscribe() is called for every MR cache entry being + * freed, UFFD unsubscribe needs to be a noop. Else, MR cache entries + * may no longer be monitored. + */ + .monitor.unsubscribe = ofi_monitor_unsubscribe_no_op, + .monitor.valid = ofi_uffd_valid, + .monitor.name = "uffd", + .fd = -1, + .exit_pipe = { -1, -1 }, +}; +struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; + +#if HAVE_UFFD_MONITOR + +#include +#include +#include +#include + +static void ofi_uffd_pagefault_handler(struct uffd_msg *msg); +static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); + +/* The userfault fd monitor requires for events that could + * trigger it to be handled outside of the monitor functions + * itself. When a fault occurs on a monitored region, the + * faulting thread is put to sleep until the event is read + * via the userfault file descriptor. If this fault occurs + * within the userfault handling thread, no threads will + * read this event and our threads cannot progress, resulting + * in a hang. 
+ */ +static void *ofi_uffd_handler(void *arg) +{ + struct uffd_msg msg; + struct pollfd fds[2]; + int ret; + + fds[0].fd = uffd.fd; + fds[0].events = POLLIN; + fds[1].fd = uffd.exit_pipe[0]; + fds[1].events = POLLIN; + + for (;;) { + ret = poll(fds, 2, -1); + if (ret < 0 || fds[1].revents) + break; + + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + ret = read(uffd.fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + if (errno != EAGAIN) + break; + continue; + } + + FI_DBG(&core_prov, FI_LOG_MR, "Received UFFD event %d\n", msg.event); + + switch (msg.event) { + case UFFD_EVENT_REMOVE: + ofi_uffd_unsubscribe(&uffd.monitor, + (void *) (uintptr_t) msg.arg.remove.start, + (size_t) (msg.arg.remove.end - + msg.arg.remove.start), NULL); + /* fall through */ + case UFFD_EVENT_UNMAP: + ofi_monitor_notify(&uffd.monitor, + (void *) (uintptr_t) msg.arg.remove.start, + (size_t) (msg.arg.remove.end - + msg.arg.remove.start)); + break; + case UFFD_EVENT_REMAP: + ofi_monitor_notify(&uffd.monitor, + (void *) (uintptr_t) msg.arg.remap.from, + (size_t) msg.arg.remap.len); + break; + case UFFD_EVENT_PAGEFAULT: + ofi_uffd_pagefault_handler(&msg); + break; + default: + FI_WARN(&core_prov, FI_LOG_MR, + "Unhandled uffd event %d\n", msg.event); + break; + } + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + } + return NULL; +} + +static void ofi_uffd_pagefault_handler(struct uffd_msg *msg) +{ + struct uffdio_zeropage zp; + int i; + int ret; + void * const address = (void *) (uintptr_t) msg->arg.pagefault.address; + uint64_t const flags = (uint64_t) msg->arg.pagefault.flags; +#if HAVE_UFFD_THREAD_ID + uint32_t const ptid = (uint32_t) msg->arg.pagefault.feat.ptid; +#endif + /* ofi_uffd_register sets the mode to + * UFFDIO_REGISTER_MODE_MISSING. As a result, we can + * get read, write or write-protect notifications via + * UFFD_EVENT_PAGEFAULT. The only ones we can sensibly + * handle are writes to non-backed pages. + * (Read and write-protect notifications are likely + * application bugs.) + */ + + if (flags != UFFD_PAGEFAULT_FLAG_WRITE) { +#if HAVE_UFFD_THREAD_ID + FI_WARN(&core_prov, FI_LOG_MR, + "UFFD pagefault with unrecognized flags: %lu, address %p, thread %u\n", + flags, address, ptid); +#else + FI_WARN(&core_prov, FI_LOG_MR, + "UFFD pagefault with unrecognized flags: %lu, address %p\n", + flags, address); +#endif + /* The faulting thread is halted at this point. In + * theory we could wake it up with UFFDIO_WAKE. In + * practice that requires the address range of the + * fault, information we don't have from the + * pagefault event. + */ + + return; + } + + /* The event tells us the address of the fault + * (which can be anywhere on the page). It does not + * tell us the size of the page so we have to guess + * from the list of known page_sizes. + * + * We employ the standard resolution: install a zeroed page. + */ + + for (i = 0; i < num_page_sizes; ) { + /* set up a zeropage request for this page size */ + zp.range.start = (uint64_t) (uintptr_t) + ofi_get_page_start(address, page_sizes[i]); + zp.range.len = (uint64_t) page_sizes[i]; + zp.mode = 0; + zp.zeropage = 0; + + ret = ioctl(uffd.fd, UFFDIO_ZEROPAGE, &zp); + + if (ret == 0) /* success */ + return; + + /* Note: the documentation (man ioctl_userfaultfd) says + * that the ioctl() returns -1 on error and errno is set + * to indicate the error.
It also says that the zeropage + * member of struct uffdio_zeropage is set to the negated + * error. The unit tests for uffd say + * real retval in uffdio_zeropage.zeropage + * so that's what we use here. + */ + + if (zp.zeropage == -EAGAIN) + /* This is a tough case. If the memory map is + * changing, the kernel returns EAGAIN before + * installing the zeroed page. So the page + * fault has not been rectified. If we don't try + * again, the application will crash. If we add + * a maximum retry count we could still end up + * with an unresolved page fault. + * + * It's likely a kernel bug (or something else + * bad, like OOM) if it returns EAGAIN forever. + * So we retry until we get something besides + * EAGAIN. + */ + continue; /* retry this page size */ + + i++; /* try next page size */ + + if (zp.zeropage == -EINVAL) /* wrong page size */ + continue; + + /* If we get here, we failed to install the zeroed + * page for this page size and it wasn't a size error. + * We could either stop trying or go on to the + * next page size. We choose to print a message and try + * another page size. + */ + + FI_DBG(&core_prov, FI_LOG_MR, + "Unable to install zeroed page of size %zu to handle page fault." + " address = %p zeropage = %lld errno = %d\n", + page_sizes[i], address, zp.zeropage, errno); + } + + FI_WARN(&core_prov, FI_LOG_MR, + "Unable to handle event UFFD_EVENT_PAGEFAULT for address %p.\n", + address); +} + +static int ofi_uffd_register(const void *addr, size_t len, size_t page_size) +{ + struct uffdio_register reg; + int ret; + + reg.range.start = (uint64_t) (uintptr_t) + ofi_get_page_start(addr, page_size); + reg.range.len = ofi_get_page_bytes(addr, len, page_size); + reg.mode = UFFDIO_REGISTER_MODE_MISSING; + ret = ioctl(uffd.fd, UFFDIO_REGISTER, &reg); + if (ret < 0) { + if (errno != EINVAL) { + FI_WARN(&core_prov, FI_LOG_MR, + "ioctl/uffd_reg: %s\n", strerror(errno)); + } + return -errno; + } + return 0; +} + +static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + int i; + + assert(monitor == &uffd.monitor); + for (i = 0; i < num_page_sizes; i++) { + if (!ofi_uffd_register(addr, len, page_sizes[i])) + return 0; + } + return -FI_EFAULT; +} + +static int ofi_uffd_unregister(const void *addr, size_t len, size_t page_size) +{ + struct uffdio_range range; + int ret; + + range.start = (uint64_t) (uintptr_t) + ofi_get_page_start(addr, page_size); + range.len = ofi_get_page_bytes(addr, len, page_size); + ret = ioctl(uffd.fd, UFFDIO_UNREGISTER, &range); + if (ret < 0) { + if (errno != EINVAL) { + FI_WARN(&core_prov, FI_LOG_MR, + "ioctl/uffd_unreg: %s\n", strerror(errno)); + } + return -errno; + } + return 0; +} + +/* May be called from mr cache notifier callback */ +static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + int i; + + assert(monitor == &uffd.monitor); + for (i = 0; i < num_page_sizes; i++) { + if (!ofi_uffd_unregister(addr, len, page_sizes[i])) + break; + } +} + +static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + /* no-op */ + return true; +} + +static void ofi_uffd_close_fd(struct ofi_uffd *monitor) +{ + close(monitor->fd); + monitor->fd = -1; +} + +static void ofi_uffd_close_pipe(struct ofi_uffd *monitor) +{ + close(monitor->exit_pipe[0]); + close(monitor->exit_pipe[1]); + monitor->exit_pipe[0] = -1; + monitor->exit_pipe[1] = -1; +} + +static
int ofi_uffd_start(struct ofi_mem_monitor *monitor) +{ + struct uffdio_api api; + int ret; + + if (uffd.fd >= 0) + return 0; + + if (!num_page_sizes) + return -FI_ENODATA; + + ret = pipe(uffd.exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/pipe: %s\n", strerror(errno)); + return -errno; + } + + uffd.fd = syscall(__NR_userfaultfd, + O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); + if (uffd.fd < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "syscall/userfaultfd %s\n", strerror(errno)); + ret = -errno; + goto close_pipe; + } + + api.api = UFFD_API; + api.features = UFFD_FEATURE_EVENT_UNMAP | UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_EVENT_REMAP; + ret = ioctl(uffd.fd, UFFDIO_API, &api); + if (ret < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "ioctl/uffdio: %s\n", strerror(errno)); + ret = -errno; + goto close_fd; + } + + if (api.api != UFFD_API) { + FI_WARN(&core_prov, FI_LOG_MR, "uffd features not supported\n"); + ret = -FI_ENOSYS; + goto close_fd; + } + + ret = pthread_create(&uffd.thread, NULL, ofi_uffd_handler, &uffd); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "failed to create handler thread %s\n", strerror(ret)); + ret = -ret; + goto close_fd; + } + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd started.\n"); + + return 0; + +close_fd: + + ofi_uffd_close_fd(&uffd); + +close_pipe: + + ofi_uffd_close_pipe(&uffd); + + FI_WARN(&core_prov, FI_LOG_MR, + "Memory monitor uffd failed to start: %s.\n", + strerror(-ret)); + + return ret; +} + +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) +{ + ssize_t num_written; + + if (uffd.fd < 0) + return; + + /* tell the thread to exit with the exit_pipe */ + + num_written = write(uffd.exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/close: unable to write to exit pipe: %s", + strerror(errno)); + } + + pthread_join(uffd.thread, NULL); + + ofi_uffd_close_fd(&uffd); + ofi_uffd_close_pipe(&uffd); + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd stopped.\n"); +} + +#else /* HAVE_UFFD_MONITOR */ + +static int ofi_uffd_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) +{ +} + +static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + return false; +} + +#endif /* HAVE_UFFD_MONITOR */ diff --git a/prov/util/src/util_attr.c b/prov/util/src/util_attr.c index 634af1e5e82..ffe1bb87b5f 100644 --- a/prov/util/src/util_attr.c +++ b/prov/util/src/util_attr.c @@ -93,7 +93,8 @@ char *ofi_strdup_tail(const char *str) } */ -char *ofi_strdup_append(const char *head, const char *tail) +static char *ofi_strdup_append_internal(const char *head, const char *tail, + char delim) { char *str; size_t len; @@ -101,10 +102,20 @@ char *ofi_strdup_append(const char *head, const char *tail) len = strlen(head) + strlen(tail) + 2; str = malloc(len); if (str) - sprintf(str, "%s%c%s", head, OFI_NAME_DELIM, tail); + sprintf(str, "%s%c%s", head, delim, tail); return str; } +char *ofi_strdup_link_append(const char *head, const char *tail) +{ + return ofi_strdup_append_internal(head, tail, OFI_NAME_LNX_DELIM); +} + +char *ofi_strdup_append(const char *head, const char *tail) +{ + return ofi_strdup_append_internal(head, tail, OFI_NAME_DELIM); +} + int ofi_exclude_prov_name(char **prov_name_list, const char 
*util_prov_name) { char *exclude, *name, *temp; diff --git a/prov/util/src/util_av.c b/prov/util/src/util_av.c index 16ebb595ce0..5594dd3debc 100644 --- a/prov/util/src/util_av.c +++ b/prov/util/src/util_av.c @@ -276,14 +276,13 @@ int ofi_av_insert_addr_at(struct util_av *av, const void *addr, fi_addr_t fi_add struct util_av_entry *entry = NULL; assert(ofi_mutex_held(&av->lock)); - ofi_straddr_log(av->prov, FI_LOG_INFO, FI_LOG_AV, "inserting addr", addr); + ofi_av_straddr_log(av, FI_LOG_INFO, "inserting addr", addr); HASH_FIND(hh, av->hash, addr, av->addrlen, entry); if (entry) { if (fi_addr == ofi_buf_index(entry)) return FI_SUCCESS; - ofi_straddr_log(av->prov, FI_LOG_WARN, FI_LOG_AV, - "addr already in AV", addr); + ofi_av_straddr_log(av, FI_LOG_WARN, "addr already in AV", addr); return -FI_EALREADY; } @@ -304,14 +303,13 @@ int ofi_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr) struct util_av_entry *entry = NULL; assert(ofi_mutex_held(&av->lock)); - ofi_straddr_log(av->prov, FI_LOG_INFO, FI_LOG_AV, "inserting addr", addr); + ofi_av_straddr_log(av, FI_LOG_INFO, "inserting addr", addr); HASH_FIND(hh, av->hash, addr, av->addrlen, entry); if (entry) { if (fi_addr) *fi_addr = ofi_buf_index(entry); if (ofi_atomic_inc32(&entry->use_cnt) > 1) { - ofi_straddr_log(av->prov, FI_LOG_WARN, FI_LOG_AV, - "addr already in AV", addr); + ofi_av_straddr_log(av, FI_LOG_WARN, "addr already in AV", addr); } } else { entry = ofi_ibuf_alloc(av->av_entry_pool); diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index d1c980bf94b..27caaf66c77 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -3,7 +3,7 @@ * Copyright (c) 2017-2021 Intel Inc. All rights reserved. * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. * All rights reserved. - * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP * Copyright (C) 2024 Cornelis Networks. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -39,28 +39,11 @@ #include #include -#include -#include -#include - pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t mm_state_lock = PTHREAD_MUTEX_INITIALIZER; pthread_rwlock_t mm_list_rwlock = PTHREAD_RWLOCK_INITIALIZER; -static int ofi_uffd_start(struct ofi_mem_monitor *monitor); -static void ofi_uffd_stop(struct ofi_mem_monitor *monitor); - -static struct ofi_uffd uffd = { - .monitor.iface = FI_HMEM_SYSTEM, - .monitor.init = ofi_monitor_init, - .monitor.cleanup = ofi_monitor_cleanup, - .monitor.start = ofi_uffd_start, - .monitor.stop = ofi_uffd_stop, - .monitor.name = "uffd", -}; -struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; - struct ofi_mem_monitor *default_monitor; struct ofi_mem_monitor *default_cuda_monitor; struct ofi_mem_monitor *default_rocr_monitor; @@ -191,6 +174,7 @@ static void initialize_monitor_list() ze_monitor, ze_ipc_monitor, import_monitor, + kdreg2_monitor, }; monitor_list_size = ARRAY_SIZE(monitors); @@ -208,6 +192,37 @@ static void cleanup_monitor_list() { monitor_list_size = 0; } +static void set_default_monitor(const char *monitor) +{ + if (!monitor) + return; + + if (!strcmp(monitor, "userfaultfd") || !strcmp(monitor, "uffd")) { +#if HAVE_UFFD_MONITOR + default_monitor = uffd_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "userfaultfd monitor not available\n"); + default_monitor = NULL; +#endif + } else if (!strcmp(monitor, "memhooks")) { +#if HAVE_MEMHOOKS_MONITOR + default_monitor = memhooks_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); + default_monitor = NULL; +#endif + } else if (!strcmp(monitor, "kdreg2")) { +#if HAVE_KDREG2_MONITOR + default_monitor = kdreg2_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "kdreg2 monitor not available\n"); + default_monitor = NULL; +#endif + } else if (!strcmp(monitor, "disabled")) { + default_monitor = NULL; + } +} + /* * Initialize all available memory monitors */ @@ -242,11 +257,17 @@ void ofi_monitors_init(void) "Define a default memory registration monitor." " The monitor checks for virtual to physical memory" " address changes. Options are: userfaultfd, memhooks" - " and disabled. Userfaultfd is a Linux kernel feature." - " Memhooks operates by intercepting memory allocation" - " and free calls. Userfaultfd is the default if" - " available on the system. 'disabled' option disables" - " memory caching."); + ", kdreg2, and disabled. Userfaultfd is a Linux kernel" + " feature. Memhooks operates by intercepting memory" + " allocation and free calls. kdreg2 is supplied as a" + " loadable Linux kernel module." +#if defined(HAVE_MR_CACHE_MONITOR_DEFAULT) + " " HAVE_MR_CACHE_MONITOR_DEFAULT +#else + " Userfaultfd" +#endif + " is the default if available on the system. 'disabled'" + " option disables memory caching."); fi_param_define(NULL, "mr_cuda_cache_monitor_enabled", FI_PARAM_BOOL, "Enable or disable the CUDA cache memory monitor."
"Enabled by default."); @@ -275,34 +296,21 @@ void ofi_monitors_init(void) * do not override */ if (!default_monitor) { -#if HAVE_MEMHOOKS_MONITOR +#if defined(HAVE_MR_CACHE_MONITOR_DEFAULT) + set_default_monitor(HAVE_MR_CACHE_MONITOR_DEFAULT); +#elif HAVE_MEMHOOKS_MONITOR default_monitor = memhooks_monitor; #elif HAVE_UFFD_MONITOR default_monitor = uffd_monitor; +#elif HAVE_KDREG2_MONITOR + default_monitor = kdreg2_monitor; #else default_monitor = NULL; #endif } - if (cache_params.monitor != NULL) { - if (!strcmp(cache_params.monitor, "userfaultfd")) { -#if HAVE_UFFD_MONITOR - default_monitor = uffd_monitor; -#else - FI_WARN(&core_prov, FI_LOG_MR, "userfaultfd monitor not available\n"); - default_monitor = NULL; -#endif - } else if (!strcmp(cache_params.monitor, "memhooks")) { -#if HAVE_MEMHOOKS_MONITOR - default_monitor = memhooks_monitor; -#else - FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); - default_monitor = NULL; -#endif - } else if (!strcmp(cache_params.monitor, "disabled")) { - default_monitor = NULL; - } - } + if (cache_params.monitor != NULL) + set_default_monitor(cache_params.monitor); FI_INFO(&core_prov, FI_LOG_MR, "Default memory monitor is: %s\n", @@ -496,6 +504,7 @@ void ofi_monitor_flush(struct ofi_mem_monitor *monitor) } } +/* For each new cached MR cache entry, subscribed is called. */ int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, union ofi_mr_hmem_info *hmem_info) @@ -514,6 +523,13 @@ int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, return ret; } +/* For each cached MR entry freed, unsubscribe is called. + + * If a memory monitor does not have a context per subscribe (e.g., a single + * monitored region servering multiple MRs), the memory monitor must implement + * unsubscribe as a noop. This may result in extra notification events, but is + * harmless to correct operation. + */ void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, union ofi_mr_hmem_info *hmem_info) @@ -544,371 +560,3 @@ void ofi_monitor_unsubscribe_no_op(struct ofi_mem_monitor *notifier, union ofi_mr_hmem_info *hmem_info) { } - -#if HAVE_UFFD_MONITOR - -#include -#include -#include -#include - -/* The userfault fd monitor requires for events that could - * trigger it to be handled outside of the monitor functions - * itself. When a fault occurs on a monitored region, the - * faulting thread is put to sleep until the event is read - * via the userfault file descriptor. If this fault occurs - * within the userfault handling thread, no threads will - * read this event and our threads cannot progress, resulting - * in a hang. 
- */ -static void *ofi_uffd_handler(void *arg) -{ - struct uffd_msg msg; - struct pollfd fds; - int ret; - - fds.fd = uffd.fd; - fds.events = POLLIN; - for (;;) { - ret = poll(&fds, 1, -1); - if (ret != 1) - break; - - pthread_rwlock_rdlock(&mm_list_rwlock); - pthread_mutex_lock(&mm_lock); - ret = read(uffd.fd, &msg, sizeof(msg)); - if (ret != sizeof(msg)) { - pthread_mutex_unlock(&mm_lock); - pthread_rwlock_unlock(&mm_list_rwlock); - if (errno != EAGAIN) - break; - continue; - } - - switch (msg.event) { - case UFFD_EVENT_REMOVE: - ofi_monitor_unsubscribe(&uffd.monitor, - (void *) (uintptr_t) msg.arg.remove.start, - (size_t) (msg.arg.remove.end - - msg.arg.remove.start), NULL); - /* fall through */ - case UFFD_EVENT_UNMAP: - ofi_monitor_notify(&uffd.monitor, - (void *) (uintptr_t) msg.arg.remove.start, - (size_t) (msg.arg.remove.end - - msg.arg.remove.start)); - break; - case UFFD_EVENT_REMAP: - ofi_monitor_notify(&uffd.monitor, - (void *) (uintptr_t) msg.arg.remap.from, - (size_t) msg.arg.remap.len); - break; - default: - FI_WARN(&core_prov, FI_LOG_MR, - "Unhandled uffd event %d\n", msg.event); - break; - } - pthread_mutex_unlock(&mm_lock); - pthread_rwlock_unlock(&mm_list_rwlock); - } - return NULL; -} - -static int ofi_uffd_register(const void *addr, size_t len, size_t page_size) -{ - struct uffdio_register reg; - int ret; - - reg.range.start = (uint64_t) (uintptr_t) - ofi_get_page_start(addr, page_size); - reg.range.len = ofi_get_page_bytes(addr, len, page_size); - reg.mode = UFFDIO_REGISTER_MODE_MISSING; - ret = ioctl(uffd.fd, UFFDIO_REGISTER, &reg); - if (ret < 0) { - if (errno != EINVAL) { - FI_WARN(&core_prov, FI_LOG_MR, - "ioctl/uffd_reg: %s\n", strerror(errno)); - } - return -errno; - } - return 0; -} - -static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - int i; - - assert(monitor == &uffd.monitor); - for (i = 0; i < num_page_sizes; i++) { - if (!ofi_uffd_register(addr, len, page_sizes[i])) - return 0; - } - return -FI_EFAULT; -} - -static int ofi_uffd_unregister(const void *addr, size_t len, size_t page_size) -{ - struct uffdio_range range; - int ret; - - range.start = (uint64_t) (uintptr_t) - ofi_get_page_start(addr, page_size); - range.len = ofi_get_page_bytes(addr, len, page_size); - ret = ioctl(uffd.fd, UFFDIO_UNREGISTER, &range); - if (ret < 0) { - if (errno != EINVAL) { - FI_WARN(&core_prov, FI_LOG_MR, - "ioctl/uffd_unreg: %s\n", strerror(errno)); - } - return -errno; - } - return 0; -} - -/* May be called from mr cache notifier callback */ -static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - int i; - - assert(monitor == &uffd.monitor); - for (i = 0; i < num_page_sizes; i++) { - if (!ofi_uffd_unregister(addr, len, page_sizes[i])) - break; - } -} - -static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, - const struct ofi_mr_info *info, - struct ofi_mr_entry *entry) -{ - /* no-op */ - return true; -} - -static int ofi_uffd_start(struct ofi_mem_monitor *monitor) -{ - struct uffdio_api api; - int ret; - - uffd.monitor.subscribe = ofi_uffd_subscribe; - uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; - uffd.monitor.valid = ofi_uffd_valid; - - if (!num_page_sizes) - return -FI_ENODATA; - - uffd.fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd.fd < 0) { - FI_WARN(&core_prov, FI_LOG_MR, - "syscall/userfaultfd %s\n", strerror(errno)); - return -errno; - } - - api.api = UFFD_API; -
api.features = UFFD_FEATURE_EVENT_UNMAP | UFFD_FEATURE_EVENT_REMOVE | - UFFD_FEATURE_EVENT_REMAP; - ret = ioctl(uffd.fd, UFFDIO_API, &api); - if (ret < 0) { - FI_WARN(&core_prov, FI_LOG_MR, - "ioctl/uffdio: %s\n", strerror(errno)); - ret = -errno; - goto closefd; - } - - if (api.api != UFFD_API) { - FI_WARN(&core_prov, FI_LOG_MR, "uffd features not supported\n"); - ret = -FI_ENOSYS; - goto closefd; - } - - ret = pthread_create(&uffd.thread, NULL, ofi_uffd_handler, &uffd); - if (ret) { - FI_WARN(&core_prov, FI_LOG_MR, - "failed to create handler thread %s\n", strerror(ret)); - ret = -ret; - goto closefd; - } - return 0; - -closefd: - close(uffd.fd); - return ret; -} - -static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) -{ - pthread_cancel(uffd.thread); - pthread_join(uffd.thread, NULL); - close(uffd.fd); -} - -#else /* HAVE_UFFD_MONITOR */ - -static int ofi_uffd_start(struct ofi_mem_monitor *monitor) -{ - return -FI_ENOSYS; -} - -static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) -{ -} - -#endif /* HAVE_UFFD_MONITOR */ - - -static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor); -static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor); -static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor); -static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor); -static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info); -static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info); -static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, - const struct ofi_mr_info *info, - struct ofi_mr_entry *entry); - -struct ofi_import_monitor { - struct ofi_mem_monitor monitor; - struct fid_mem_monitor *impfid; -}; - -static struct ofi_import_monitor impmon = { - .monitor.iface = FI_HMEM_SYSTEM, - .monitor.init = ofi_import_monitor_init, - .monitor.cleanup = ofi_import_monitor_cleanup, - .monitor.start = ofi_import_monitor_start, - .monitor.stop = ofi_import_monitor_stop, - .monitor.subscribe = ofi_import_monitor_subscribe, - .monitor.unsubscribe = ofi_import_monitor_unsubscribe, - .monitor.valid = ofi_import_monitor_valid, - .monitor.name = "import", -}; - -struct ofi_mem_monitor *import_monitor = &impmon.monitor; - -static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor) -{ - ofi_monitor_init(monitor); -} - -static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor) -{ - assert(!impmon.impfid); - ofi_monitor_cleanup(monitor); -} - -static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor) -{ - if (!impmon.impfid) - return -FI_ENOSYS; - - return impmon.impfid->export_ops->start(impmon.impfid); -} - -static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor) -{ - assert(impmon.impfid); - impmon.impfid->export_ops->stop(impmon.impfid); -} - -static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - assert(impmon.impfid); - return impmon.impfid->export_ops->subscribe(impmon.impfid, addr, len); -} - -static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - assert(impmon.impfid); - impmon.impfid->export_ops->unsubscribe(impmon.impfid, addr, len); -} - -static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, - const struct 
ofi_mr_info *info, - struct ofi_mr_entry *entry) -{ - assert(impmon.impfid); - return impmon.impfid->export_ops->valid(impmon.impfid, - entry->info.iov.iov_base, - entry->info.iov.iov_len); -} - -static void ofi_import_monitor_notify(struct fid_mem_monitor *monitor, - const void *addr, size_t len) -{ - assert(monitor->fid.context == &impmon); - pthread_rwlock_rdlock(&mm_list_rwlock); - pthread_mutex_lock(&mm_lock); - ofi_monitor_notify(&impmon.monitor, addr, len); - pthread_mutex_unlock(&mm_lock); - pthread_rwlock_unlock(&mm_list_rwlock); -} - -static int ofi_close_import(struct fid *fid) -{ - pthread_mutex_lock(&mm_state_lock); - impmon.monitor.state = FI_MM_STATE_IDLE; - pthread_mutex_unlock(&mm_state_lock); - impmon.impfid = NULL; - return 0; -} - -static struct fi_ops_mem_notify import_ops = { - .size = sizeof(struct fi_ops_mem_notify), - .notify = ofi_import_monitor_notify, -}; - -static struct fi_ops impfid_ops = { - .size = sizeof(struct fi_ops), - .close = ofi_close_import, - .bind = fi_no_bind, - .control = fi_no_control, - .ops_open = fi_no_ops_open, - .tostr = fi_no_tostr, - .ops_set = fi_no_ops_set, -}; - -int ofi_monitor_import(struct fid *fid) -{ - struct fid_mem_monitor *impfid; - - if (fid->fclass != FI_CLASS_MEM_MONITOR) - return -FI_ENOSYS; - - if (impmon.impfid) { - FI_WARN(&core_prov, FI_LOG_MR, - "imported monitor already exists\n"); - return -FI_EBUSY; - } - - if (default_monitor && !dlist_empty(&default_monitor->list)) { - FI_WARN(&core_prov, FI_LOG_MR, - "cannot replace active monitor\n"); - return -FI_EBUSY; - } - - impfid = container_of(fid, struct fid_mem_monitor, fid); - if (impfid->export_ops->size < sizeof(struct fi_ops_mem_monitor)) - return -FI_EINVAL; - - impmon.impfid = impfid; - impfid->fid.context = &impmon; - impfid->fid.ops = &impfid_ops; - impfid->import_ops = &import_ops; - - FI_INFO(&core_prov, FI_LOG_MR, - "setting imported memory monitor as default\n"); - default_monitor = &impmon.monitor; - return 0; -} diff --git a/prov/util/src/util_mr_cache.c b/prov/util/src/util_mr_cache.c index f2148e56267..2f0af31fc4c 100644 --- a/prov/util/src/util_mr_cache.c +++ b/prov/util/src/util_mr_cache.c @@ -125,15 +125,18 @@ static void util_mr_free_entry(struct ofi_mr_cache *cache, static void util_mr_uncache_entry_storage(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - /* Without subscription context, we might unsubscribe from - * an address range in use by another region. As a result, - * we remain subscribed. This may result in extra - * notification events, but is harmless to correct operation. - */ + enum fi_hmem_iface iface = entry->info.iface; + struct ofi_mem_monitor *monitor = cache->monitors[iface]; ofi_rbmap_delete(&cache->tree, entry->node); entry->node = NULL; + /* Some memory monitors have a subscription context per MR. These + * memory monitors require ofi_monitor_unsubscribe() to be called. + */ + ofi_monitor_unsubscribe(monitor, entry->info.iov.iov_base, + entry->info.iov.iov_len, &entry->hmem_info); + cache->cached_cnt--; cache->cached_size -= entry->info.iov.iov_len; } @@ -267,7 +270,7 @@ void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) * restart the entire operation. 
*/ static int -util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, +util_mr_cache_create(struct ofi_mr_cache *cache, struct ofi_mr_info *info, struct ofi_mr_entry **entry) { struct ofi_mr_entry *cur; @@ -291,6 +294,12 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, if (ret) goto free; + /* Providers may have expanded the MR. Update MR info input + * accordingly. + */ + assert(ofi_iov_within(&(*info).iov, &(*entry)->info.iov)); + *info = (*entry)->info; + pthread_mutex_lock(&mm_lock); cur = ofi_mr_rbt_find(&cache->tree, info); if (cur) { @@ -329,7 +338,7 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, return ret; } -int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, +int ofi_mr_cache_search(struct ofi_mr_cache *cache, struct ofi_mr_info *info, struct ofi_mr_entry **entry) { struct ofi_mem_monitor *monitor; diff --git a/prov/util/src/util_mr_map.c b/prov/util/src/util_mr_map.c index f08e350b4db..be337247ad7 100644 --- a/prov/util/src/util_mr_map.c +++ b/prov/util/src/util_mr_map.c @@ -285,6 +285,14 @@ void ofi_mr_update_attr(uint32_t user_version, uint64_t caps, cur_abi_attr->page_size = user_attr->page_size; else cur_abi_attr->page_size = 0; + + if (FI_VERSION_GE(user_version, FI_VERSION(2, 0))) { + cur_abi_attr->base_mr = user_attr->base_mr; + cur_abi_attr->sub_mr_cnt = user_attr->sub_mr_cnt; + } else { + cur_abi_attr->base_mr = NULL; + cur_abi_attr->sub_mr_cnt = 0; + } } int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, diff --git a/prov/util/src/util_srx.c b/prov/util/src/util_srx.c index 0035a1a067e..c2c18599b0e 100644 --- a/prov/util/src/util_srx.c +++ b/prov/util/src/util_srx.c @@ -69,6 +69,7 @@ static void util_init_rx_entry(struct util_rx_entry *entry, entry->peer_entry.context = context; entry->peer_entry.tag = tag; entry->peer_entry.flags = flags; + entry->peer_entry.msg_size = ofi_total_iov_len(iov, count); } static struct util_rx_entry *util_get_recv_entry(struct util_srx_ctx *srx, @@ -191,6 +192,8 @@ static int util_match_msg(struct fid_peer_srx *srx, util_entry->peer_entry.srx = srx; srx_ctx->update_func(srx_ctx, util_entry); } + util_entry->peer_entry.msg_size = MIN(util_entry->peer_entry.msg_size, + attr->msg_size); *rx_entry = &util_entry->peer_entry; return ret; } @@ -268,6 +271,8 @@ static int util_match_tag(struct fid_peer_srx *srx, ret = -FI_ENOENT; util_entry->peer_entry.srx = srx; out: + util_entry->peer_entry.msg_size = MIN(util_entry->peer_entry.msg_size, + attr->msg_size); *rx_entry = &util_entry->peer_entry; return ret; } @@ -496,6 +501,33 @@ static struct util_rx_entry *util_search_unexp_msg(struct util_srx_ctx *srx, return util_search_peer_msg(ofi_array_at(&srx->src_unexp_peers, addr)); } +static bool util_unexp_mrecv(struct util_srx_ctx *srx, + struct util_rx_entry *mrecv_entry, + struct util_rx_entry *rx_entry) +{ + mrecv_entry->multi_recv_ref++; + rx_entry->peer_entry.owner_context = mrecv_entry; + + rx_entry->peer_entry.iov[0].iov_base = + mrecv_entry->peer_entry.iov->iov_base; + rx_entry->peer_entry.iov->iov_len = + MIN(mrecv_entry->peer_entry.iov->iov_len, + rx_entry->peer_entry.msg_size); + *rx_entry->peer_entry.desc = mrecv_entry->peer_entry.desc[0]; + + rx_entry->peer_entry.count = 1; + rx_entry->peer_entry.addr = mrecv_entry->peer_entry.addr; + rx_entry->peer_entry.context = mrecv_entry->peer_entry.context; + rx_entry->peer_entry.tag = mrecv_entry->peer_entry.tag; + rx_entry->peer_entry.flags |= 
mrecv_entry->peer_entry.flags & + ~FI_MULTI_RECV; + rx_entry->peer_entry.msg_size = rx_entry->peer_entry.iov->iov_len; + + return util_adjust_multi_recv(srx, &mrecv_entry->peer_entry, + rx_entry->peer_entry.msg_size); + +} + static ssize_t util_generic_mrecv(struct util_srx_ctx *srx, const struct iovec *iov, void **desc, size_t iov_count, fi_addr_t addr, void *context, uint64_t flags) @@ -510,7 +542,8 @@ static ssize_t util_generic_mrecv(struct util_srx_ctx *srx, ofi_genlock_lock(srx->lock); mrecv_entry = util_get_recv_entry(srx, iov, desc, iov_count, addr, - context, 0, 0, flags); + context, 0, 0, + flags | FI_MSG | FI_RECV); if (!mrecv_entry) { ret = -FI_ENOMEM; goto out; @@ -520,15 +553,7 @@ static ssize_t util_generic_mrecv(struct util_srx_ctx *srx, rx_entry = util_search_unexp_msg(srx, addr); while (rx_entry) { - util_init_rx_entry(rx_entry, mrecv_entry->peer_entry.iov, desc, - iov_count, addr, context, 0, - flags & (~FI_MULTI_RECV)); - mrecv_entry->multi_recv_ref++; - rx_entry->peer_entry.owner_context = mrecv_entry; - - if (util_adjust_multi_recv(srx, &mrecv_entry->peer_entry, - rx_entry->peer_entry.msg_size)) - buf_done = true; + buf_done = util_unexp_mrecv(srx, mrecv_entry, rx_entry); srx->update_func(srx, rx_entry); ret = rx_entry->peer_entry.srx->peer_ops->start_msg( @@ -695,7 +720,8 @@ ssize_t util_srx_generic_trecv(struct fid_ep *ep_fid, const struct iovec *iov, assert(queue); rx_entry = util_get_recv_entry(srx, iov, desc, iov_count, addr, context, tag, - ignore, flags); + ignore, + flags | FI_TAGGED | FI_RECV); if (!rx_entry) ret = -FI_ENOMEM; else @@ -741,10 +767,11 @@ ssize_t util_srx_generic_recv(struct fid_ep *ep_fid, const struct iovec *iov, ofi_array_at(&srx->src_recv_queues, addr); assert(queue); rx_entry = util_get_recv_entry(srx, iov, desc, iov_count, addr, - context, 0, 0, flags); + context, 0, 0, + flags | FI_MSG | FI_RECV); if (!rx_entry) ret = -FI_ENOMEM; - else + else slist_insert_tail((struct slist_entry *) (&rx_entry->peer_entry), queue); goto out; diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c index 63aea82778d..470f05efedf 100644 --- a/prov/verbs/src/verbs_ep.c +++ b/prov/verbs/src/verbs_ep.c @@ -191,7 +191,7 @@ ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags) } if (vrb_wr_consumes_recv(wr)) { - if (!ep->peer_rq_credits || + if (!ep->peer_rq_credits || (ep->peer_rq_credits == 1 && !(flags & OFI_PRIORITY))) /* Last credit is reserved for credit update */ goto freectx; @@ -1161,7 +1161,7 @@ static struct fi_ops vrb_ep_ops = { .close = vrb_ep_close, .bind = vrb_ep_bind, .control = vrb_ep_control, - .ops_open = fi_no_ops_open, + .ops_open = vrb_ep_ops_open, }; static struct fi_ops_cm vrb_dgram_cm_ops = { @@ -1394,7 +1394,6 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info, *ep_fid = &ep->util_ep.ep_fid; ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops; ep->util_ep.ep_fid.ops = &vrb_ep_base_ops; - (*ep_fid)->fid.ops->ops_open = vrb_ep_ops_open; vrb_prof_func_end("vrb_open_ep"); diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c index f9bc78a828f..f5f37fe9c17 100644 --- a/prov/verbs/src/verbs_eq.c +++ b/prov/verbs/src/verbs_eq.c @@ -620,7 +620,7 @@ vrb_eq_xrc_conn_event(struct vrb_xrc_ep *ep, return -FI_EAGAIN; } -static size_t +static ssize_t vrb_eq_xrc_recip_conn_event(struct vrb_eq *eq, struct vrb_xrc_ep *ep, struct rdma_cm_event *cma_event, @@ -787,7 +787,7 @@ vrb_eq_xrc_cm_err_event(struct vrb_eq *eq, return FI_SUCCESS; } -static int +static ssize_t vrb_eq_xrc_connected_event(struct 
vrb_eq *eq, struct rdma_cm_event *cma_event, int *acked, struct fi_eq_cm_entry *entry, size_t len, @@ -795,7 +795,7 @@ vrb_eq_xrc_connected_event(struct vrb_eq *eq, { struct vrb_xrc_ep *ep; fid_t fid = cma_event->id->context; - int ret; + ssize_t ret; ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); diff --git a/prov/verbs/src/verbs_mr.c b/prov/verbs/src/verbs_mr.c index 9c7199d99eb..47ccf9d97ce 100644 --- a/prov/verbs/src/verbs_mr.c +++ b/prov/verbs/src/verbs_mr.c @@ -142,12 +142,9 @@ vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *base_addr md->mr = ibv_reg_dmabuf_mr(md->domain->pd, (uintptr_t) buf, len, (uintptr_t) base_addr + (uintptr_t) buf, (int) device, vrb_access); - else if (vrb_gl_data.dmabuf_support && - (iface == FI_HMEM_ZE || - iface == FI_HMEM_SYNAPSEAI || - iface == FI_HMEM_ROCR)) + else if (vrb_gl_data.dmabuf_support && iface != FI_HMEM_SYSTEM) md->mr = vrb_reg_hmem_dmabuf(iface, md->domain->pd, buf, len, - vrb_access); + vrb_access); else #endif md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, diff --git a/src/common.c b/src/common.c index 1c29350fe28..0d641ac74a1 100644 --- a/src/common.c +++ b/src/common.c @@ -1053,19 +1053,19 @@ size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr, return len; } -void ofi_straddr_log_internal(const char *func, int line, +void ofi_straddr_log_internal(const char *func, int line, uint32_t addr_format, const struct fi_provider *prov, enum fi_log_level level, enum fi_log_subsys subsys, char *log_str, const void *addr) { char buf[OFI_ADDRSTRLEN]; - uint32_t addr_format; size_t len = sizeof(buf); if (fi_log_enabled(prov, level, subsys)) { if (addr) { - addr_format = ofi_translate_addr_format(ofi_sa_family(addr)); + if (addr_format == FI_FORMAT_UNSPEC) + addr_format = ofi_translate_addr_format(ofi_sa_family(addr)); fi_log(prov, level, subsys, func, line, "%s: %s\n", log_str, ofi_straddr(buf, &len, addr_format, addr)); } else { @@ -2385,7 +2385,7 @@ size_t ofi_vrb_speed(uint8_t speed, uint8_t width) break; case 4: case 8: - speed_val = 8 * gbit_2_bit_coef; + speed_val = 10 * gbit_2_bit_coef; break; case 16: speed_val = 14 * gbit_2_bit_coef; @@ -2393,6 +2393,12 @@ size_t ofi_vrb_speed(uint8_t speed, uint8_t width) case 32: speed_val = 25 * gbit_2_bit_coef; break; + case 64: + speed_val = 50 * gbit_2_bit_coef; + break; + case 128: + speed_val = 100 * gbit_2_bit_coef; + break; default: speed_val = 0; break; @@ -2411,6 +2417,9 @@ size_t ofi_vrb_speed(uint8_t speed, uint8_t width) case 8: width_val = 12; break; + case 16: + width_val = 2; + break; default: width_val = 0; break; diff --git a/src/fabric.c b/src/fabric.c index b1a735638bb..c9eca76ae7e 100644 --- a/src/fabric.c +++ b/src/fabric.c @@ -262,6 +262,11 @@ static int ofi_is_hook_prov(const struct fi_provider *provider) return ofi_prov_ctx(provider)->type == OFI_PROV_HOOK; } +static int ofi_is_lnx_prov(const struct fi_provider *provider) +{ + return ofi_prov_ctx(provider)->type == OFI_PROV_LNX; +} + int ofi_apply_filter(struct ofi_filter *filter, const char *name) { if (!filter->names) @@ -500,6 +505,8 @@ static void ofi_set_prov_type(struct fi_provider *provider) ofi_prov_ctx(provider)->type = OFI_PROV_UTIL; else if (ofi_has_offload_prefix(provider->name)) ofi_prov_ctx(provider)->type = OFI_PROV_OFFLOAD; + else if (ofi_is_lnx(provider->name)) + ofi_prov_ctx(provider)->type = OFI_PROV_LNX; else ofi_prov_ctx(provider)->type = OFI_PROV_CORE; } @@ -988,6 +995,7 @@ void fi_ini(void) ofi_register_provider(SOCKETS_INIT, NULL); 
 	ofi_register_provider(TCP_INIT, NULL);
+	ofi_register_provider(LNX_INIT, NULL);
 
 	ofi_register_provider(HOOK_PERF_INIT, NULL);
 	ofi_register_provider(HOOK_TRACE_INIT, NULL);
 	ofi_register_provider(HOOK_PROFILE_INIT, NULL);
@@ -1022,9 +1030,9 @@ FI_DESTRUCTOR(fi_fini(void))
 	}
 
 	ofi_free_filter(&prov_filter);
+	ofi_shm_p2p_cleanup();
 	ofi_monitors_cleanup();
 	ofi_hmem_cleanup();
-	ofi_shm_p2p_cleanup();
 	ofi_hook_fini();
 	ofi_mem_fini();
 	fi_log_fini();
@@ -1207,8 +1215,12 @@ static void ofi_set_prov_attr(struct fi_fabric_attr *attr,
 
 	core_name = attr->prov_name;
 	if (core_name) {
-		assert(ofi_is_util_prov(prov));
-		attr->prov_name = ofi_strdup_append(core_name, prov->name);
+		if (ofi_is_lnx_prov(prov)) {
+			attr->prov_name = ofi_strdup_link_append(core_name, prov->name);
+		} else {
+			assert(ofi_is_util_prov(prov));
+			attr->prov_name = ofi_strdup_append(core_name, prov->name);
+		}
 		free(core_name);
 	} else {
 		attr->prov_name = strdup(prov->name);
@@ -1557,7 +1569,9 @@ int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr,
 
 	fi_ini();
 
-	top_name = strrchr(attr->prov_name, OFI_NAME_DELIM);
+	ret = ofi_is_linked(attr->prov_name);
+	top_name = strrchr(attr->prov_name,
+			   ret ? OFI_NAME_LNX_DELIM : OFI_NAME_DELIM);
 	if (top_name)
 		top_name++;
 	else
diff --git a/src/fi_tostr.c b/src/fi_tostr.c
index 910dfd1214b..420f0cca2f6 100644
--- a/src/fi_tostr.c
+++ b/src/fi_tostr.c
@@ -259,6 +259,7 @@ static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol)
 	CASEENUMSTRN(FI_PROTO_SM2, len);
 	CASEENUMSTRN(FI_PROTO_CXI_RNR, len);
 	CASEENUMSTRN(FI_PROTO_LPP, len);
+	CASEENUMSTRN(FI_PROTO_LNX, len);
 	default:
 		ofi_strncatf(buf, len, "Unknown");
 		break;
diff --git a/src/hmem.c b/src/hmem.c
index a624f8dddff..7c3fa57d619 100644
--- a/src/hmem.c
+++ b/src/hmem.c
@@ -141,6 +141,7 @@ struct ofi_hmem_ops hmem_ops[] = {
 		.dev_reg_copy_to_hmem = ofi_hmem_system_dev_reg_copy,
 		.dev_reg_copy_from_hmem = ofi_hmem_system_dev_reg_copy,
 		.get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd,
+		.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
 	},
 	[FI_HMEM_CUDA] = {
 		.initialized = false,
@@ -167,6 +168,7 @@ struct ofi_hmem_ops hmem_ops[] = {
 		.dev_reg_copy_to_hmem = cuda_dev_reg_copy_to_hmem,
 		.dev_reg_copy_from_hmem = cuda_dev_reg_copy_from_hmem,
 		.get_dmabuf_fd = cuda_get_dmabuf_fd,
+		.put_dmabuf_fd = cuda_put_dmabuf_fd,
 	},
 	[FI_HMEM_ROCR] = {
 		.initialized = false,
@@ -193,6 +195,7 @@ struct ofi_hmem_ops hmem_ops[] = {
 		.dev_reg_copy_to_hmem = rocr_dev_reg_copy_to_hmem,
 		.dev_reg_copy_from_hmem = rocr_dev_reg_copy_from_hmem,
 		.get_dmabuf_fd = rocr_hmem_get_dmabuf_fd,
+		.put_dmabuf_fd = rocr_hmem_put_dmabuf_fd,
 	},
 	[FI_HMEM_ZE] = {
 		.initialized = false,
@@ -219,6 +222,7 @@ struct ofi_hmem_ops hmem_ops[] = {
 		.dev_reg_copy_to_hmem = ze_dev_reg_copy_to_hmem,
 		.dev_reg_copy_from_hmem = ze_dev_reg_copy_from_hmem,
 		.get_dmabuf_fd = ze_hmem_get_dmabuf_fd,
+		.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
 	},
 	[FI_HMEM_NEURON] = {
 		.initialized = false,
@@ -244,6 +248,7 @@ struct ofi_hmem_ops hmem_ops[] = {
 		.dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem,
 		.dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem,
 		.get_dmabuf_fd = neuron_get_dmabuf_fd,
+		.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
 	},
 	[FI_HMEM_SYNAPSEAI] = {
 		.initialized = false,
@@ -269,6 +274,7 @@ struct ofi_hmem_ops hmem_ops[] = {
 		.dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem,
 		.dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem,
 		.get_dmabuf_fd = synapseai_get_dmabuf_fd,
+		.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
 	},
 };
 
@@ -820,3 +826,8 @@ int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface iface, const void *addr,
 {
 	return hmem_ops[iface].get_dmabuf_fd(addr, size, fd, offset);
 }
+
+int ofi_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd)
+{
+	return hmem_ops[iface].put_dmabuf_fd(fd);
+}
diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c
index 1c8abb03285..7fcd6450940 100644
--- a/src/hmem_cuda.c
+++ b/src/hmem_cuda.c
@@ -487,22 +487,17 @@ static int cuda_hmem_dl_init(void)
 		return -FI_ENOSYS;
 	}
 
-	cuda_attr.driver_handle = dlopen("libcuda.so", RTLD_NOW);
+	cuda_attr.driver_handle = dlopen("libcuda.so.1", RTLD_NOW);
 	if (!cuda_attr.driver_handle) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libcuda.so\n");
+			"Failed to dlopen libcuda.so.1\n");
 		goto err_dlclose_cuda_runtime;
 	}
 
-	cuda_attr.nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
+	cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
 	if (!cuda_attr.nvml_handle) {
-		FI_INFO(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libnvidia-ml.so. Trying libnvidia-ml.so.1\n");
-		cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
-		if (!cuda_attr.nvml_handle) {
-			FI_WARN(&core_prov, FI_LOG_CORE,
-				"Failed to dlopen libnvidia-ml.so or libnvidia-ml.so.1, bypassing nvml calls\n");
-		}
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libnvidia-ml.so.1, bypassing nvml calls\n");
 	}
 
 	CUDA_DRIVER_FUNCS_DEF(CUDA_DRIVER_FUNCS_DLOPEN)
@@ -753,6 +748,16 @@ int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd,
 #endif /* HAVE_CUDA_DMABUF */
 }
 
+int cuda_put_dmabuf_fd(int fd)
+{
+#if HAVE_CUDA_DMABUF
+	close(fd);
+	return FI_SUCCESS;
+#else
+	return -FI_ENOSYS;
+#endif /* HAVE_CUDA_DMABUF */
+}
+
 int cuda_hmem_init(void)
 {
 	int ret;
@@ -763,6 +768,9 @@ int cuda_hmem_init(void)
 		"If libfabric is not compiled with gdrcopy support, "
 		"this variable is not checked. (default: true)");
 
+	fi_param_define(NULL, "hmem_cuda_use_dmabuf", FI_PARAM_BOOL,
+			"Use dma-buf for sharing buffer with hardware. (default:true)");
(default:true)"); + ret = cuda_hmem_dl_init(); if (ret != FI_SUCCESS) return ret; @@ -936,7 +944,11 @@ bool cuda_is_gdrcopy_enabled(void) bool cuda_is_dmabuf_supported(void) { - return cuda_attr.dmabuf_supported; + int use_dmabuf = 1; + + fi_param_get_bool(NULL, "hmem_cuda_use_dmabuf", &use_dmabuf); + + return use_dmabuf && cuda_attr.dmabuf_supported; } #else @@ -1045,6 +1057,11 @@ int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd, return -FI_ENOSYS; } +int cuda_put_dmabuf_fd(int fd) +{ + return -FI_ENOSYS; +} + int cuda_set_sync_memops(void *ptr) { return FI_SUCCESS; diff --git a/src/hmem_rocr.c b/src/hmem_rocr.c index bba705ba8ef..05caf3cf24b 100644 --- a/src/hmem_rocr.c +++ b/src/hmem_rocr.c @@ -130,6 +130,7 @@ struct hsa_ops { #if HAVE_HSA_AMD_PORTABLE_EXPORT_DMABUF hsa_status_t (*hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset); + hsa_status_t (*hsa_amd_portable_close_dmabuf)(int dmabuf); #endif }; @@ -183,6 +184,7 @@ static struct hsa_ops hsa_ops = { .hsa_iterate_agents = hsa_iterate_agents, #if HAVE_HSA_AMD_PORTABLE_EXPORT_DMABUF .hsa_amd_portable_export_dmabuf = hsa_amd_portable_export_dmabuf, + .hsa_amd_portable_close_dmabuf = hsa_amd_portable_close_dmabuf, #endif .hsa_system_get_info = hsa_system_get_info, }; @@ -863,6 +865,13 @@ static int rocr_hmem_dl_init(void) "Failed to find hsa_amd_portable_export_dmabuf\n"); goto err; } + + hsa_ops.hsa_amd_portable_close_dmabuf = dlsym(hsa_handle, "hsa_amd_portable_close_dmabuf"); + if (!hsa_ops.hsa_amd_portable_close_dmabuf) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_amd_portable_close_dmabuf\n"); + goto err; + } #endif return FI_SUCCESS; @@ -1184,6 +1193,25 @@ int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd, return FI_SUCCESS; } +int rocr_hmem_put_dmabuf_fd(int fd) +{ +#if HAVE_HSA_AMD_PORTABLE_EXPORT_DMABUF + hsa_status_t hsa_ret; + + hsa_ret = hsa_ops.hsa_amd_portable_close_dmabuf(fd); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to close dmabuf handle: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + return -FI_EIO; + } + + return FI_SUCCESS; +#else + return -FI_ENOSYS; +#endif +} + #else int rocr_copy_from_dev(uint64_t device, void *dest, const void *src, @@ -1310,4 +1338,9 @@ int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd, return -FI_ENOSYS; } +int rocr_hmem_put_dmabuf_fd(int fd) +{ + return -FI_ENOSYS; +} + #endif /* HAVE_ROCR */ diff --git a/src/iouring.c b/src/iouring.c index ca0b91635ae..1a2f016e902 100644 --- a/src/iouring.c +++ b/src/iouring.c @@ -168,7 +168,10 @@ ssize_t ofi_sockapi_recvv_uring(struct ofi_sockapi *sockapi, SOCKET sock, if (!sqe) return -FI_EOVERFLOW; - io_uring_prep_readv(sqe, sock, iov, cnt, flags); + /* MSG_NOSIGNAL would return ENOTSUP with io_uring */ + flags &= ~MSG_NOSIGNAL; + + io_uring_prep_readv2(sqe, sock, iov, cnt, 0, flags); io_uring_sqe_set_data(sqe, ctx); ctx->uring_sqe_inuse = true; uring->credits--; diff --git a/src/xpmem.c b/src/xpmem.c index 82b9811e01b..456a92ee821 100644 --- a/src/xpmem.c +++ b/src/xpmem.c @@ -65,7 +65,7 @@ int ofi_xpmem_init(void) char buffer[1024]; uintptr_t address_max = 0; FILE *fh; - uintptr_t low, high; + uintptr_t high; char *tmp; fi_param_define(&core_prov, "xpmem_memcpy_chunksize", FI_PARAM_SIZE_T, @@ -87,7 +87,7 @@ int ofi_xpmem_init(void) while (fgets(buffer, sizeof(buffer), fh)) { /* each line of /proc/self/maps starts with low-high in * hexidecimal (without a 0x) */ - low = strtoul(buffer, &tmp, 
+		(void) strtoul(buffer, &tmp, 16);
 		high = strtoul(tmp + 1, NULL, 16);
 		if (address_max < high)
 			address_max = high;
@@ -108,7 +108,7 @@ int ofi_xpmem_init(void)
 	xpmem->pinfo.seg_id = xpmem_make(0, XPMEM_MAXADDR_SIZE,
 					 XPMEM_PERMIT_MODE, (void *) 0666);
 	if (xpmem->pinfo.seg_id == -1) {
-		FI_WARN(&core_prov, FI_LOG_CORE,
+		FI_INFO(&core_prov, FI_LOG_CORE,
 			"Failed to export process virtual address space for use with xpmem\n");
 		ret = -FI_ENODATA;
 		goto fail;
diff --git a/util/pingpong.c b/util/pingpong.c
index f8af6943320..9597d305121 100644
--- a/util/pingpong.c
+++ b/util/pingpong.c
@@ -982,7 +982,7 @@ static int generate_test_sizes(struct pp_opts *opts, size_t tx_size, int **sizes
 			n++;
 		}
 	} else {
-		for (i = 0;; i++) {
+		for (i = 0; i < 32; i++) {
 			power_of_two = (i == 0) ? 0 : (1 << i);
 			half_up =
 			    (i == 0) ? 1 : power_of_two + (power_of_two / 2);
@@ -1875,12 +1875,12 @@ static void pp_free_res(struct ct_pingpong *ct)
 {
 	PP_DEBUG("Freeing resources of test suite\n");
 
-	if (ct->mr != &(ct->no_mr))
-		PP_CLOSE_FID(ct->mr);
 	PP_CLOSE_FID(ct->ep);
 	PP_CLOSE_FID(ct->pep);
 	PP_CLOSE_FID(ct->rxcq);
 	PP_CLOSE_FID(ct->txcq);
+	if (ct->mr != &(ct->no_mr))
+		PP_CLOSE_FID(ct->mr);
 	PP_CLOSE_FID(ct->av);
 	PP_CLOSE_FID(ct->eq);
 	PP_CLOSE_FID(ct->domain);