From 0daf6332d494c0098e2148078491873c7d64916f Mon Sep 17 00:00:00 2001 From: Third Party Date: Thu, 20 Feb 2025 16:13:10 +0100 Subject: [PATCH 1/4] Temporarily remove mallocMC --- thirdParty/mallocMC/.clang-format | 165 -- thirdParty/mallocMC/.clang-tidy | 13 - thirdParty/mallocMC/.github/workflows/ci.yml | 43 - thirdParty/mallocMC/.gitignore | 37 - thirdParty/mallocMC/.pre-commit-config.yaml | 45 - thirdParty/mallocMC/.yamllint | 6 - thirdParty/mallocMC/.zenodo.json | 64 - thirdParty/mallocMC/CHANGELOG.md | 220 --- thirdParty/mallocMC/CMakeLists.txt | 89 -- thirdParty/mallocMC/CONTRIBUTING.md | 20 - thirdParty/mallocMC/INSTALL.md | 71 - thirdParty/mallocMC/LICENSE | 40 - thirdParty/mallocMC/README.md | 89 -- thirdParty/mallocMC/Usage.md | 162 -- .../mallocMC/examples/mallocMC_example01.cpp | 234 --- .../mallocMC/examples/mallocMC_example03.cpp | 137 -- .../mallocMC/alignmentPolicies/Noop.hpp | 69 - .../mallocMC/alignmentPolicies/Shrink.hpp | 151 -- .../src/include/mallocMC/allocator.hpp | 239 --- .../creationPolicies/FlatterScatter.hpp | 452 ------ .../FlatterScatter/AccessBlock.hpp | 823 ---------- .../FlatterScatter/BitField.hpp | 533 ------- .../FlatterScatter/DataPage.hpp | 42 - .../FlatterScatter/PageInterpretation.hpp | 343 ---- .../FlatterScatter/wrappingLoop.hpp | 73 - .../mallocMC/creationPolicies/OldMalloc.hpp | 92 -- .../mallocMC/creationPolicies/Scatter.hpp | 1404 ----------------- .../src/include/mallocMC/device_allocator.hpp | 122 -- .../mallocMC/distributionPolicies/Noop.hpp | 77 - .../distributionPolicies/XMallocSIMD.hpp | 194 --- .../src/include/mallocMC/mallocMC.hpp | 57 - .../mallocMC/mallocMC_allocator_handle.hpp | 65 - .../include/mallocMC/mallocMC_constraints.hpp | 91 -- .../include/mallocMC/mallocMC_hostclass.hpp | 33 - .../src/include/mallocMC/mallocMC_traits.hpp | 39 - .../src/include/mallocMC/mallocMC_utils.hpp | 216 --- .../oOMPolicies/BadAllocException.hpp | 78 - .../mallocMC/oOMPolicies/ReturnNull.hpp | 61 - .../reservePoolPolicies/AlpakaBuf.hpp | 65 - .../reservePoolPolicies/CudaSetLimits.hpp | 85 - .../mallocMC/src/include/mallocMC/version.hpp | 48 - .../tests/thread-safety/AccessBlock.cpp | 927 ----------- .../mallocMC/tests/thread-safety/BitField.cpp | 92 -- .../mallocMC/tests/thread-safety/Scatter.cpp | 859 ---------- .../mallocMC/tests/unit/AccessBlock.cpp | 532 ------- thirdParty/mallocMC/tests/unit/BitField.cpp | 247 --- .../tests/unit/PageInterpretation.cpp | 316 ---- thirdParty/mallocMC/tests/unit/PageTable.cpp | 54 - thirdParty/mallocMC/tests/unit/mocks.hpp | 76 - 49 files changed, 9990 deletions(-) delete mode 100644 thirdParty/mallocMC/.clang-format delete mode 100644 thirdParty/mallocMC/.clang-tidy delete mode 100644 thirdParty/mallocMC/.github/workflows/ci.yml delete mode 100644 thirdParty/mallocMC/.gitignore delete mode 100644 thirdParty/mallocMC/.pre-commit-config.yaml delete mode 100644 thirdParty/mallocMC/.yamllint delete mode 100644 thirdParty/mallocMC/.zenodo.json delete mode 100644 thirdParty/mallocMC/CHANGELOG.md delete mode 100644 thirdParty/mallocMC/CMakeLists.txt delete mode 100644 thirdParty/mallocMC/CONTRIBUTING.md delete mode 100644 thirdParty/mallocMC/INSTALL.md delete mode 100644 thirdParty/mallocMC/LICENSE delete mode 100644 thirdParty/mallocMC/README.md delete mode 100644 thirdParty/mallocMC/Usage.md delete mode 100644 thirdParty/mallocMC/examples/mallocMC_example01.cpp delete mode 100644 thirdParty/mallocMC/examples/mallocMC_example03.cpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp 
delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/allocator.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp delete mode 100644 thirdParty/mallocMC/src/include/mallocMC/version.hpp delete mode 100644 thirdParty/mallocMC/tests/thread-safety/AccessBlock.cpp delete mode 100644 thirdParty/mallocMC/tests/thread-safety/BitField.cpp delete mode 100644 thirdParty/mallocMC/tests/thread-safety/Scatter.cpp delete mode 100644 thirdParty/mallocMC/tests/unit/AccessBlock.cpp delete mode 100644 thirdParty/mallocMC/tests/unit/BitField.cpp delete mode 100644 thirdParty/mallocMC/tests/unit/PageInterpretation.cpp delete mode 100644 thirdParty/mallocMC/tests/unit/PageTable.cpp delete mode 100644 thirdParty/mallocMC/tests/unit/mocks.hpp diff --git a/thirdParty/mallocMC/.clang-format b/thirdParty/mallocMC/.clang-format deleted file mode 100644 index 7249ac3a43..0000000000 --- a/thirdParty/mallocMC/.clang-format +++ /dev/null @@ -1,165 +0,0 @@ -# General options -Language: Cpp -Standard: c++20 -DisableFormat: false -AccessModifierOffset: -4 -AlignAfterOpenBracket: AlwaysBreak -AlignArrayOfStructures: None -AlignConsecutiveAssignments: false -AlignConsecutiveBitFields: false -AlignConsecutiveDeclarations: false -AlignConsecutiveMacros: false -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: - Kind: Never -AllowAllArgumentsOnNextLine: false -AllowAllParametersOfDeclarationOnNextLine: false -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortEnumsOnASingleLine: false 
-AllowShortFunctionsOnASingleLine: None -AllowShortIfStatementsOnASingleLine: Never -AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false -BinPackParameters: false -BitFieldColonSpacing: Both -BreakAfterAttributes: Never -BreakBeforeBinaryOperators: All -BreakBeforeBraces: Allman -BreakBeforeConceptDeclarations: Always -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: BeforeComma -BreakInheritanceList: BeforeComma -BreakStringLiterals: true -ColumnLimit: 119 -CommentPragmas: '^ COMMENT pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: Always -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -IncludeBlocks: Regroup -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseBlocks: true -IndentCaseLabels: false -IndentExternBlock: AfterExternBlock -IndentGotoLabels: true -IndentPPDirectives: AfterHash -IndentRequiresClause: false -IndentWidth: 4 -IndentWrappedFunctionNames: false -InsertBraces: false -InsertNewlineAtEOF: true -IntegerLiteralSeparator: - Binary: 4 - Decimal: 3 - DecimalMinDigits: 7 - Hex: 4 -KeepEmptyLinesAtTheStartOfBlocks: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 2 -NamespaceIndentation: All -PackConstructorInitializers: CurrentLine -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 # default made explicit here -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 0 # default made explicit here -PenaltyReturnTypeOnItsOwnLine: 1000 -PointerAlignment: Left -PPIndentWidth: -1 # follow IndentWidth -QualifierAlignment: Custom -QualifierOrder: ['friend', 'static', 'inline', 'constexpr', 'type', 'const', 'volatile', 'restrict'] -ReferenceAlignment: Pointer # follow PointerAlignment -ReflowComments: true -RemoveBracesLLVM: false -RemoveSemicolon: false -RequiresClausePosition: WithPreceding -RequiresExpressionIndentation: OuterScope -ShortNamespaceLines: 0 -SortIncludes: true -SortUsingDeclarations: Lexicographic -SeparateDefinitionBlocks: Always -SpaceAfterCStyleCast: true -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: false -SpaceAroundPointerQualifiers: Default # follow PointerAlignment -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: Never -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: false -SpacesInConditionalStatement: false -SpacesInContainerLiterals: false -SpacesInCStyleCastParentheses: false -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParentheses: false -SpacesInSquareBrackets: false -TabWidth: 4 -UseCRLF: false -UseTab: Never -# Project specific options -#AttributeMacros: [] -#ForEachMacros: [] -#IfMacros: [] -IncludeCategories: - # Local headers (in "") 
above all else - - Regex: '"([A-Za-z0-9.\/-_])+"' - Priority: 1 - # "alpaka/foo.hpp" after local headers (occur inside alpaka) - - Regex: '"alpaka/([A-Za-z0-9.\/-_])+"' - Priority: 2 - # after local headers (occur outside alpaka in examples and test) - - Regex: '' - Priority: 3 - # C++ standard library headers are the last group to be included - - Regex: '<([A-Za-z0-9\/-_])+>' - Priority: 5 - # Includes that made it this far are third-party headers and will be placed - # below alpaka's includes - - Regex: '<([A-Za-z0-9.\/-_])+>' - Priority: 4 -# Macros: [] -# NamespaceMacros: [] -StatementAttributeLikeMacros: - - 'ALPAKA_DEVICE_VOLATILE' - - 'ALPAKA_FN_ACC' - - 'ALPAKA_FN_EXTERN' - - 'ALPAKA_FN_HOST' - - 'ALPAKA_FN_HOST_ACC' - - 'ALPAKA_FN_INLINE' - - 'ALPAKA_STATIC_ACC_MEM_CONSTANT' - - 'ALPAKA_STATIC_ACC_MEM_GLOBAL' - - 'ALPAKA_UNROLL' - - 'ALPAKA_VECTORIZE_HINT' -#StatementMacros: [] -#TypenameMacros: [] -#WhitespaceSensitiveMacros: [] diff --git a/thirdParty/mallocMC/.clang-tidy b/thirdParty/mallocMC/.clang-tidy deleted file mode 100644 index 4b599735bb..0000000000 --- a/thirdParty/mallocMC/.clang-tidy +++ /dev/null @@ -1,13 +0,0 @@ -Checks: | - *, - -*-avoid-c-arrays, - -altera*, - -*avoid-do-while, - -*constant-array-index, - -*pointer*arithmetic*, - -llvmlibc*, - -llvm-header-guard, - -fuchsia*, - -misc-non-private-member-variables-in-classes, - -cppcoreguidelines-pro-type-reinterpret-cast -HeaderFilterRegex: ".*" diff --git a/thirdParty/mallocMC/.github/workflows/ci.yml b/thirdParty/mallocMC/.github/workflows/ci.yml deleted file mode 100644 index fa8e9bf651..0000000000 --- a/thirdParty/mallocMC/.github/workflows/ci.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: Continuous Integration -on: [push, pull_request] -jobs: - pre-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: 3.x - - uses: pre-commit/action@v3.0.1 - - uses: pre-commit-ci/lite-action@v1.0.2 - if: always() - cpu-tests: - # This action only runs on various CPU backends. - # As such, this is not a fully-fletched production-like test. - # Hopefully, it will still save us from a few stupid mistakes. - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - run: sudo apt update && sudo apt install libboost-all-dev - - run: mkdir build_dir - - working-directory: build_dir - run: | - git submodule init && git submodule update - - working-directory: build_dir - run: | - cmake .. 
\ - -DCMAKE_CXX_FLAGS="-std=c++20 -g" \ - -Dalpaka_CXX_STANDARD=20 \ - -DmallocMC_CATCH2_PROVIDER=intern \ - -Dalpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE:BOOL=ON \ - -Dalpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE:BOOL=ON \ - -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE:BOOL=ON \ - -Dalpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE:BOOL=ON - - working-directory: build_dir - run: make -j tests examples - - working-directory: build_dir - run: ./tests - - working-directory: build_dir - run: ./mallocMC_Example01 - - working-directory: build_dir - run: ./mallocMC_Example03 diff --git a/thirdParty/mallocMC/.gitignore b/thirdParty/mallocMC/.gitignore deleted file mode 100644 index 00e52ae2fe..0000000000 --- a/thirdParty/mallocMC/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -# tmp files -*~ - -# Compiled Object files -*.slo -*.lo -*.o -/build - -# Compiled Dynamic libraries -*.so -*.dylib - -# Compiled Static libraries -*.lai -*.la -*.a - -# netbeans project files -/nbproject/ - -# Code::Blocks project files -/*.cbp -/*.layout - -# Visual Studio Code configuration files -.vscode -.vs - -# JetBrains project files -.idea/ - -# original backup files -*.orig - -.cache -compile_commands.json diff --git a/thirdParty/mallocMC/.pre-commit-config.yaml b/thirdParty/mallocMC/.pre-commit-config.yaml deleted file mode 100644 index b5368f3f05..0000000000 --- a/thirdParty/mallocMC/.pre-commit-config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -minimum_pre_commit_version: 3.2.0 # necessitated by Lucas-C's hooks -default_install_hook_types: [pre-commit, pre-push] -exclude: | - (?x)^( - alpaka/.*| - thirdParty/.* - )$ -repos: - - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 - hooks: - - id: clang-format - files: \.(cpp|hpp) - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - - id: no-commit-to-branch - args: [-b, dev] - - id: check-merge-conflict - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-toml - - id: check-yaml - args: ["--allow-multiple-documents"] - - id: mixed-line-ending - - id: check-executables-have-shebangs - - id: check-shebang-scripts-are-executable - - repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.5.4 - hooks: - - id: forbid-tabs - - id: remove-tabs - - id: forbid-crlf - - id: remove-crlf - - repo: meta - hooks: - - id: check-useless-excludes - - repo: https://github.com/google/yamlfmt - rev: v0.13.0 - hooks: - - id: yamlfmt - - repo: https://github.com/adrienverge/yamllint - rev: v1.35.1 - hooks: - - id: yamllint diff --git a/thirdParty/mallocMC/.yamllint b/thirdParty/mallocMC/.yamllint deleted file mode 100644 index 369e6ca9f4..0000000000 --- a/thirdParty/mallocMC/.yamllint +++ /dev/null @@ -1,6 +0,0 @@ -extends: default -rules: - document-start: disable - truthy: disable - comments: disable - line-length: disable diff --git a/thirdParty/mallocMC/.zenodo.json b/thirdParty/mallocMC/.zenodo.json deleted file mode 100644 index be594b1e0b..0000000000 --- a/thirdParty/mallocMC/.zenodo.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "title": "mallocMC - Memory Allocator for Many Core Architectures", - "description": "This project provides a framework for fast memory managers on many core accelerators. 
It is based on alpaka to run on many different accelerators and implements multiple algorithms.", - "keywords": [ - "mallocMC", - "CUDA", - "manycore", - "GPU", - "allocator" - ], - "language": "eng", - "access_right": "open", - "license": { - "id": "MIT" - }, - "creators": [ - { - "name": "Widera, René", - "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", - "orcid": "0000-0003-1642-0459" - }, - { - "name": "Lenz, Julian", - "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf", - "orcid": "0000-0001-5250-0005" - } - ], - "contributors": [ - { - "name": "Eckert, Carlchristian", - "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden", - "orcid": "0000-0002-6459-0842", - "type": "Other" - }, - { - "name": "Worpitz, Benjamin", - "type": "Other" - }, - { - "name": "Grund, Alexander", - "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", - "orcid": "0000-0002-7196-8452", - "type": "Other" - }, - { - "name": "Huebl, Axel", - "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", - "orcid": "0000-0003-1943-7141", - "type": "Other" - }, - { - "name": "Gruber, Bernhard Manfred", - "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, CASUS, CERN", - "orcid": "0000-0001-7848-1690", - "type": "Other" - }, - { - "name": "Bastrakov, Sergei", - "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", - "orcid": "0000-0003-3396-6154", - "type": "Other" - } - ] -} diff --git a/thirdParty/mallocMC/CHANGELOG.md b/thirdParty/mallocMC/CHANGELOG.md deleted file mode 100644 index fbfd5fd71f..0000000000 --- a/thirdParty/mallocMC/CHANGELOG.md +++ /dev/null @@ -1,220 +0,0 @@ -Change Log / Release Log for mallocMC -================================================================ - -2.5.0crp --------- -**Date:** 2021-02-18 - -This release removes the native usage of CUDA by alpaka. -Attention: This release depends on an unreleased [alpaka 0.5.0dev](https://github.com/alpaka-group/alpaka/commit/34870a73ecf702069465aa030fbdf301c4d22c61) -version before the heavy alpaka namespace refactoring. - -### Changes to mallocMC 2.4.0crp - -**Features** -- Port to alpaka #173 - -**Bug fixes** -- fix HIP support (warpsize, activemask, compile issues) #182 -- fix data race and printf issue #189 -- fix data races in `Scatter.hpp` #190 -- fix clang cuda compile #192 - -**Misc:** -- Added alpaka subtree and switched to C++14 #176 -- Added 3rd party catch.hpp and made CMake find it #179 -- Update documentation after switch to alpaka #194 -- Update .clang-format and apply clang-format #197 - -Thanks to Bernhard Manfred Gruber and Rene Widera for contributing to this release! - -2.4.0crp --------- -**Date:** 2020-05-28 - -This release removes the Boost dependency and switched to C++11. - -### Changes to mallocMC 2.3.1crp - -**Features** - - Cleaning, remove Boost dependency & C++11 Migration #169 - -**Bug fixes** - - Choose the value for the -arch nvcc flag depending on CUDA version #164 #165 - -**Misc:** - - Travis CI: GCC 5.5.0 + CUDA 9.1.85 #170 - - Adding headers to projects and applied clang-tidy #171 - - clang-format #172 - -Thanks to Sergei Bastrakov, Bernhard Manfred Gruber and Axel Huebl for contributing to this release! - -2.3.1crp --------- -**Date:** 2019-02-14 - -A critical bug was fixed which can result in an illegal memory access. 
- -### Changes to mallocMC 2.3.0crp - -**Bug fixes** - - fix illegal memory access in `XMallocSIMD` #150 - -**Misc:** - - CMake: Honor `_ROOT` Env Hints #154 - - -2.3.0crp --------- -**Date:** 2018-06-11 - -This release adds support for CUDA 9 and clang's -x cuda frontend and fixes several bugs. -Global objects have been refactored to separate objects on host and device. - -### Changes to mallocMC 2.2.0crp - -**Features** - - CUDA 9 support #144 #145 - - clang++ -x cuda support #133 - - add `destructiveResize` method #136 - - heap as separate object on host and device, no more globals #116 - - use `BOOST_STATIC_CONSTEXPR` where possible #109 - -**Bug fixes** - - fix uninitialized pointers #110 #112 - - fix crash in getAvailableSlots #106 #107 - - Fix `uint32_t` cstdint #104 #105 - - fix missing boost include #142 - - fix includes from C headers #121 - - fix missing local size change in `finalizeHeap()` #135 - - check heap pointer in Scatter creation policy #126 - -**Misc:** - - better link usage and install docs #141 - - self consistent allocator #140 - - rename some shadowed variables in C++11 mode #108 - - properly enforce `-Werror` in Travis-CI #128 - - update Travis-CI image #119 - - improved docs #125 #127 - -Thanks to Carlchristian Eckert, René Widera, Axel Huebl and Alexander Grund for contributing to this release! - - -2.2.0crp -------------- -**Date:** 2015-09-25 - -This release fixes some minor bugs that occured after the release of 2.1.0crp, adds some documentation and improves the interoperability with other projects and build systems. -We closed all issues documented in -[Milestone *2.2.0crp: Stabilizing the release*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=5&state=closed) - -### Changes to mallocMC 2.1.0crp - -**Features** - - the interface now provides the host function `HeapInfoVector getHeapLocations()` to obtain information about the location and size of existing mallocMC-heaps #86 - -**Bug fixes** - - the function `getAvailableSlots` was always required in the policy classes, although the implementations might not provide it #89 - -**Misc:** - - the code relied on `__TROW` being defined, which is not available in all compilers #91 - - the CMake dependency increased to CMake >= 2.8.12.2 #92 - - a new FindmallocMC.cmake module file is provided at https://github.com/ComputationalRadiationPhysics/cmake-modules #85 - - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.1.0crp...2.2.0crp - - -2.1.0crp -------------- -**Date:** 2015-02-11 - -This release fixes some bugs that occured after the release of 2.0.1crp and reduces the interface to improve interoperability with the default CUDA allocator. -We closed all issues documented in -[Milestone *New Features*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=3&state=closed) - -### Changes to mallocMC 2.0.1crp - -**Features** - - the possibility to overwrite the default implementation of new/delete and malloc/free was removed #72. **This changes the interface**, since users are now always forced to call `mallocMC::malloc()` and `mallocMC::free()`. This is intended to improve readability and allows to use the CUDA allocator inside mallocMC. - - the policy *Scatter* now places the onpagetables data structure at the end of a page. 
This can greatly improve performance when using large pages and `resetfreedpages=true` #80 - -**Bug fixes** - - in the policy *Scatter*, `fullsegments` and `additional_chunks` could grow too large in certain configurations #79 - -**Misc:** - - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.0.1crp...2.1.0crp - - -2.0.1crp -------------- -**Date:** 2015-01-13 - -This release fixes several bugs that occured after the release of 2.0.0crp. -We closed all issues documented in -[Milestone *Bugfixes*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=4&state=closed) - -### Changes to mallocMC 2.0.0crp - -**Bug fixes** - - page table metadata was not correctly initialized with 0 #70 - - freeing pages would not work under certain circumstances #66 - - the bitmask in a page table entry could be wrong due to a racecondition #62 - - not all regions were initialized correctly #60 - - getAvailableSlots could sometimes miss blocks #59 - - the counter for elements in a page could get too high due to a racecondition #61 - - Out of Memory (OOM) Policy sometimes did not recognize allocation failures correctly #67 - -**Misc:** - - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.0.0crp...2.0.1crp - - -2.0.0crp -------------- -**Date:** 2014-06-02 - -This release introduces mallocMC, which contains the previous algorithm and -much code from ScatterAlloc 1.0.2crp. The project was renamed due to massive -restructurization and because the code uses ScatterAlloc as a reference -algorithm, but can be extended to include other allocators in the future. -We closed all issues documented in -[Milestone *Get Lib ready for PIConGPU*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=2&state=closed) - -### Changes to ScatterAlloc 1.0.2crp - -**Features** - - completely split into policies #17 - - configuration through structs instead of macro #17 - - function `getAvailableSlots()` #5 - - selectable data alignment #14 - - function `finalizeHeap()` #11 - -**Bug fixes:** - - build warning for cmake #33 - -**Misc:** - - verification code and examples #35 - - install routines #4 - - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/1.0.2crp...2.0.0crp - - -1.0.2crp -------------- -**Date:** 2014-01-07 - -This is our first bug fix release. 
-We closed all issues documented in -[Milestone *Bug fixes*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=1&state=closed) - -### Changes to 1.0.1 - -**Features:** - - added travis-ci.org support for compile tests #7 - -**Bug fixes:** - - broken cmake/compile #1 - - g++ warnings #10 - - only N-1 access blocks used instead of N #2 - - 32bit bug: allocate more than 4GB #12 - -**Misc:** - See the full changes at - https://github.com/ComputationalRadiationPhysics/scatteralloc/compare/1.0.1...1.0.2crp diff --git a/thirdParty/mallocMC/CMakeLists.txt b/thirdParty/mallocMC/CMakeLists.txt deleted file mode 100644 index c7956d16f1..0000000000 --- a/thirdParty/mallocMC/CMakeLists.txt +++ /dev/null @@ -1,89 +0,0 @@ -project(mallocMC LANGUAGES CXX) -cmake_minimum_required(VERSION 3.18) - -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -if(POLICY CMP0074) - cmake_policy(SET CMP0074 NEW) -endif() - -# find alpaka -set(mallocMC_ALPAKA_PROVIDER "intern" CACHE STRING "Select which alpaka is used") -set_property(CACHE mallocMC_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern") -mark_as_advanced(mallocMC_ALPAKA_PROVIDER) -if(${mallocMC_ALPAKA_PROVIDER} STREQUAL "intern") - set(alpaka_BUILD_EXAMPLES OFF) - set(BUILD_TESTING OFF) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/alpaka ${CMAKE_BINARY_DIR}/alpaka) -else() - find_package(alpaka HINTS $ENV{ALPAKA_ROOT}) -endif() - -if(NOT TARGET alpaka::alpaka) - message(FATAL "Required mallocMC dependency alpaka could not be found!") -endif() - -# Catch2 -set(mallocMC_CATCH2_PROVIDER "intern" CACHE STRING "Select which Catch2 is used") -set_property(CACHE mallocMC_CATCH2_PROVIDER PROPERTY STRINGS "intern;extern") -mark_as_advanced(mallocMC_CATCH2_PROVIDER) - -# for installation, just copy include folder to install folder -install( - DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/src/include/." 
-    DESTINATION include
-)
-
-# warnings
-add_library(warnings INTERFACE)
-if(CMAKE_COMPILER_IS_GNUCXX)
-    target_compile_options(warnings INTERFACE -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs)
-elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
-    target_compile_options(warnings INTERFACE -Wall -Wshadow)
-elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
-    target_compile_options(warnings INTERFACE -Minform=inform)
-endif()
-
-# Executables
-file(GLOB_RECURSE headers src/include/**)
-add_custom_target(mallocMCIde SOURCES ${headers}) # create a target with the header files for IDE projects
-source_group(TREE ${CMAKE_CURRENT_LIST_DIR}/src/include FILES ${headers})
-
-alpaka_add_executable(mallocMC_Example01 EXCLUDE_FROM_ALL examples/mallocMC_example01.cpp)
-target_include_directories(mallocMC_Example01 PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
-target_link_libraries(mallocMC_Example01 PUBLIC alpaka::alpaka warnings)
-
-alpaka_add_executable(mallocMC_Example03 EXCLUDE_FROM_ALL examples/mallocMC_example03.cpp)
-target_include_directories(mallocMC_Example03 PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
-target_link_libraries(mallocMC_Example03 PUBLIC alpaka::alpaka warnings)
-
-add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03)
-
-if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern")
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/thirdParty/catch2 ${CMAKE_BINARY_DIR}/catch2)
-    include(Catch)
-else()
-    # get Catch2 v3 and build it from source with the same C++ standard as the tests
-    Include(FetchContent)
-    FetchContent_Declare(Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git GIT_TAG v3.7.1)
-    FetchContent_MakeAvailable(Catch2)
-    target_compile_features(Catch2 PUBLIC cxx_std_20)
-    include(Catch)
-
-    # hide Catch2 cmake variables by default in cmake gui
-    get_cmake_property(variables VARIABLES)
-    foreach (var ${variables})
-        if (var MATCHES "^CATCH_")
-            mark_as_advanced(${var})
-        endif()
-    endforeach()
-endif()
-
-file(GLOB_RECURSE testSources "${CMAKE_CURRENT_SOURCE_DIR}/tests/*/*.cpp")
-alpaka_add_executable(tests EXCLUDE_FROM_ALL ${testSources})
-catch_discover_tests(tests)
-source_group(TREE "${CMAKE_CURRENT_LIST_DIR}/tests" FILES ${testSources})
-target_compile_features(tests PRIVATE cxx_std_20)
-target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
-target_link_libraries(tests PRIVATE alpaka::alpaka Catch2::Catch2WithMain)
diff --git a/thirdParty/mallocMC/CONTRIBUTING.md b/thirdParty/mallocMC/CONTRIBUTING.md
deleted file mode 100644
index 64e12b31af..0000000000
--- a/thirdParty/mallocMC/CONTRIBUTING.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Contributing
-
-## Formatting
-
-Please format your code before opening pull requests using clang-format and the .clang-format file placed in the repository root.
-
-### Visual Studio and CLion
-Support for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
-The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey.
-
-### Bash
-First install clang-format. Instructions for this can be found on the web.
-To format your changes since branching off `dev`, you can run this command in bash:
-```
-git clang-format dev
-```
-To format all code in your working copy, you can run this command in bash:
-```
-find -iname *.cpp -o -iname *.hpp | xargs clang-format -i
-```
diff --git a/thirdParty/mallocMC/INSTALL.md b/thirdParty/mallocMC/INSTALL.md
deleted file mode 100644
index 4e06d82097..0000000000
--- a/thirdParty/mallocMC/INSTALL.md
+++ /dev/null
@@ -1,71 +0,0 @@
-Install
--------
-### Dependencies
-  - C++20 compiler (clang, gcc, hipcc, icc, nvcc)
-    - *Debian/Ubuntu:* `sudo apt-get install gcc build-essential`
-    - *Arch Linux:* `sudo pacman -S base-devel`
-  - `alpaka` 1.2.0
-    - included as git submodule
-  - `boost` >= 1.65.1
-    - dependency of alpaka
-    - *Debian/Ubuntu:* `sudo apt-get install libboost-dev libboost-program-options-dev`
-    - *Arch Linux:* `sudo pacman -S boost`
-    - or download from [http://www.boost.org/](http://www.boost.org/)
-  - `CMake` >= 3.15
-    - *Debian/Ubuntu:* `sudo apt-get install cmake file cmake-curses-gui`
-    - *Arch Linux:* `sudo pacman -S cmake`
-  - `git` >= 1.7.9.5
-    - *Debian/Ubuntu:* `sudo apt-get install git`
-    - *Arch Linux:* `sudo pacman -S git`
-
-
-### Examples
-This is an example of how to compile `mallocMC` and test the example code snippets
-
-1. **Setup directories:**
-   - `mkdir -p build`
-2. **Download the source code:**
-   - `git clone https://github.com/alpaka-group/mallocMC.git`
-3. **Build**
-   - `cd build`
-   - `cmake ../mallocMC -DCMAKE_INSTALL_PREFIX=$HOME/libs`
-   - `make examples`
-   - `make install` (optional)
-4. **Run the examples**
-   - `./mallocMC_Example01`
-   - `./mallocMC_Example03`
-
-
-Linking to your Project
------------------------
-
-To use mallocMC in your project, you must include the header `mallocMC/mallocMC.hpp` and
-add the correct include path.
-
-Because we are linking to Boost and CUDA, the following **external dependencies** must be linked:
-- `-lboost`
-
-If you are using CMake you can download our `FindmallocMC.cmake` module with
-```bash
-wget https://raw.githubusercontent.com/ComputationalRadiationPhysics/cmake-modules/dev/FindmallocMC.cmake
-# read the documentation
-cmake -DCMAKE_MODULE_PATH=. 
--help-module FindmallocMC | less
-```
-
-and use the following lines in your `CMakeLists.txt`:
-```cmake
-# this example will require at least CMake 3.15
-CMAKE_MINIMUM_REQUIRED(VERSION 3.15)
-
-# add path to FindmallocMC.cmake, e.g., in the cmake/ directory
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/)
-
-# find mallocMC installation
-find_package(mallocMC 2.6.0 REQUIRED)
-
-alpaka_add_executable(yourBinary ${SOURCES})
-target_include_directories(yourBinary PUBLIC ${mallocMC_INCLUDE_DIRS})
-target_link_libraries(yourBinary PUBLIC alpaka::alpaka)
-```
diff --git a/thirdParty/mallocMC/LICENSE b/thirdParty/mallocMC/LICENSE
deleted file mode 100644
index 7c7870ae48..0000000000
--- a/thirdParty/mallocMC/LICENSE
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-  mallocMC: Memory Allocation for Many Core Architectures
-
-  based on the work of ScatterAlloc:
-  Massively Parallel Dynamic Memory Allocation for the GPU
-
-  http://www.icg.tugraz.at/project/mvp
-  https://www.hzdr.de/crp
-
-  Copyright (C) 2012 Institute for Computer Graphics and Vision,
-                     Graz University of Technology
-  Copyright (C) 2014-2024 Institute of Radiation Physics,
-                          Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
-              Bernhard Kainz - kainz ( at ) icg.tugraz.at
-              Michael Kenzel - kenzel ( at ) icg.tugraz.at
-              Rene Widera - r.widera ( at ) hzdr.de
-              Axel Huebl - a.huebl ( at ) hzdr.de
-              Carlchristian Eckert - c.eckert ( at ) hzdr.de
-              Julian Lenz - j.lenz ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
diff --git a/thirdParty/mallocMC/README.md b/thirdParty/mallocMC/README.md
deleted file mode 100644
index b99fa52e2d..0000000000
--- a/thirdParty/mallocMC/README.md
+++ /dev/null
@@ -1,89 +0,0 @@
-mallocMC
-=============
-
-mallocMC: *Memory Allocator for Many Core Architectures*
-
-This project provides a framework for **fast memory managers** on **many core
-accelerators**. It is based on [alpaka](https://github.com/alpaka-group/alpaka)
-to run on many different accelerators and comes with multiple allocation
-algorithms out-of-the-box. Custom ones can be added easily due to the
-policy-based design.
-
-Usage
--------
-
-Follow the step-by-step instructions in [Usage.md](Usage.md) to replace your
-`new`/`malloc` calls with a *blazingly fast* mallocMC heap! :rocket:
-
-Install
--------
-
-mallocMC is header-only, but requires a few other C++ libraries to be
-available. Our installation notes can be found in [INSTALL.md](INSTALL.md).
-
-Contributing
-------------
-
-Rules for contributions are found in [CONTRIBUTING.md](./CONTRIBUTING.md).
-
-On the Algorithms
------------------------------
-
-This library was originally inspired by the *ScatterAlloc* algorithm and was
-[forked](https://en.wikipedia.org/wiki/Fork_%28software_development%29)
-from the **ScatterAlloc** project, developed by the
-[Managed Volume Processing](http://www.icg.tugraz.at/project/mvp)
-group at the [Institute for Computer Graphics and Vision](http://www.icg.tugraz.at),
-TU Graz (kudos!). The currently shipped algorithms use similar ideas but
-differ from the original one significantly.
-
-From the original project page (which, to the best of our knowledge, no
-longer exists):
-
-```quote
-ScatterAlloc is a dynamic memory allocator for the GPU. It is
-designed concerning the requirements of massively parallel
-execution.
-
-ScatterAlloc greatly reduces collisions and congestion by
-scattering memory requests based on hashing. It can deal with
-thousands of GPU-threads concurrently allocating memory and its
-execution time is almost independent of the thread count.
-
-ScatterAlloc is open source and easy to use in your CUDA projects.
-```
-
-Our Homepage: 
-
-Versions and Releases
----------------------
-
-Official releases can be found in the
-[GitHub releases](https://github.com/alpaka-group/mallocMC/releases).
-We try to stick to [semantic versioning](https://semver.org/), but we'll bump
-the major version number for major features.
-Development happens on the `dev` branch.
-Changes there have passed the CI and a code review, but we make no guarantees
-about API or feature stability in this branch.
-
-Literature
-----------
-
-Just an incomplete link collection for now:
-
-- [Paper](https://doi.org/10.1109/InPar.2012.6339604) by
-  Markus Steinberger, Michael Kenzel, Bernhard Kainz and Dieter Schmalstieg
-
-- 2012, May 5th: [Presentation](http://innovativeparallel.org/Presentations/inPar_kainz.pdf)
-  at *Innovative Parallel Computing 2012* by *Bernhard Kainz*
-
-- Junior Thesis [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.34461.svg)](http://dx.doi.org/10.5281/zenodo.34461) by
-  Carlchristian Eckert (2014)
-
-License
--------
-
-We distribute the modified software under the same license as the
-original software from TU Graz (by using the
-[MIT License](https://en.wikipedia.org/wiki/MIT_License)).
-Please refer to the [LICENSE](LICENSE) file.
diff --git a/thirdParty/mallocMC/Usage.md b/thirdParty/mallocMC/Usage.md
deleted file mode 100644
index 45963ee032..0000000000
--- a/thirdParty/mallocMC/Usage.md
+++ /dev/null
@@ -1,162 +0,0 @@
-Usage
-=====
-
-Step 1: include
----------------
-
-There is one header file that will include *all* necessary files:
-
-```c++
-#include <mallocMC/mallocMC.hpp>
-```
-
-Step 2a: choose policies
------------------------
-
-Each instance of a policy-based allocator is composed through 5 **policies**.
-Each policy is expressed as a **policy class**.
-
-Currently, there are the following policy classes available:
-
-|Policy | Policy Classes (implementations) | description |
-|------- |----------------------------------| ----------- |
-|**CreationPolicy** | Scatter`<conf1, conf2>` | A scattered allocation to trade off fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). 
`conf1` configures the heap layout, `conf2` determines the hashing parameters|
-| | FlatterScatter`<conf1, conf2>` | Another scattered allocation algorithm similar in spirit to `Scatter` but with a flatter hierarchy and stronger concurrency invariants. `conf1` and `conf2` act as before. |
-| | OldMalloc | Device-side malloc/new and free/delete syscalls as implemented on the given device. |
-|**DistributionPolicy** | XMallocSIMD`<conf>` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
-| | Noop | no workload distribution at all |
-|**OOMPolicy** | ReturnNull | pointers will be *nullptr* if the request could not be fulfilled |
-| | ~~BadAllocException~~ | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions |
-|**ReservePoolPolicy** | AlpakaBuf | Allocate a fixed-size buffer in an `alpaka`-provided container. |
-| | CudaSetLimits | calls `cudaDeviceSetLimit` to increase the available heap (e.g. when using *OldMalloc*) |
-|**AlignmentPolicy** | Shrink`<conf>` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
-| | Noop | no alignment at all |
-
-The user has to choose one of each policy to form a useful allocator
-(see [here](Usage.md#2c-combine-policies)).
-
-Step 2b: configure policies
----------------------------
-
-Some of those policies are templates that can be configured through a
-configuration struct. The default struct can be accessed through
-```PolicyNamespace::PolicyClass<>::Properties```, which allows inheriting
-from a struct to modify some of its parameters before passing it
-to the policy class:
-
-```c++
-// configure the AlignmentPolicy "Shrink"
-struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties {
-    static constexpr auto dataAlignment = 16;
-};
-```
-
-Step 2c: combine policies
--------------------------
-
-After configuring the chosen policies, they can be used as template
-parameters to create the desired allocator type:
-
-```c++
-using namespace mallocMC;
-
-using Allocator1 = mallocMC::Allocator<
-    CreationPolicies::OldMalloc,
-    DistributionPolicies::Noop,
-    OOMPolicies::ReturnNull,
-    ReservePoolPolicies::CudaSetLimits,
-    AlignmentPolicies::Noop
->;
-```
-
-`Allocator1` will resemble the behaviour of classical device-side allocation known
-from NVIDIA CUDA since compute capability sm_20. To get a more modern allocator, one
-could create the following alias instead:
-
-```c++
-using namespace mallocMC;
-
-using ScatterAllocator = mallocMC::Allocator<
-    CreationPolicies::Scatter<>,
-    DistributionPolicies::XMallocSIMD<>,
-    OOMPolicies::ReturnNull,
-    ReservePoolPolicies::SimpleCudaMalloc,
-    AlignmentPolicies::Shrink<ShrinkConfig>
->;
-```
-
-Notice how the policy classes `Scatter` and `XMallocSIMD` are instantiated without
-template arguments to use the default configuration. `Shrink`, however, uses the
-configuration struct defined above.
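Editor's note: the shipped examples in this patch (see `examples/mallocMC_example01.cpp` further below) compose allocators the same way, but additionally pass the alpaka accelerator type as the first template argument of `mallocMC::Allocator` and reserve the heap through `AlpakaBuf`. A minimal sketch of that fuller form follows; the alias name `FlatterScatterAllocator` is ours, and it assumes the `ShrinkConfig` struct from step 2b is in scope:

```c++
#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <mallocMC/mallocMC.hpp>

using Dim = alpaka::DimInt<1>;
using Idx = std::size_t;
// Any alpaka accelerator works; the examples pick the default one for the platform.
using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;

using FlatterScatterAllocator = mallocMC::Allocator<
    Acc,
    mallocMC::CreationPolicies::FlatterScatter<>, // default heap configuration
    mallocMC::DistributionPolicies::Noop,
    mallocMC::OOMPolicies::ReturnNull,
    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>, // pool lives in an alpaka buffer
    mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>; // ShrinkConfig from step 2b
```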
-
-Step 3: instantiate allocator
------------------------------
-
-To use the defined allocator type, create an instance with the desired heap size:
-
-```c++
-ScatterAllocator sa( 512U * 1024U * 1024U ); // heap size of 512MiB
-```
-
-The allocator object offers the following methods:
-
-| Name | description |
-|---------------------- |-------------------------|
-| getAllocatorHandle() | Acquire a handle from the allocator that can be used in kernels to allocate memory on device. |
-| getAvailableSlots(size_t) | Determines the number of allocatable slots of a certain size. This only works if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<Allocator>::providesAvailableSlots`) |
-
-One should note that on a running system with multiple threads manipulating
-memory, the information provided by `getAvailableSlots` is stale the moment it is
-acquired, so relying on this information to be accurate is not recommended.
-It is supposed to be used in initialisation/finalisation phases without dynamic
-memory allocations or in tests.
-
-Step 4: use dynamic memory allocation in a kernel
--------------------------------------------------
-
-A handle to the allocator object must be passed to each kernel. The handle type is defined as an internal type of the allocator. Inside the kernel, this handle can be used to request memory.
-
-The handle offers the following methods:
-
-| Name | description |
-|---------------------- |-------------------------|
-| malloc(size_t) | Allocates memory on the accelerator |
-| free(void*) | Frees memory on the accelerator |
-| getAvailableSlots(size_t) | Determines the number of allocatable slots of a certain size. This only works if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<Allocator>::providesAvailableSlots`).|
-
-The comments on `getAvailableSlots` from above apply here as well.
-A simplistic example would look like this:
-
-```c++
-#include <mallocMC/mallocMC.hpp>
-
-namespace MC = mallocMC;
-
-using ScatterAllocator = MC::Allocator<
-    MC::CreationPolicies::Scatter<>,
-    MC::DistributionPolicies::XMallocSIMD<>,
-    MC::OOMPolicies::ReturnNull,
-    MC::ReservePoolPolicies::SimpleCudaMalloc,
-    MC::AlignmentPolicies::Shrink<ShrinkConfig>
->;
-
-__global__ void exampleKernel(ScatterAllocator::AllocatorHandle sah)
-{
-    // some code ...
-
-    int* a = (int*) sah.malloc(sizeof(int) * 42);
-
-    // some more code, using *a
-
-    sah.free(a);
-}
-
-int main(){
-    ScatterAllocator sa( 512U * 1024U * 1024U ); // heap size of 512MiB
-    exampleKernel<<< 32, 32 >>>(sa.getAllocatorHandle());
-
-    return 0;
-}
-```
-
-For more usage examples, have a look at the [examples](examples).
diff --git a/thirdParty/mallocMC/examples/mallocMC_example01.cpp b/thirdParty/mallocMC/examples/mallocMC_example01.cpp
deleted file mode 100644
index 002af965f6..0000000000
--- a/thirdParty/mallocMC/examples/mallocMC_example01.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
- https://www.hzdr.de/crp - - Copyright 2014 - 2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#include "mallocMC/creationPolicies/FlatterScatter.hpp" -#include "mallocMC/creationPolicies/OldMalloc.hpp" - -#include -#include - -#include - -#include -#include -#include -#include -#include - -using mallocMC::CreationPolicies::FlatterScatter; -using mallocMC::CreationPolicies::OldMalloc; -using mallocMC::CreationPolicies::Scatter; - -using Dim = alpaka::DimInt<1>; -using Idx = std::size_t; - -// Define the device accelerator -using Acc = alpaka::ExampleDefaultAcc; - -constexpr uint32_t const blocksize = 2U * 1024U * 1024U; -constexpr uint32_t const pagesize = 4U * 1024U; -constexpr uint32_t const wasteFactor = 1U; - -// This happens to also work for the original Scatter algorithm, so we only define one. 
-struct FlatterScatterHeapConfig : FlatterScatter<>::Properties::HeapConfig -{ - static constexpr auto accessblocksize = blocksize; - static constexpr auto pagesize = ::pagesize; - static constexpr auto heapsize = 2U * 1024U * 1024U * 1024U; - // Only used by original Scatter (but it doesn't hurt FlatterScatter to keep): - static constexpr auto regionsize = 16; - static constexpr auto wastefactor = wasteFactor; -}; - -struct XMallocConfig -{ - static constexpr auto pagesize = FlatterScatterHeapConfig::pagesize; -}; - -struct ShrinkConfig -{ - static constexpr auto dataAlignment = 16; -}; - -ALPAKA_STATIC_ACC_MEM_GLOBAL int** arA; -ALPAKA_STATIC_ACC_MEM_GLOBAL int** arB; -ALPAKA_STATIC_ACC_MEM_GLOBAL int** arC; - -template -auto example01() -> int -{ - using Allocator = mallocMC::Allocator< - Acc, - T_CreationPolicy, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; - - constexpr auto length = 100; - - auto const platform = alpaka::Platform{}; - auto const dev = alpaka::getDevByIdx(platform, 0); - auto queue = alpaka::Queue{dev}; - - auto const devProps = alpaka::getAccDevProps(dev); - unsigned const block = std::min(static_cast(32U), static_cast(devProps.m_blockThreadCountMax)); - - // round up - auto grid = (length + block - 1U) / block; - assert(length <= block * grid); // necessary for used algorithm - - // init the heap - std::cerr << "initHeap..."; - auto const heapSize = 2U * 1024U * 1024U * 1024U; - Allocator scatterAlloc(dev, queue, heapSize); // 1GB for device-side malloc - std::cerr << "done\n"; - std::cout << Allocator::info("\n") << '\n'; - - // create arrays of arrays on the device - { - auto createArrayPointers - = [] ALPAKA_FN_ACC(Acc const& acc, int x, int y, Allocator::AllocatorHandle allocHandle) - { - arA = static_cast(allocHandle.malloc(acc, sizeof(int*) * x * y)); - arB = static_cast(allocHandle.malloc(acc, sizeof(int*) * x * y)); - arC = static_cast(allocHandle.malloc(acc, sizeof(int*) * x * y)); - }; - auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv, - createArrayPointers, - grid, - block, - scatterAlloc.getAllocatorHandle())); - } - - // fill 2 of them all with ascending values - { - auto fillArrays = [] ALPAKA_FN_ACC(Acc const& acc, int localLength, Allocator::AllocatorHandle allocHandle) - { - auto const id = alpaka::getIdx(acc)[0]; - - arA[id] = static_cast(allocHandle.malloc(acc, localLength * sizeof(int))); - arB[id] = static_cast(allocHandle.malloc(acc, localLength * sizeof(int))); - arC[id] = static_cast(allocHandle.malloc(acc, localLength * sizeof(int))); - - for(int i = 0; i < localLength; ++i) - { - arA[id][i] = static_cast(id * localLength + i); - arB[id][i] = static_cast(id * localLength + i); - } - }; - auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel(workDiv, fillArrays, length, scatterAlloc.getAllocatorHandle())); - } - - // add the 2 arrays (vector addition within each thread) - // and do a thread-wise reduce to sums - { - auto sumsBufferAcc = alpaka::allocBuf(dev, Idx{block * grid}); - - auto addArrays = [] ALPAKA_FN_ACC(Acc const& acc, int localLength, int* sums) - { - auto const id = alpaka::getIdx(acc)[0]; - - sums[id] = 0; - for(int i = 0; i < localLength; ++i) - { - arC[id][i] = arA[id][i] + arB[id][i]; - sums[id] += arC[id][i]; - } - }; - auto const workDiv = 
alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel(workDiv, addArrays, length, alpaka::getPtrNative(sumsBufferAcc))); - - auto const platformCPU = alpaka::Platform{}; - auto const hostDev = alpaka::getDevByIdx(platformCPU, 0); - - auto sumsBufferHost = alpaka::allocBuf(hostDev, Idx{block * grid}); - alpaka::memcpy(queue, sumsBufferHost, sumsBufferAcc, Idx{block * grid}); - alpaka::wait(queue); - - auto const* sumsPtr = alpaka::getPtrNative(sumsBufferHost); - auto const sum = std::accumulate(sumsPtr, sumsPtr + block * grid, size_t{0}); - std::cout << "The sum of the arrays on GPU is " << sum << '\n'; - } - - auto const n = static_cast(block * grid * length); - auto const gaussian = n * (n - 1); - std::cout << "The gaussian sum as comparison: " << gaussian << '\n'; - - /*constexpr*/ if(mallocMC::Traits::providesAvailableSlots) - { - std::cout << "there are "; - std::cout << scatterAlloc.getAvailableSlots(dev, queue, 1024U * 1024U); - std::cout << " Slots of size 1MB available\n"; - } - - { - auto freeArrays = [] ALPAKA_FN_ACC(Acc const& acc, Allocator::AllocatorHandle allocHandle) - { - auto const id = alpaka::getIdx(acc)[0]; - allocHandle.free(acc, arA[id]); - allocHandle.free(acc, arB[id]); - allocHandle.free(acc, arC[id]); - }; - auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; - alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, freeArrays, scatterAlloc.getAllocatorHandle())); - } - - { - auto freeArrayPointers = [] ALPAKA_FN_ACC(Acc const& acc, Allocator::AllocatorHandle allocHandle) - { - allocHandle.free(acc, arA); - allocHandle.free(acc, arB); - allocHandle.free(acc, arC); - }; - auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel(workDiv, freeArrayPointers, scatterAlloc.getAllocatorHandle())); - } - - return 0; -} - -auto main(int /*argc*/, char* /*argv*/[]) -> int -{ - example01>(); - example01>(); - example01(); - return 0; -} diff --git a/thirdParty/mallocMC/examples/mallocMC_example03.cpp b/thirdParty/mallocMC/examples/mallocMC_example03.cpp deleted file mode 100644 index 8e183fdeb3..0000000000 --- a/thirdParty/mallocMC/examples/mallocMC_example03.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 - 2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#include "mallocMC/creationPolicies/OldMalloc.hpp"
-
-#include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
-
-#include <mallocMC/mallocMC.hpp>
-
-#include <algorithm>
-#include <iostream>
-
-using mallocMC::CreationPolicies::FlatterScatter;
-using mallocMC::CreationPolicies::OldMalloc;
-using mallocMC::CreationPolicies::Scatter;
-
-using Dim = alpaka::DimInt<1>;
-using Idx = std::size_t;
-
-// Define the device accelerator
-using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
-
-constexpr uint32_t const blocksize = 2U * 1024U * 1024U;
-constexpr uint32_t const pagesize = 4U * 1024U;
-constexpr uint32_t const wasteFactor = 1U;
-
-// This happens to also work for the original Scatter algorithm, so we only define one.
-struct FlatterScatterHeapConfig : FlatterScatter<>::Properties::HeapConfig
-{
-    static constexpr auto accessblocksize = blocksize;
-    static constexpr auto pagesize = ::pagesize;
-    static constexpr auto heapsize = 2U * 1024U * 1024U * 1024U;
-    // Only used by original Scatter (but it doesn't hurt FlatterScatter to keep):
-    static constexpr auto regionsize = 16;
-    static constexpr auto wastefactor = wasteFactor;
-};
-
-struct AlignmentConfig
-{
-    static constexpr auto dataAlignment = 16;
-};
-
-ALPAKA_STATIC_ACC_MEM_GLOBAL int* arA = nullptr;
-
-template<typename T_Allocator>
-struct ExampleKernel
-{
-    ALPAKA_FN_ACC void operator()(Acc const& acc, T_Allocator::AllocatorHandle allocHandle) const
-    {
-        auto const id = static_cast<uint32_t>(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]);
-        if(id == 0)
-        {
-            arA = static_cast<int*>(allocHandle.malloc(acc, sizeof(int) * 32U));
-        }
-        // wait until the malloc from thread zero is visible to all threads
-        alpaka::syncBlockThreads(acc);
-        auto const slots = allocHandle.getAvailableSlots(acc, 1);
-        if(arA != nullptr)
-        {
-            arA[id] = id;
-            printf("id: %u array: %d slots %u\n", id, arA[id], slots);
-        }
-        else
-            printf("error: device-side allocation failed\n");
-
-        // wait until all threads have read from `arA`
-        alpaka::syncBlockThreads(acc);
-        if(id == 0)
-        {
-            allocHandle.free(acc, arA);
-        }
-    }
-};
-
-template<typename T_CreationPolicy>
-auto example03() -> int
-{
-    using Allocator = mallocMC::Allocator<
-        Acc,
-        T_CreationPolicy,
-        mallocMC::DistributionPolicies::Noop,
-        mallocMC::OOMPolicies::ReturnNull,
-        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
-        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
-
-    auto const platform = alpaka::Platform<Acc>{};
-    auto const dev = alpaka::getDevByIdx(platform, 0);
-    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
-    auto const devProps = alpaka::getAccDevProps<Acc>(dev);
-    unsigned const block = std::min(static_cast<unsigned>(32U), static_cast<unsigned>(devProps.m_blockThreadCountMax));
-
-    Allocator scatterAlloc(dev, queue, 2U * 1024U * 1024U * 1024U); // 2GB for device-side malloc
-
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{block}, Idx{1}};
-    alpaka::enqueue(
-        queue,
-        alpaka::createTaskKernel<Acc>(workDiv, ExampleKernel<Allocator>{}, scatterAlloc.getAllocatorHandle()));
-
-    std::cout << "Slots from Host: " << scatterAlloc.getAvailableSlots(dev, queue, 1) << '\n';
-
-    return 0;
-}
-
-auto main(int /*argc*/, char* /*argv*/[]) -> int
-{
-    example03<FlatterScatter<FlatterScatterHeapConfig>>();
-    example03<Scatter<FlatterScatterHeapConfig>>();
-    example03<OldMalloc>();
-    return 0;
-}
diff --git a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp
deleted file mode 100644
index ee176187c2..0000000000
--- 
a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "Noop.hpp" - -#include - -#include -#include -#include - -namespace mallocMC -{ - namespace AlignmentPolicies - { - /** - * @brief a policy that does nothing - * - * This AlignmentPolicy will not perform any alignment, but only - * return its input (identity function) - */ - class Noop - { - public: - static auto alignPool(void* memory, size_t memsize) -> std::tuple<void*, size_t> - { - return std::make_tuple(memory, memsize); - } - - ALPAKA_FN_HOST_ACC - static auto applyPadding(uint32_t bytes) -> uint32_t - { - return bytes; - } - - static auto classname() -> std::string - { - return "Noop"; - } - }; - - } // namespace AlignmentPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp deleted file mode 100644 index 0eb495e975..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - http://www.icg.tugraz.at/project/mvp - - Copyright (C) 2012 Institute for Computer Graphics and Vision, - Graz University of Technology - Copyright (C) 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at - Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software.
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "Shrink.hpp" - -#include - -#include -#include -#include -#include - -namespace mallocMC -{ - namespace AlignmentPolicies - { - namespace Shrink2NS - { - template<int T_pointerSize> - struct __PointerEquivalent - { - using type = unsigned int; - }; - - template<> - struct __PointerEquivalent<8> - { - using type = unsigned long long; - }; - } // namespace Shrink2NS - - namespace ShrinkConfig - { - struct DefaultShrinkConfig - { - static constexpr auto dataAlignment = 16; - }; - } // namespace ShrinkConfig - - /** - * @brief Provides proper alignment of pool and pads memory requests - * - * This AlignmentPolicy is based on ideas from ScatterAlloc - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). - * It performs alignment operations on big memory pools and requests to - * allocate memory. Memory pools are truncated at the beginning until - * the pointer to the memory fits the alignment. Requests to allocate - * memory are padded until their size is a multiple of the alignment. - * - * @tparam T_Config (optional) The alignment to use - */ - template<typename T_Config = ShrinkConfig::DefaultShrinkConfig> - class Shrink - { - public: - using Properties = T_Config; - - private: - using PointerEquivalent = Shrink2NS::__PointerEquivalent<sizeof(char*)>::type; - -/** Allow for a hierarchical validation of parameters: - * - * shipped default-parameters (in the inherited struct) have lowest precedence. - * They will be overridden by a given configuration struct. However, even the - * given configuration struct can be overridden by compile-time command line - * parameters (e.g. -D MALLOCMC_AP_SHRINK_DATAALIGNMENT=128) - * - * default-struct < template-struct < command-line parameter - */ -#ifndef MALLOCMC_AP_SHRINK_DATAALIGNMENT -# define MALLOCMC_AP_SHRINK_DATAALIGNMENT (Properties::dataAlignment) -#endif - static constexpr size_t dataAlignment = MALLOCMC_AP_SHRINK_DATAALIGNMENT; - - // dataAlignment must be a power of 2!
- static_assert( - dataAlignment != 0 && (dataAlignment & (dataAlignment - 1)) == 0, - "dataAlignment must also be a power of 2"); - - public: - static auto alignPool(void* memory, size_t memsize) -> std::tuple<void*, size_t> - { - PointerEquivalent alignmentstatus = ((PointerEquivalent) memory) & (dataAlignment - 1); - if(alignmentstatus != 0) - { - std::cout << "Heap Warning: memory to use not " << dataAlignment << " byte aligned...\n" - << "Before:\n" - << "dataAlignment: " << dataAlignment << '\n' - << "Alignmentstatus: " << alignmentstatus << '\n' - << "size_t memsize " << memsize << " byte" << '\n' - << "void *memory " << memory << '\n'; - - memory = (void*) (((PointerEquivalent) memory) + dataAlignment - alignmentstatus); - memsize -= dataAlignment + (size_t) alignmentstatus; - - std::cout << "Was shrunk automatically to: " << '\n' - << "size_t memsize " << memsize << " byte" << '\n' - << "void *memory " << memory << '\n'; - } - - return std::make_tuple(memory, memsize); - } - - ALPAKA_FN_HOST_ACC - static auto applyPadding(uint32_t bytes) -> uint32_t - { - constexpr uint32_t bitsToClear = dataAlignment - 1; - return (bytes + bitsToClear) & ~bitsToClear; - } - - ALPAKA_FN_HOST - static auto classname() -> std::string - { - std::stringstream ss; - ss << "Shrink[" << dataAlignment << "]"; - return ss.str(); - } - }; - - } // namespace AlignmentPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/allocator.hpp b/thirdParty/mallocMC/src/include/mallocMC/allocator.hpp deleted file mode 100644 index a0ebabf455..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/allocator.hpp +++ /dev/null @@ -1,239 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 - 2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE.
-*/ - -#pragma once - -#include "device_allocator.hpp" -#include "mallocMC_allocator_handle.hpp" -#include "mallocMC_constraints.hpp" -#include "mallocMC_traits.hpp" - -#include - -#include -#include -#include -#include -#include - -namespace mallocMC -{ - namespace detail - { - template<typename T_Allocator, bool T_providesAvailableSlots> - struct GetAvailableSlotsIfAvailHost - { - template<typename AlpakaDevice, typename AlpakaQueue> - ALPAKA_FN_HOST static auto getAvailableSlots(AlpakaDevice&, AlpakaQueue&, size_t, T_Allocator&) -> unsigned - { - return 0; - } - }; - - template<typename T_Allocator> - struct GetAvailableSlotsIfAvailHost<T_Allocator, true> - { - template<typename AlpakaDevice, typename AlpakaQueue> - ALPAKA_FN_HOST static auto getAvailableSlots( - AlpakaDevice& dev, - AlpakaQueue& queue, - size_t slotSize, - T_Allocator& alloc) -> unsigned - { - return T_Allocator::CreationPolicy::template getAvailableSlotsHost( - dev, - queue, - slotSize, - alloc.getAllocatorHandle().devAllocator); - } - }; - } // namespace detail - - struct HeapInfo - { - void* p; - size_t size; - }; - - /** - * @brief "HostClass" that combines all policies to a useful allocator - * - * This class implements the necessary glue-logic to form an actual - * allocator from the provided policies. It implements the public interface - * and executes some constraint checking based on an instance of the class - * PolicyConstraints. - * - * @tparam T_CreationPolicy The desired type of a CreationPolicy - * @tparam T_DistributionPolicy The desired type of a DistributionPolicy - * @tparam T_OOMPolicy The desired type of an OOMPolicy - * @tparam T_ReservePoolPolicy The desired type of a ReservePoolPolicy - * @tparam T_AlignmentPolicy The desired type of an AlignmentPolicy - */ - template< - typename AlpakaAcc, - typename T_CreationPolicy, - typename T_DistributionPolicy, - typename T_OOMPolicy, - typename T_ReservePoolPolicy, - typename T_AlignmentPolicy> - class Allocator - : public PolicyConstraints< - T_CreationPolicy, - T_DistributionPolicy, - T_OOMPolicy, - T_ReservePoolPolicy, - T_AlignmentPolicy> - { - using uint32 = std::uint32_t; - - public: - using DistributionPolicy = T_DistributionPolicy; - using OOMPolicy = T_OOMPolicy; - using ReservePoolPolicy = T_ReservePoolPolicy; - using AlignmentPolicy = T_AlignmentPolicy; - using CreationPolicy = T_CreationPolicy::template AlignmentAwarePolicy<T_AlignmentPolicy>; - using HeapInfoVector = std::vector<HeapInfo>; - using DevAllocator = DeviceAllocator<CreationPolicy, DistributionPolicy, OOMPolicy, AlignmentPolicy>; - using AllocatorHandle = AllocatorHandleImpl; - - private: - ReservePoolPolicy reservePolicy; - using DevAllocatorStorageBufferType - = alpaka::Buf<alpaka::Dev<AlpakaAcc>, DevAllocator, alpaka::DimInt<1>, int>; - std::unique_ptr<DevAllocatorStorageBufferType> - devAllocatorBuffer; // FIXME(bgruber): replace by std::optional<> - HeapInfo heapInfos; - - /** allocate heap memory - * - * @param size number of bytes - */ - template<typename AlpakaDevice, typename AlpakaQueue> - ALPAKA_FN_HOST void alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t size) - { - void* pool = reservePolicy.setMemPool(dev, size); - std::tie(pool, size) = AlignmentPolicy::alignPool(pool, size); - - devAllocatorBuffer - = std::make_unique<DevAllocatorStorageBufferType>(alpaka::allocBuf<DevAllocator, int>(dev, 1)); - CreationPolicy::template initHeap<AlpakaAcc>( - dev, - queue, - alpaka::getPtrNative(*devAllocatorBuffer), - pool, - size); - - heapInfos.p = pool; - heapInfos.size = size; - } - - /** free all data structures - * - * Free all allocated memory.
- * After this call the instance is in an invalid state - */ - ALPAKA_FN_HOST void free() - { - devAllocatorBuffer = {}; - reservePolicy.resetMemPool(); - heapInfos.size = 0; - heapInfos.p = nullptr; - } - - /* forbid copying the allocator */ - ALPAKA_FN_HOST - Allocator(Allocator const&) = delete; - - public: - template<typename AlpakaDevice, typename AlpakaQueue> - ALPAKA_FN_HOST Allocator(AlpakaDevice& dev, AlpakaQueue& queue, size_t size = 8U * 1024U * 1024U) - { - alloc(dev, queue, size); - } - - ALPAKA_FN_HOST - ~Allocator() - { - free(); - } - - /** destroy current heap data and resize the heap - * - * @param size number of bytes - */ - template<typename AlpakaDevice, typename AlpakaQueue> - ALPAKA_FN_HOST void destructiveResize(AlpakaDevice& dev, AlpakaQueue& queue, size_t size) - { - free(); - alloc(dev, queue, size); - } - - ALPAKA_FN_HOST - auto getAllocatorHandle() -> AllocatorHandle - { - return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)}; - } - - ALPAKA_FN_HOST - operator AllocatorHandle() - { - return getAllocatorHandle(); - } - - ALPAKA_FN_HOST static auto info(std::string linebreak = " ") -> std::string - { - std::stringstream ss; - ss << "CreationPolicy: " << CreationPolicy::classname() << " " << linebreak; - ss << "DistributionPolicy: " << DistributionPolicy::classname() << " " << linebreak; - ss << "OOMPolicy: " << OOMPolicy::classname() << " " << linebreak; - ss << "ReservePoolPolicy: " << ReservePoolPolicy::classname() << " " << linebreak; - ss << "AlignmentPolicy: " << AlignmentPolicy::classname() << " " << linebreak; - return ss.str(); - } - - // polymorphism over the availability of getAvailableSlots for calling - // from the host - template<typename AlpakaDevice, typename AlpakaQueue> - ALPAKA_FN_HOST auto getAvailableSlots(AlpakaDevice& dev, AlpakaQueue& queue, size_t slotSize) -> unsigned - { - slotSize = AlignmentPolicy::applyPadding(slotSize); - return detail::GetAvailableSlotsIfAvailHost<Allocator, Traits<Allocator>::providesAvailableSlots>:: - template getAvailableSlots(dev, queue, slotSize, *this); - } - - ALPAKA_FN_HOST - auto getHeapLocations() -> HeapInfoVector - { - HeapInfoVector v; - v.push_back(heapInfos); - return v; - } - }; - -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter.hpp deleted file mode 100644 index 5c0b7ce285..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter.hpp +++ /dev/null @@ -1,452 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz, Rene Widera - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace mallocMC::CreationPolicies::FlatterScatterAlloc
-{ - /** - * @class Heap - * @brief Main interface to our heap memory. - * - * This class stores the heap pointer and the heap size and provides the high-level functionality to interact with - * the memory within kernels. It is wrapped in a thin layer of creation policy to be instantiated as base class of - * the `DeviceAllocator` for the user. - * - * @tparam T_HeapConfig Struct containing information about the heap. - * @tparam T_HashConfig Struct providing a hash function for scattering and the blockStride property. - * @tparam T_AlignmentPolicy The alignment policy used in the current configuration. - */ - template<typename T_HeapConfig, typename T_HashConfig, typename T_AlignmentPolicy> - struct Heap - { - using MyAccessBlock = AccessBlock<T_HeapConfig, T_AlignmentPolicy>; - - static_assert( - T_HeapConfig::accessblocksize - < std::numeric_limits<std::make_signed_t<decltype(T_HeapConfig::accessblocksize)>>::max(), - "Your access block size must be smaller than the maximal value of its signed type because we are using " - "differences in the code occasionally."); - - static_assert( - T_HeapConfig::pagesize < std::numeric_limits<std::make_signed_t<decltype(T_HeapConfig::pagesize)>>::max(), - "Your page size must be smaller than the maximal value of its signed type because we are using " - "differences in the code occasionally."); - - static_assert( - T_HeapConfig::accessblocksize == sizeof(MyAccessBlock), - "The real access block must have the same size as configured in order to make alignment more easily " - "predictable."); - - size_t heapSize{}; - MyAccessBlock* accessBlocks{}; - uint32_t volatile block = 0U; - - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init() -> void - { - for(uint32_t i = 0; i < numBlocks(); ++i) - { - accessBlocks[i].init(); - } - } - - /** - * @brief Number of access blocks in the heap. This is a runtime quantity because it depends on the given heap - * size. - * - * @return Number of access blocks in the heap. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numBlocks() const -> uint32_t - { - return heapSize / T_HeapConfig::accessblocksize; - } - - /** - * @brief The dummy value to indicate the case of no free blocks found. - * - * @return An invalid block index for identifying such case. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto noFreeBlockFound() const -> uint32_t - { - return numBlocks(); - } - - /** - * @brief Compute a starting index to search the access blocks for a valid piece of memory. - * - * @param blockValue Current starting index to compute the next one from. - * @param hashValue A hash value to provide some entropy for scattering the requests. - * @return An index to start searching the access blocks from. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto startBlockIndex( - auto const& /*acc*/, - uint32_t const blockValue, - uint32_t const hashValue) - { - return ((hashValue % T_HashConfig::blockStride) + (blockValue * T_HashConfig::blockStride)) % numBlocks(); - } - - /** - * @brief Create a pointer to memory of (at least) `bytes` number of bytes. - * - * @param bytes Size of the allocation in number of bytes. - * @return Pointer to the memory, nullptr if no usable memory was found.
- */ - template<typename AlpakaAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32_t const bytes) -> void* - { - auto blockValue = block; - auto hashValue = T_HashConfig::template hash<T_HeapConfig::pagesize>(acc, bytes); - auto startIdx = startBlockIndex(acc, blockValue, hashValue); - return wrappingLoop( - acc, - startIdx, - numBlocks(), - static_cast<void*>(nullptr), - [this, bytes, startIdx, &hashValue, blockValue](auto const& localAcc, auto const index) mutable - { - auto ptr = accessBlocks[index].create(localAcc, bytes, hashValue); - if(!ptr && index == startIdx) - { - // This is not thread-safe but we're fine with that. It's just a fuzzy thing to occasionally - // increment and it's totally okay if its value is not quite deterministic. - if(blockValue == block) - { - block = blockValue + 1; - } - } - return ptr; - }); - } - - /** - * @brief Counterpart freeing operation to `create`. Destroys the memory at the pointer location. - * - * @param pointer A valid pointer created by `create()`. - */ - template<typename AlpakaAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(AlpakaAcc const& acc, void* pointer) -> void - { - // indexOf requires the access block size instead of blockSize in case the reinterpreted AccessBlock - // object is smaller than blockSize. - auto blockIndex = indexOf(pointer, accessBlocks, sizeof(MyAccessBlock)); - accessBlocks[blockIndex].destroy(acc, pointer); - } - - /** - * @brief Queries all access blocks for how many chunks of the given chunk size they could allocate. This is - * single-threaded and NOT THREAD-SAFE but acquiring such distributed information while other threads operate - * on the heap is of limited value anyways. - * - * @param chunkSize Target would-be-created chunk size in number of bytes. - * @return The number of allocations that would still be possible with this chunk size. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlotsDeviceFunction(auto const& acc, uint32_t const chunkSize) - -> size_t - { - // TODO(lenz): Not thread-safe. - return std::transform_reduce( - accessBlocks, - accessBlocks + numBlocks(), - 0U, - std::plus{}, - [&acc, chunkSize](auto& accessBlock) { return accessBlock.getAvailableSlots(acc, chunkSize); }); - } - - /** - * @brief Forwards to `getAvailableSlotsDeviceFunction` for interface compatibility reasons. See there for - * details. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(auto const& acc, uint32_t const chunkSize) - -> size_t - { - return getAvailableSlotsDeviceFunction(acc, chunkSize); - } - - protected: - // This class is supposed to be instantiated as a parent for the `DeviceAllocator`. - Heap() = default; - }; - - constexpr uint32_t defaultBlockSize = 128U * 1024U * 1024U; - constexpr uint32_t defaultPageSize = 128U * 1024U; - - /** - * @class DefaultHeapConfig - * @brief An example configuration for the heap. - * - * A heap configuration is supposed to provide the physical dimensions of the objects in the heap (i.e. access - * block and page) as well as a function that describes how much space you are willing to waste by allowing - * the allocation of larger chunks than necessary. - * - * @tparam T_blockSize The size of one access block in bytes. - * @tparam T_pageSize The size of one page in bytes.
- */ - template< - uint32_t T_blockSize = defaultBlockSize, - uint32_t T_pageSize = defaultPageSize, - uint32_t T_wasteFactor = 2U> - struct DefaultHeapConfig - { - static constexpr uint32_t const accessblocksize = T_blockSize; - static constexpr uint32_t const pagesize = T_pageSize; - static constexpr uint32_t const wastefactor = T_wasteFactor; - static constexpr bool const resetfreedpages = true; - - /** - * @brief Determine whether we want to allow an allocation of numBytes on a page with chunk size `chunkSize`. - * - * This function is given the currently requested allocation size numBytes and the set chunk size of a page. It - * answers the question whether we should consider this page for allocating this memory. It must necessarily - * return false if chunkSize < numBytes in order to avoid memory corruption. It may return true in cases where - * chunkSize > numBytes to trade off a bit of wasted memory for a performance boost while searching available - * memory. - * - * @param chunkSize Currently set chunk size of a page in number of bytes. - * @param numBytes Allocation size in number of bytes. - * @return true if the algorithm shall consider this page for allocation and false otherwise. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( - auto const& /*acc*/, - uint32_t const chunkSize, - uint32_t const numBytes) - { - return (chunkSize >= numBytes && chunkSize <= wastefactor * numBytes); - } - }; - - /** - * @class DefaultFlatterScatterHashConfig - * @brief An example configuration for the hash scattering. - * - * A scatter configuration is supposed to provide two pieces of information: A static function called `hash` and - * the compile-time constant `blockStride`. These are used by the creation policy to scatter the requests for - * memory within the heap. - * - */ - struct DefaultFlatterScatterHashConfig - { - public: - static constexpr uint32_t blockStride = 4; - - /** - * @brief Hash function to provide entropy for scattering memory requests. - * - * @param numBytes Number of bytes requested. - * @return A hash value. - */ - // TAcc is to be deduced, so we put it last. - template<uint32_t T_pageSize, typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto hash(TAcc const& acc, uint32_t const numBytes) -> uint32_t - { - uint32_t const relative_offset = warpSize * numBytes / T_pageSize; - return ( - numBytes * hashingK + hashingDistMP * smid(acc) - + (hashingDistWP + hashingDistWPRel * relative_offset) * warpid(acc)); - } - - private: - static constexpr uint32_t hashingK = 38183; - static constexpr uint32_t hashingDistMP = 17497; - static constexpr uint32_t hashingDistWP = 1; - static constexpr uint32_t hashingDistWPRel = 1; - }; - - /** - * @class InitKernel - * @brief Kernel to initialise the heap memory. - * - * Used by the creation policy during initialisation. - */ - struct InitKernel - { - template<typename T_HeapConfig, typename T_HashConfig, typename T_AlignmentPolicy> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator()( - auto const& /*unused*/, - Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>* m_heap, - void* m_heapmem, - size_t const m_memsize) const - { - m_heap->accessBlocks - = static_cast<typename Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>::MyAccessBlock*>(m_heapmem); - m_heap->heapSize = m_memsize; - m_heap->init(); - } - }; - -} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc - -namespace mallocMC::CreationPolicies
-{ - /** - * @class FlatterScatter - * @brief A creation policy scattering memory requests in a flat hierarchy. - * - * This creation policy is a variation on the original ScatterAlloc algorithm and the one previously implemented in - * mallocMC.
It provides a multi-level hierarchy of Heap, AccessBlock and DataPage that is traversed using the - * metadata held by each level to find a suitable memory location to satisfy the request. - * - * It uses an externally provided hash function to compute a single hash value for each request given its requested - * number of bytes and the accelerator. This is internally used to scatter the requests over the available memory - * and thereby improve the success rate for multi-threaded requests because different threads will start searching - * in different locations. - * - * Implemented as a thin wrapper around `Heap` that mainly provides interface compatibility with the calling code. - */ - template<typename T_HeapConfig, typename T_HashConfig, typename T_AlignmentPolicy> - struct FlatterScatterImpl - { - template<typename T_Alignment> - using AlignmentAwarePolicy = FlatterScatterAlloc::Heap<T_HeapConfig, T_HashConfig, T_Alignment>; - - static auto classname() -> std::string - { - return "FlatterScatter"; - } - - static constexpr auto const providesAvailableSlots = true; - - /** - * @brief Check if a pointer returned from `create()` signals out-of-memory. - * - * @param pointer Pointer returned by `create()`. - * @return The boolean answer to this question. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto isOOM(void* pointer, uint32_t const /*unused size*/) -> bool - { - return pointer == nullptr; - } - - /** - * @brief initialise a raw piece of memory for use by the `Heap`. - * - * @param dev The alpaka device. - * @param queue The alpaka queue. - * @param heap The pointer to the `Heap` object located on the device. - * @param pool The pointer to the provided memory pool to be used by the `Heap` object. - * @param memsize The size of the pool memory in bytes. - */ - template<typename AlpakaAcc> - static void initHeap(auto& dev, auto& queue, auto* heap, void* pool, size_t memsize) - { - using Dim = typename alpaka::trait::DimType<AlpakaAcc>::type; - using Idx = typename alpaka::trait::IdxType<AlpakaAcc>::type; - using VecType = alpaka::Vec<Dim, Idx>; - - auto workDivSingleThread - = alpaka::WorkDivMembers<Dim, Idx>{VecType::ones(), VecType::ones(), VecType::ones()}; - alpaka::exec<AlpakaAcc>(queue, workDivSingleThread, FlatterScatterAlloc::InitKernel{}, heap, pool, memsize); - alpaka::wait(queue); - } - - /** - * @brief Count the number of possible allocations for the given slotSize directly from the host. - * - * This method implements the infrastructure to call `getAvailableSlotsDeviceFunction` on the `Heap` class. See - * there for details, particularly concerning the thread-safety of this. - * - * @param dev The alpaka device. - * @param queue The alpaka queue. - * @param slotSize The would-be-created memory size in number of bytes. - * @param heap Pointer to the `Heap` object that's supposed to handle the request. - * @return The number of allocations that would be successful with this slotSize.
- */ - template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator> - static auto getAvailableSlotsHost( - AlpakaDevice& dev, - AlpakaQueue& queue, - uint32_t const slotSize, - T_DeviceAllocator* heap) -> unsigned - { - using Dim = typename alpaka::trait::DimType<AlpakaAcc>::type; - using Idx = typename alpaka::trait::IdxType<AlpakaAcc>::type; - using VecType = alpaka::Vec<Dim, Idx>; - - auto d_slots = alpaka::allocBuf<uint32_t, Idx>(dev, uint32_t{1}); - alpaka::memset(queue, d_slots, 0, uint32_t{1}); - auto d_slotsPtr = alpaka::getPtrNative(d_slots); - - auto getAvailableSlotsKernel = [heap, slotSize, d_slotsPtr] ALPAKA_FN_ACC(AlpakaAcc const& acc) -> void - { *d_slotsPtr = heap->getAvailableSlotsDeviceFunction(acc, slotSize); }; - - alpaka::wait(queue); - alpaka::exec<AlpakaAcc>( - queue, - alpaka::WorkDivMembers<Dim, Idx>{VecType::ones(), VecType::ones(), VecType::ones()}, - getAvailableSlotsKernel); - alpaka::wait(queue); - - auto const platform = alpaka::Platform<alpaka::DevCpu>{}; - auto const hostDev = alpaka::getDevByIdx(platform, 0); - - auto h_slots = alpaka::allocBuf<uint32_t, Idx>(hostDev, Idx{1}); - alpaka::memcpy(queue, h_slots, d_slots); - alpaka::wait(queue); - - return *alpaka::getPtrNative(h_slots); - } - }; - - template< - typename T_HeapConfig = FlatterScatterAlloc::DefaultHeapConfig<>, - typename T_HashConfig = FlatterScatterAlloc::DefaultFlatterScatterHashConfig, - typename T_AlignmentPolicy = void> - struct FlatterScatter - { - template<typename T_AlignmentPolicy2> - using AlignmentAwarePolicy = FlatterScatterImpl<T_HeapConfig, T_HashConfig, T_AlignmentPolicy2>; - - struct Properties - { - using HeapConfig = T_HeapConfig; - using HashConfig = T_HashConfig; - }; - }; - - -} // namespace mallocMC::CreationPolicies diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp deleted file mode 100644 index 70e60bf7fd..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp +++ /dev/null @@ -1,823 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz, Rene Widera - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE.
-*/ - -#pragma once - -#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" -#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" -#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" -#include "mallocMC/mallocMC_utils.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace mallocMC::CreationPolicies::FlatterScatterAlloc
-{ - - /** - * @class PageTable - * @brief Storage for AccessBlock's metadata - */ - template<uint32_t T_numPages> - struct PageTable - { - uint32_t chunkSizes[T_numPages]{}; - uint32_t fillingLevels[T_numPages]{}; - - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanup() -> void - { - std::fill(std::begin(chunkSizes), std::end(chunkSizes), 0U); - std::fill(std::begin(fillingLevels), std::end(fillingLevels), 0U); - } - }; - - /** - * @class Padding - * @brief Empty memory to pad the AccessBlock to the correct size - */ - template<uint32_t T_size> - struct Padding - { - char padding[T_size]{}; - }; - - /** - * @brief The C++ standard disallows zero-size arrays, so we specialise for this case. - */ - template<> - struct Padding<0U> - { - }; - - /** - * @class AccessBlock - * @brief Coarsest memory division unit containing fixed-size pages of raw memory and metadata about their chunk - * size and filling level - * - * @tparam T_HeapConfig A struct with compile-time information about the setup - * @tparam T_AlignmentPolicy The alignment policy in use for optimisation purposes - */ - template<typename T_HeapConfig, typename T_AlignmentPolicy> - class AccessBlock - { - protected: - static constexpr uint32_t const blockSize = T_HeapConfig::accessblocksize; - static constexpr uint32_t const pageSize = T_HeapConfig::pagesize; - static constexpr uint32_t const wasteFactor = T_HeapConfig::wastefactor; - static constexpr bool const resetfreedpages = T_HeapConfig::resetfreedpages; - - using MyPageInterpretation = PageInterpretation<pageSize, T_AlignmentPolicy::Properties::dataAlignment>; - - // This class is supposed to be reinterpreted on a piece of raw memory and not instantiated directly. We set it - // protected, so we can still test stuff in the future easily. - AccessBlock() - { - init(); - } - - public: - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init() -> void - { - pageTable.cleanup(); - constexpr uint32_t dummyChunkSize = 1U; - for(auto& page : pages) - { - MyPageInterpretation(page, dummyChunkSize).cleanupFull(); - } - } - - /** - * @brief Compute the number of pages in the access block taking into account the space needed for metadata. - * - * @return The number of pages in the access block. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto numPages() -> uint32_t - { - constexpr auto numberOfPages = blockSize / (pageSize + sizeof(PageTable<1>)); - // check that the page table entries do not have padding - static_assert(sizeof(PageTable<numberOfPages>) == numberOfPages * sizeof(PageTable<1>)); - return numberOfPages; - } - - /** - * @brief Answers the question: How many successful allocations with the given size are still possible? - * CAUTION: Not thread-safe! - * - * This method looks up the metadata for all its pages and computes the number of available slots with the - * given chunk size. By doing so, the information this method is queried for is inherently not thread-safe - * because if other threads are (de-)allocating memory during this look up, the information about each - * individual page will be stale as soon as it is retrieved.
However, beyond this inherent non-thread-safety we - * made no effort so far to leverage parallelism or make it use atomics, i.e., move into the direction of - * consistency in the multi-threaded case. It is supposed to run in a single thread without any interference. - * - * @param chunkSize The number of bytes the would-be allocations request - * @return The number of available slots with this chunk size. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlots(auto const& acc, uint32_t const chunkSize) const - -> uint32_t - { - if(chunkSize < multiPageThreshold()) - { - return getAvailableChunks(acc, chunkSize); - } - return getAvailableMultiPages(acc, chunkSize); - } - - /** - * @brief Compute the index of the page a pointer points to. - * - * @param pointer Memory location inside of the data part of this access block. - * @return The index of the page this pointer points to. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto pageIndex(void* pointer) const -> int32_t - { - return indexOf(pointer, pages, pageSize); - } - - /** - * @brief Verifies that a pointer points to a valid piece of memory. CAUTION: Not thread-safe! - * - * This method checks if a pointer is valid, meaning that it points to a chunk of memory that is marked as - * allocated. The information it provides is inherently not thread-safe because if other threads are operating - * on the memory, the retrieved information is stale the moment it was looked up. It is, however, consistent in - * that it uses atomics to retrieve this information, so if the pointer is valid and does not get destroyed - * between looking up the answer and using it (for example in the scenario where I'm the only one knowing about - * this pointer), the answer is valid. - * - * @param pointer Pointer to validate - * @return true if the pointer is valid else false - */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, void* const pointer) -> bool - { - if(pointer == nullptr) - { - return false; - } - auto const index = pageIndex(pointer); - auto chunkSize = atomicLoad(acc, pageTable.chunkSizes[index]); - if(chunkSize >= pageSize) - { - return true; - } - return chunkSize == 0U or atomicLoad(acc, pageTable.fillingLevels[index]) == 0U - ? false - : interpret(index, chunkSize).isValid(acc, pointer); - } - - /** - * @brief Allocate a piece of memory of the given size. - * - * This method attempts to allocate a piece of memory of (at least) numBytes bytes. The actual size might be - * larger (depending on the user-provided compile-time configuration of the AccessBlock) but is not - * communicated, so it is not allowed to access the pointer outside the requested range. It returns a nullptr - * if there is no memory available. The hashValue is used to scatter memory accesses. A cheap operation will be - * performed to transform it into a page index to start the search at. It is also handed to the lower levels to - * be used similarly. Having it default to 0 makes it easier for testing. The effect of this method is reverted - * by the destroy method. - * - * @param numBytes Required size of memory in bytes - * @param hashValue Optional number to scatter memory access.
- * @return A pointer to an allocated piece of memory or nullptr if no memory is available - */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create( - TAcc const& acc, - uint32_t const numBytes, - uint32_t const hashValue = 0U) -> void* - { - void* pointer{nullptr}; - if(numBytes >= multiPageThreshold()) - { - pointer = createOverMultiplePages(acc, numBytes, hashValue); - } - else - { - pointer = createChunk(acc, numBytes, hashValue); - } - return pointer; - } - - /** - * @brief Free up the memory a valid pointer points to. - * - * This method attempts to destroy the memory of a valid pointer created by the create method. It reverses the - * effect of the create method and makes the allocated memory available for re-allocation. After calling this - * method on a pointer it is invalid and may no longer be used for memory access. Invalid pointers are ignored - * and a failure of this method is not communicated in production. In debug mode various exceptions can be - * thrown for different forms of invalid pointers. - * - * @param pointer A pointer created by the create method. - */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* const pointer) -> void - { - auto const index = pageIndex(pointer); - if(index >= static_cast<int32_t>(numPages()) || index < 0) - { -#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) - throw std::runtime_error{ - "Attempted to destroy an invalid pointer! Pointer does not point to any page."}; -#endif // NDEBUG - return; - } - auto const chunkSize = atomicLoad(acc, pageTable.chunkSizes[index]); - if(chunkSize >= multiPageThreshold()) - { - destroyOverMultiplePages(acc, index, chunkSize); - } - else - { - destroyChunk(acc, pointer, index, chunkSize); - } - } - - private: - DataPage<pageSize> pages[numPages()]{}; - PageTable<numPages()> pageTable{}; - Padding padding{}; - - /** - * @brief The number of bytes at which allocation switch to "multi-page mode", i.e., allocate full pages. - * - * It is obvious that this number can be at most page size subtracted by the size of one bit mask. There is, - * however, no strict lower bound because we theoretically disregard the lower levels completely by this - * switch. If we reasonably assume that our lower hierarchy levels add value (i.e. performance) to our - * implementation, a reasonable lower bound would be the size at which only a single allocation fits onto a - * page. This method could be used for fine-tuning performance in that sense. - * - * @return The number of bytes at which to switch to multi-page mode. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto multiPageThreshold() -> uint32_t - { - return ceilingDivision(pageSize - sizeof(BitMaskStorageType<>), 2U); - } - - /** - * @brief Convenience method that creates a PageInterpretation from a page identified by its page index and a - * chunk size. - * - * @param pageIndex Identifies the page in the array of raw pages. - * @param chunkSize Chunk size for which to interpret the page. - * @return A page interpretation of the requested page. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto interpret(uint32_t const pageIndex, uint32_t const chunkSize) - { - return MyPageInterpretation(pages[pageIndex], chunkSize); - } - - /** - * @brief Branch of getAvailableSlots for chunk sizes below the multi-page threshold. See there for details. - * - * @param chunkSize Would-be allocation size to test for. - * @return Number of allocations that would succeed with this size.
- */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableChunks(auto const& acc, uint32_t const chunkSize) const - -> uint32_t - { - // TODO(lenz): This is not thread-safe! - return std::transform_reduce( - std::cbegin(pageTable.chunkSizes), - std::cend(pageTable.chunkSizes), - std::cbegin(pageTable.fillingLevels), - 0U, - std::plus{}, - [this, &acc, chunkSize](auto const localChunkSize, auto const fillingLevel) - { - auto const numChunks - = MyPageInterpretation::numChunks(localChunkSize == 0 ? chunkSize : localChunkSize); - return ((this->isInAllowedRange(acc, localChunkSize, chunkSize) or localChunkSize == 0U) - and fillingLevel < numChunks) - ? numChunks - fillingLevel - : 0U; - }); - } - - /** - * @brief Branch of getAvailableSlots for chunk sizes above the multi-page threshold. See there for details. - * - * @param chunkSize Would-be allocation size to test for. - * @return Number of allocations that would succeed with this size. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableMultiPages(auto const& /*acc*/, uint32_t const chunkSize) const - -> uint32_t - { - // TODO(lenz): This is not thread-safe! - auto numPagesNeeded = ceilingDivision(chunkSize, pageSize); - if(numPagesNeeded > numPages()) - { - return 0U; - } - uint32_t sum = 0U; - for(uint32_t i = 0; i < numPages() - numPagesNeeded + 1;) - { - if(std::all_of( - pageTable.chunkSizes + i, - pageTable.chunkSizes + i + numPagesNeeded, - [](auto const& val) { return val == 0U; })) - { - sum += 1; - i += numPagesNeeded; - } - else - { - ++i; - } - } - return sum; - } - - /** - * @brief Creation algorithm in multi-page mode. - * - * In this mode, we have decided to ignore all the lower level hierarchy. The algorithm simplifies accordingly - * and a few optimisations can be done. It can however be quite cumbersome to find a sufficient number of - * contiguous pages, so this will likely be most performant for small sizes. - * - * @param numBytes Required allocation size in number of bytes. - * @param hashValue A hash value used to scatter memory access. - * @return Pointer to a valid piece of memory or nullptr if no such memory was found. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto createOverMultiplePages( - auto const& acc, - uint32_t const numBytes, - uint32_t hashValue) -> void* - { - auto numPagesNeeded = ceilingDivision(numBytes, +pageSize); - if(numPagesNeeded > numPages()) - { - return static_cast<void*>(nullptr); - } - - // We take a little head start compared to the chunked case in order to not have them interfere with our - // laborious search for contiguous pages. - auto startIndex = startPageIndex(acc, hashValue) + numPagesNeeded; - return wrappingLoop( - acc, - startIndex, - numPages() - (numPagesNeeded - 1), - static_cast<void*>(nullptr), - [&](auto const& localAcc, auto const& firstIndex) - { - void* result{nullptr}; - auto numPagesAcquired = acquirePages(localAcc, firstIndex, numPagesNeeded); - if(numPagesAcquired == numPagesNeeded) - { - // At this point, we have acquired all the pages we need and nobody can mess with them anymore. - // We still have to set the chunk size correctly. - setChunkSizes(localAcc, firstIndex, numPagesNeeded, numBytes); - result = &pages[firstIndex]; - } - else - { - releasePages(localAcc, firstIndex, numPagesAcquired); - } - return result; - }); - } - - /** - * @brief Short-circuiting acquisition of multiple contiguous pages. - * - * The algorithm attempts to acquire the requested number of pages starting from firstIndex locking them by - * setting their filling level to page size.
It returns when either all requested pages are acquired or an - * already occupied page was hit. In either case, it returns the number of successful acquisitions. This method - * does not clean up after itself, i.e., it does not release the pages in case of failure. - * - * @param firstIndex Start index of the array of contiguous pages. - * @param numPagesNeeded Number of pages to be acquired. - * @return Number of pages that were successfully acquired. This is smaller than numPagesNeeded if the method - * failed. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto acquirePages( - auto const& acc, - uint32_t const firstIndex, - uint32_t const numPagesNeeded) -> uint32_t - { - uint32_t index = 0U; - uint32_t oldFilling = 0U; - for(index = 0U; index < numPagesNeeded; ++index) - { - oldFilling = alpaka::atomicCas(acc, &pageTable.fillingLevels[firstIndex + index], 0U, +pageSize); - if(oldFilling != 0U) - { - break; - } - } - return index; - } - - /** - * @brief Counterpart to acquirePages for doing the clean-up in case of failure. - * - * This method starts from page firstIndex and releases the lock of numPagesAcquired contiguous pages. This is - * supposed to be called in the case of failure of acquirePages to release the already acquired pages. - * - * @param firstIndex Start index of the array of contiguous pages. - * @param numPagesAcquired Number of pages to be released. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto releasePages( - auto const& acc, - uint32_t const firstIndex, - uint32_t const numPagesAcquired) -> void - { - for(uint32_t index = 0U; index < numPagesAcquired; ++index) - { - alpaka::atomicSub(acc, &pageTable.fillingLevels[firstIndex + index], +pageSize); - } - } - - /** - * @brief Set the chunk sizes of a contiguous array of pages. - * - * This function assumes that all the pages are locked by the current thread and performs a hard set operation - * without checking the previous content. - * - * @param firstIndex Start index of the contiguous array of pages. - * @param numPagesNeeded The number of pages to set the chunk size on. - * @param numBytes Chunk size to be set in number of bytes. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto setChunkSizes( - auto const& acc, - uint32_t const firstIndex, - uint32_t const numPagesNeeded, - uint32_t const numBytes) -> void - { - for(uint32_t numPagesAcquired = 0U; numPagesAcquired < numPagesNeeded; ++numPagesAcquired) - { - // At this point in the code, we have already locked all the pages. So, we literally don't care what - // other threads thought this chunk size would be because we are the only ones legitimately messing - // with this page. This chunk size may be non-zero because we could have taken over a page before it - // was properly cleaned up. That is okay for us because we're handing out uninitialised memory anyways. - // But it is very important to record the correct chunk size here, so the destroy method later on knows - // how to handle this memory. - alpaka::atomicExch(acc, &pageTable.chunkSizes[firstIndex + numPagesAcquired], numBytes); - } - } - - /** - * @brief Special return value for an unsuccessful search of available pages. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto noFreePageFound() - { - return numPages(); - } - - /** - * @brief Compute an index where to start searching for a free page from a hash value. - * - * @param hashValue Hash value to introduce some entropy here. - * @return Start index for searching a free page. 
- */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto startPageIndex(auto const& /*acc*/, uint32_t const hashValue) - { - return (hashValue >> 8U) % numPages(); - } - - /** - * @brief Helper that combines the necessary checks to ensure a page index is valid. - * - * @param index The page index to check. - * @return true if the page index is valid and false otherwise - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValidPageIdx(uint32_t const index) const -> bool - { - return index != noFreePageFound() && index < numPages(); - } - - /** - * @brief Main algorithm to create a chunk of memory on a page. - * - * This is the main algorithm for creating a chunk of memory. It searches for a free page and instructs it to - * create some memory. If successful, it returns this pointer. If not, it searches on. - * - * @param numBytes Number of bytes required. - * @param hashValue A hash value used to scatter the memory accesses. - * @return A pointer to a valid piece of memory or nullptr if no available memory could be found. - */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto createChunk( - TAcc const& acc, - uint32_t const numBytes, - uint32_t const hashValue) -> void* - { - auto index = startPageIndex(acc, hashValue); - - // Under high pressure, this loop could potentially run for a long time because the information where and - // when we started our search is not maintained and/or used. This is a feature, not a bug: Given a - // consistent state, the loop will terminate once a free chunk is found or when all chunks are filled for - // long enough that `choosePage` could verify that each page is filled in a single run. - // - // The seemingly non-terminating behaviour that we wrap around multiple times can only occur (assuming a - // consistent, valid state of the data) when there is high demand for memory such that pages that appear - // free to `choosePage` are repeatedly found but then the free chunks are scooped away by other threads. - // - // In the latter case, it is considered desirable to wrap around multiple times until the thread was fast - // enough to acquire some memory. - void* pointer = nullptr; - do - { - // TODO(lenz): This can probably be index++. - index = (index + 1) % numPages(); - uint32_t chunkSize = numBytes; - index = choosePage(acc, numBytes, index, chunkSize); - if(isValidPageIdx(index)) - { - pointer = MyPageInterpretation{pages[index], chunkSize}.create(acc, hashValue); - if(pointer == nullptr) - { - leavePage(acc, index); - } - } - } while(isValidPageIdx(index) and pointer == nullptr); - return pointer; - } - - /** - * @brief Main loop running over all pages checking for available ones. - * - * It is important to stress that the information about availability of the returned page is already stale when - * it is returned. Thus, it can well happen that an actual allocation attempt on this page still fails, e.g., - * because another thread was faster and scooped away that piece of memory. - * - * @param numBytes Required allocation size in number of bytes. - * @param startIndex Index of the page to start the search from. - * @param chunkSizeCache A memory location to store a local copy of the current chunk size. Used for - * optimisation by reducing the number of atomic lookups. - * @return A page index to a potentially available page or noFreePageFound() if none was found.
- */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto choosePage( - TAcc const& acc, - uint32_t const numBytes, - uint32_t const startIndex, - uint32_t& chunkSizeCache) -> uint32_t - { - return wrappingLoop( - acc, - startIndex, - numPages(), - noFreePageFound(), - [this, numBytes, &chunkSizeCache](auto const& localAcc, auto const index) { - return this->thisPageIsSuitable(localAcc, index, numBytes, chunkSizeCache) ? index - : noFreePageFound(); - }); - } - - /** - * @brief Helper function combining checks to match the requested number of bytes with a found chunk size - * taking into account the waste factor. - * - * @param chunkSize Actually found chunk sizes of a page in number of bytes - * @param numBytes Requested allocation size in number of bytes. - * @return true if the page's chunk size may serve the request and false otherwise. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isInAllowedRange( - auto const& acc, - uint32_t const chunkSize, - uint32_t const numBytes) const - { - return T_HeapConfig::isInAllowedRange(acc, chunkSize, numBytes); - } - - /** - * @brief Checks if a page is usable for allocation of numBytes and enters it. - * - * This method looks up the metadata of the page identified by its index to check if we can hope for a - * successful allocation there. In doing so, it enters the page (i.e. increments its filling level) and, if - * necessary, already sets the correct chunk size. In a multi-threaded context the separate concerns of - * checking and setting cannot be split because the information used for the check would already be stale at - * the time of setting anything. If it returns true, the filling level and chunk sizes are thus suitable for - * proceeding further and the caller is responsible for cleaning up appropriately if a failure at a later stage - * occurs. If it returns false, it has already cleaned up everything itself and there is no further action - * required on the caller's side. - * - * @param index Index to identify the page among the raw data pages. - * @param numBytes Requested allocation size in number of bytes. - * @param chunkSizeCache A memory location to store a local copy of the current chunk size. Used for - * optimisation by reducing the number of atomic lookups. - * @return true if the page is suitable and false otherwise - */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto thisPageIsSuitable( - TAcc const& acc, - uint32_t const index, - uint32_t const numBytes, - uint32_t& chunkSizeCache) -> bool - { - bool suitable = false; - auto oldFilling = enterPage(acc, index); - - // At this point, we're only testing against our desired `numBytes`. Due to the `wastefactor` the actual - // `chunkSize` of the page might be larger and, thus, the actual `numChunks` might be smaller than what - // we're testing for here. But if this fails already, we save one atomic. - if(oldFilling < MyPageInterpretation::numChunks(numBytes)) - { - uint32_t oldChunkSize = alpaka::atomicCas(acc, &pageTable.chunkSizes[index], 0U, numBytes); - chunkSizeCache = oldChunkSize == 0U ? numBytes : oldChunkSize; - - // Now that we know the real chunk size of the page, we can check again if our previous assessment was - // correct. But first we need to make sure that we are actually in chunked mode. This will be redundant - // with the second check in most situations because we usually would choose a multi-page threshold that - // would not switch to multi-page mode while more than one chunk fits on the page but this is a design - // decision that could change in the future.
- if(oldChunkSize < multiPageThreshold() - and oldFilling < MyPageInterpretation::numChunks(chunkSizeCache)) - { - suitable = isInAllowedRange(acc, chunkSizeCache, numBytes); - } - } - if(not suitable) - { - leavePage(acc, index); - } - return suitable; - } - - /** - * @brief Counterpart to createChunk freeing up a piece of memory in the chunked mode. See destroy for details. - * - * This is the most difficult part of the algorithm. We will successively remove our metadata from the various - * levels and must be extra careful which information we can still rely on. Most of this complexity is captured - * in leavePage. - * - * @param pointer Pointer to a valid piece of memory created by createChunk. - * @param pageIndex Index of the page the pointer points to. Supplying this is an optimisation because it was - * already computed on a higher level in the call stack. This information would already be contained in - * pointer. - * @param chunkSize Chunk size of the page we're operating on. This is potentially different from the size of - * memory the pointer points to due to the waste factor. - */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC void destroyChunk( - TAcc const& acc, - void* pointer, - uint32_t const pageIndex, - uint32_t const chunkSize) - { - auto page = interpret(pageIndex, chunkSize); - page.destroy(acc, pointer); - leavePage(acc, pageIndex); - } - - /** - * @brief Enter a page for any purpose. - * - * This method is very important. We maintain the invariant that any potentially writing access to a page - * starts by entering and ends by leaving a page. These are currently implemented as updating the filling level - * accordingly. You are not allowed to touch a page unless you have entered it (although multi-page mode uses a - * shortcut here). This implies that we always have to check the filling level before checking for the chunk - * size. - * - * @param pageIndex Identifies the page in the array of raw data pages. - * @return The old filling level for further checks. - */ - template<typename TAcc> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto enterPage(TAcc const& acc, uint32_t const pageIndex) -> uint32_t - { - auto const oldFilling = alpaka::atomicAdd(acc, &pageTable.fillingLevels[pageIndex], 1U); - // We assume that this page has the correct chunk size. If not, the chunk size is either 0 (and oldFilling - // must be 0, too) or the next check will fail. - return oldFilling; - } - - /** - * @brief Leave a page after any potentially modifying operation on it. - * - * This method must be called whenever you have entered a page (using enterPage()). This is a very subtle and - * error-prone method because we are successively removing metadata and need to be extra careful which - * information and guards we can still trust. In the simplest case, there's not much to do but decrease the - * filling level but potentially we're the last thread on the page and need to clean up remaining metadata for - * the threads to come. In that case, we explicitly allow for threads to take over the page as-is to spare us - * the trouble of cleaning up. But doing so opens up many subtle ways of reordering memory accesses. Also, we - * cannot rely on much previous information (like chunk sizes looked up earlier) because other threads might - * have already updated them. Be warned! - * - * @param pageIndex Identifies the page in the array of raw data pages.
- */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC void leavePage(TAcc const& acc, uint32_t const pageIndex) - { - // This outermost atomicSub is an optimisation: We can fast-track this if we are not responsible for the - // clean-up. Using 0U -> 1U in the atomicCAS and comparison further down would have the same effect (if the - // else branch contained the simple subtraction). It's a matter of which case shall have one operation - // less. - auto originalFilling = alpaka::atomicSub(acc, &pageTable.fillingLevels[pageIndex], 1U); - - if constexpr(resetfreedpages) - { - if(originalFilling == 1U) - { - // CAUTION: This section has caused a lot of headaches in the past. We're in a state where the - // filling level is 0 but we have not properly cleaned up the page and the metadata yet. This is on - // purpose because another thread might still take over this page and spare us the trouble of - // freeing everything up properly. But this other thread must take into account the possibility - // that it acquired a second-hand page. Look here if you run into another deadlock. It might well - // be related to this section. - - auto lock = pageSize; - auto latestFilling = alpaka::atomicCas(acc, &pageTable.fillingLevels[pageIndex], 0U, lock); - if(latestFilling == 0U) - { - auto chunkSize = atomicLoad(acc, pageTable.chunkSizes[pageIndex]); - if(chunkSize != 0) - { - // At this point it's guaranteed that the fiilling level is numChunks and thereby locked. - // Furthermore, chunkSize cannot have changed because we maintain the invariant that the - // filling level is always considered first, so no other thread can have passed that - // barrier to reset it. - MyPageInterpretation{pages[pageIndex], chunkSize}.cleanupUnused(); - alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); - - // It is important to keep this after the clean-up line above: Otherwise another thread - // with a smaller chunk size might circumvent our lock and already start allocating before - // we're done cleaning up. - alpaka::atomicCas(acc, &pageTable.chunkSizes[pageIndex], chunkSize, 0U); - } - - // TODO(lenz): Original version had a thread fence at this point in order to invalidate - // potentially cached bit masks. Check if that's necessary! - - // At this point, there might already be another thread (with another chunkSize) on this page - // but that's fine. It won't see the full capacity but we can just subtract what we've added - // before: - alpaka::atomicSub(acc, &pageTable.fillingLevels[pageIndex], lock); - } - } - } - } - - /** - * @brief Counterpart to createOverMultiplePages, freeing up memory in multi-page mode. - * - * This method is way simpler than its chunked version because in multi-page mode we have a hard lock on the - * pages we acquired and are free to manipulate them to our will. We just make sure that releasing this lock is - * the last operation we perform. - * - * @param pageIndex Identifies the first page in the array of raw data pages. - * @param chunkSize The chunk size set on that first page (i.e. the original allocation size). - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC void destroyOverMultiplePages( - auto const& acc, - uint32_t const pageIndex, - uint32_t const chunkSize) - { - auto numPagesNeeded = ceilingDivision(chunkSize, pageSize); - for(uint32_t i = 0; i < numPagesNeeded; ++i) - { - auto myIndex = pageIndex + i; - // Everything inside the following scope is done to reset the free'd pages. 
As opposed to the chunked - // case, we decided to always perform a reset in multi-page mode regardless of the value of - // `resetfreedpages`. If you want to reinstate the old behaviour or add a second parameter - // specifically for multi-page mode, e.g., resetreedpages_multipage, just put an `if constexpr` around - // here again. - { - MyPageInterpretation{pages[myIndex], T_AlignmentPolicy::Properties::dataAlignment}.cleanupFull(); - alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); - alpaka::atomicCas(acc, &pageTable.chunkSizes[myIndex], chunkSize, 0U); - } - alpaka::atomicSub(acc, &pageTable.fillingLevels[myIndex], +pageSize); - } - } - }; - -} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp deleted file mode 100644 index c7596c072d..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp +++ /dev/null @@ -1,533 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz, Rene Widera - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp" -#include "mallocMC/mallocMC_utils.hpp" - -#include -#include - -#include - -#include -#include -#include -#include - -namespace mallocMC::CreationPolicies::FlatterScatterAlloc -{ - namespace detail - { - template - struct BitMaskStorageTypes - { - using type = void; - }; - - template<> - struct BitMaskStorageTypes<16U> - { - using type = uint16_t; - }; - - template<> - struct BitMaskStorageTypes<32U> - { - using type = uint32_t; - }; - - template<> - struct BitMaskStorageTypes<64U> - { - using type = uint64_t; - }; - } // namespace detail - - /** - * @brief Number of bits in a bit mask. Most likely you want a power of two here. - */ - constexpr uint32_t const BitMaskSize = 32U; - - /** - * @brief Type to store the bit masks in. It's implemented as a template in order to facilitate changing the type - * depending on BitMaskSize. Use it with its default template argument in order to make your code agnostic of the - * number configured in BitMaskSize. (Up to providing a template implementation, of course.) 
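For instance, with the default BitMaskSize of 32 the alias defined just below resolves as in this sketch (assuming this header is included; the 64-bit line exercises the explicit specialisation above):

    #include <cstdint>
    #include <type_traits>

    static_assert(std::is_same_v<BitMaskStorageType<>, std::uint32_t>);    // default, BitMaskSize == 32
    static_assert(std::is_same_v<BitMaskStorageType<64U>, std::uint64_t>); // explicitly requested width
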
- */ - template - using BitMaskStorageType = detail::BitMaskStorageTypes::type; - - /** - * @brief Represents a completely filled bit mask, i.e., all bits are one. - */ - template - static constexpr BitMaskStorageType const allOnes = std::numeric_limits>::max(); - - /** - * @brief Return the bit mask's underlying type with a single bit set (=1) at position index and all others unset - * (=0). - * - * @param index Position of the single bit set. - * @return Bit mask's underlying type with one bit set. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto singleBit(uint32_t const index) -> BitMaskStorageType - { - return BitMaskStorageType{1U} << index; - } - - /** - * @brief Return the bit mask's underlying type with all bits up to index from the right are set (=1) and all - * higher bits are unset (=0). - * - * @param index Number of set bits. - * @return Bit mask's underlying type with index bits set. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto allOnesUpTo(uint32_t const index) -> BitMaskStorageType - { - return index == 0 ? 0 : (allOnes >> (size - index)); - } - - /** - * @class BitMaskImpl - * @brief Represents a bit mask basically wrapping the BitMaskStorageType<>. - * - * This class basically provides a convenience interface to the (typically integer) type BitMaskStorageType<> for - * bit manipulations. It was originally modelled closely after std::bitset which is not necessarily available on - * device for all compilers, etc. - * - * Convention: We start counting from the right, i.e., if mask[0] == 1 and all others are 0, then mask = 0...01 - * - * CAUTION: This convention is nowhere checked and we might have an implicit assumption on the endianess here. We - * never investigated because all architectures we're interested in have the same endianess and it works on them. - * - */ - template - struct BitMaskImpl - { - BitMaskStorageType mask{}; - - /** - * @return An invalid bit index indicating the failure of a search in the bit mask. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto noFreeBitFound() -> uint32_t - { - return MyBitMaskSize; - } - - /** - * @brief Look up if the index-th bit is set. - * - * @param index Bit position to check. - * @return true if bit is set else false. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto const index) -> bool - { - return (atomicLoad(acc, mask) & singleBit(index)) != BitMaskStorageType{0U}; - } - - /** - * @brief Set all bits (to 1). - * - * @return Previous mask. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto set(TAcc const& acc) -> BitMaskStorageType - { - return alpaka::atomicOr( - acc, - &mask, - static_cast>(+allOnes)); - } - - /** - * @brief Set the index-th bit (to 1). - * - * @param index Bit position to set. - * @return Previous mask. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto set(TAcc const& acc, auto const index) - { - return alpaka::atomicOr(acc, &mask, singleBit(index)); - } - - /** - * @brief Unset the index-th bit (set it to 0). - * - * @param index Bit position to unset. - * @return Previous mask. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto unset(TAcc const& acc, auto const index) - { - return alpaka::atomicAnd( - acc, - &mask, - static_cast>( - allOnes ^ singleBit(index))); - } - - /** - * @brief Flip all bits in the mask. - * - * @return Previous mask. 
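Spelled out for the default 32-bit mask type, the two helpers singleBit and allOnesUpTo defined above compute the following (a quick sanity check restating their formulae; note that allOnesUpTo special-cases index == 0 because a right-shift by the full bit width would be undefined behaviour):

    #include <cstdint>

    static_assert((std::uint32_t{1U} << 3U) == 0b1000U);  // singleBit<32U>(3)
    static_assert((0xFFFFFFFFU >> (32U - 3U)) == 0b111U); // allOnesUpTo<32U>(3)
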
- */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto flip(TAcc const& acc) - { - return alpaka::atomicXor( - acc, - &mask, - static_cast>(+allOnes)); - } - - /** - * @brief Flip the index-th bits in the mask. - * - * @param index Bit position to flip. - * @return Previous mask. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto flip(TAcc const& acc, auto const index) - { - return alpaka::atomicXor( - acc, - &mask, - static_cast>(singleBit(index))); - } - - /** - * @brief Compare with another mask represented by a BitMaskStorageType<>. CAUTION: This does not use atomics - * and is not thread-safe! - * - * This is not implemented thread-safe because to do so we'd need to add the accelerator as a function argument - * and that would not abide by the interface for operator==. It's intended use is to make (single-threaded) - * tests more readable, so that's not an issue. - * - * @param other Mask to compare with. - * @return true if all bits are identical else false. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator==(BitMaskStorageType const other) const -> bool - { - return (mask == other); - } - - /** - * @brief Spaceship operator comparing with other bit masks. CAUTION: This does not use atomics and is not - * thread-safe! See operator== for an explanation. - * - * @param other Bit mask to compare with. - * @return Positive if this mask > other mask, 0 for equality, negative otherwise. - */ - // My version of clang cannot yet handle the spaceship operator apparently: - // clang-format off - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator<=> (BitMaskImpl const other) const - // clang-format on - { - return (mask - other.mask); - } - - /** - * @brief Check if no bit is set (=1). - * - * @return true if no bit is set else false. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto none() const -> bool - { - return mask == 0U; - } - - /** - * @brief Interface to the main algorithm of finding a free bit. - * - * This algorithm searches for an unset bit and returns its position as an index (which is supposed to be - * translated into a corresponding chunk by the PageInterpretation). Upon doing so, it also sets this bit - * because in a multi-threaded context we cannot separate the concerns of retrieving information and acting on - * the information. It takes a start index that acts as an initial guess but (in the current implementation) it - * does not implement a strict wrapping loop as the other stages do because this would waste valuable - * information obtained from the collective operation on all bits in the mask. - * - * Additionally, it copes with partial masks by ignoring all bit positions beyond numValidBits. - * - * @param numValidBits Bit positions beyond this number will be ignored. - * @param initialGuess Initial guess for the first look up. - * @return Bit position of a free bit or noFreeBitFound() in the case of none found. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBit( - TAcc const& acc, - uint32_t const numValidBits = MyBitMaskSize, - uint32_t const initialGuess = 0) -> uint32_t - { - return firstFreeBitWithInitialGuess(acc, initialGuess % MyBitMaskSize, numValidBits); - } - - private: - /** - * @brief Implementation of the main search algorithm. See the public firstFreeBit method for general details. - * This version assumes a valid range of the input values. - * - * @param initialGuess Initial guess for the first look up must be in the range [0;MyBitMaskSize). - * @param endIndex Maximal position to consider. Bits further out will be ignored. 
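A worked trace of the search loop implemented below may help (shrunk to an 8-bit mask for readability): starting from initialGuess = 0 on mask = 0b0000'0101 with endIndex = 8, the first atomicOr targets bit 0 and returns oldMask = 0b0000'0101, so bit 0 was already taken; ffs(~oldMask) - 1 = 1 then jumps the loop directly to the lowest unset bit, and the next atomicOr claims bit 1 and returns it as the result, without ever probing the also-taken bit 2.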
- * @return Bit position of a free bit or noFreeBitFound() in the case of none found. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBitWithInitialGuess( - TAcc const& acc, - uint32_t const initialGuess, - uint32_t const endIndex) -> uint32_t - { - auto result = noFreeBitFound(); - BitMaskStorageType oldMask = 0U; - - // This avoids a modulo that's not a power of two and is faster thereby. - auto const selectedStartBit = initialGuess >= endIndex ? 0U : initialGuess; - for(uint32_t i = selectedStartBit; i < endIndex and result == noFreeBitFound();) - { - oldMask = alpaka::atomicOr(acc, &mask, singleBit(i)); - if((oldMask & singleBit(i)) == 0U) - { - result = i; - } - - // In case of no free bit found, this will return -1. Storing it in a uint32_t will underflow and - // result in 0xffffffff but that's okay because it also ends the loop as intended. - i = alpaka::ffs(acc, static_cast>>(~oldMask)) - 1; - } - - return result; - } - }; - - using BitMask = BitMaskImpl; - - /** - * @class BitFieldFlat - * @brief Represents a (non-owning) bit field consisting of multiple bit masks. - * - * This class interprets a piece of memory as an array of bit masks and provides convenience functionality to act - * on them as a long array of bits. Most importantly, it provides an interface to find a free bit. It is a - * non-owning view of the memory! - * - * Please note, that methods usually (unless stated otherwise) refer to bits counting all bits from the start of - * the bit field, so if BitMask size is 32 and index=34=31+3, we're checking for the third bit of the second mask - * (if masks was a matrix this would be equivalent to: masks[1][2]). - * - */ - template - struct BitFieldFlatImpl - { - std::span> data; - - /** - * @brief Check if the index-th bit in the bit field is set (=1). - * - * @param index Bit position to check. - * @return true if bit is set else false. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto get(TAcc const& acc, uint32_t index) const -> bool - { - return data[index / MyBitMaskSize](acc, index % MyBitMaskSize); - } - - /** - * @brief Get the index-th mask NOT bit (counting in number of masks and not bits). - * - * @param index Position of the mask. - * @return Requested mask. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getMask(uint32_t const index) const -> BitMaskImpl& - { - return data[index]; - } - - /** - * @brief Set the index-th bit (to 1). - * - * @param index Position of the bit. - * @return - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC void set(TAcc const& acc, uint32_t const index) const - { - data[index / MyBitMaskSize].set(acc, index % MyBitMaskSize); - } - - /** - * @brief Counterpart to set, unsetting (to 0) to index-th bit. - * - * @tparam TAcc - * @param acc - * @param index - * @return - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC void unset(TAcc const& acc, uint32_t const index) const - { - data[index / MyBitMaskSize].unset(acc, index % MyBitMaskSize); - } - - /** - * @return Begin iterator to the start of the array of masks, iterating over masks NOT bits. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto begin() const - { - return std::begin(data); - } - - /** - * @return End iterator to the start of the array of masks, iterating over masks NOT bits. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto end() const - { - return std::end(data); - } - - /** - * @brief Count the number of masks. - * - * @return Number of masks. 
- */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numMasks() const - { - return data.size(); - } - - /** - * @brief Count the number of bits in the array of masks. - * - * This does not take into account if bits are valid or not, so this is always a multiple of the MyBitMaskSize - * currently. - * - * @return Number of bits. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numBits() const - { - return numMasks() * MyBitMaskSize; - } - - /** - * @brief Main algorithm for finding and setting a free bit in the bit field. - * - * This iterates through the masks wrapping around from the given startIndex. The information of how many bits - * are valid is passed through the lower levels which automatically discard out of range results (accounting of - * partially filled masks). As always, we can't separate the concerns of retrieving information and acting on - * it in a multi-threaded context, so if a free bit is found it is immediately set. - * - * @param numValidBits Number of valid bits in the bit field (NOT masks, i.e. it's equal to numChunks() on the - * page). Should typically be a number from the range [MyBitMaskSize * (numMasks()-1) + 1, MyBitMaskSize * - * numMasks()) although other numbers shouldn't hurt. - * @param startIndex Bit mask (NOT bit) to start the search at. - * @return The index of the free bit found (and set) or noFreeBitFound() if none was found. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBit( - TAcc const& acc, - uint32_t numValidBits, - uint32_t const startIndex = 0U) -> uint32_t - { - return wrappingLoop( - acc, - startIndex % numMasks(), - numMasks(), - noFreeBitFound(), - [this, numValidBits](TAcc const& localAcc, auto const index) - { - auto tmp = this->firstFreeBitAt(localAcc, numValidBits, index); - return tmp; - }); - } - - /** - * @return Special invalid bit index to indicate that no free bit was found. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto noFreeBitFound() const -> uint32_t - { - return numBits(); - } - - private: - /** - * @return Position inside of a mask to start the search at. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto startBitIndex() - { - return laneid(); - } - - /** - * @brief Helper function checking if we're in the last mask. - * - * @param numValidBits Number of valid bits in the bit field. The mask containing this bit is the last mask. - * @param maskIndex Index of the mask under consideration (NOT bit). - * @return true if the mask is the last mask else false. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto isThisLastMask( - uint32_t const numValidBits, - uint32_t const maskIndex) - { - // >= in case index == numValidBits - MyBitMaskSize - return (maskIndex + 1) * MyBitMaskSize >= numValidBits; - } - - /** - * @brief Implementation of the main algorithm asking a mask of a free bit and checking if the answer is valid. - * - * @param numValidBits Number of valid bits in the bit field. - * @param maskIdx Index of the maks under consideration. - * @return Index of the free bit found IN THE BITFIELD (not only in the mask, so this value can be larger than - * MyBitMaskSize) or noFreeBitFound() if none was found. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBitAt( - TAcc const& acc, - uint32_t const numValidBits, - uint32_t const maskIdx) -> uint32_t - { - auto numValidBitsInLastMask = (numValidBits ? ((numValidBits - 1U) % MyBitMaskSize + 1U) : 0U); - auto indexInMask = getMask(maskIdx).firstFreeBit( - acc, - isThisLastMask(numValidBits, maskIdx) ? 
numValidBitsInLastMask : MyBitMaskSize, - startBitIndex()); - if(indexInMask < BitMaskImpl::noFreeBitFound()) - { - uint32_t freeBitIndex = indexInMask + MyBitMaskSize * maskIdx; - if(freeBitIndex < numValidBits) - { - return freeBitIndex; - } - } - return noFreeBitFound(); - } - }; - - using BitFieldFlat = BitFieldFlatImpl; -} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp deleted file mode 100644 index 9f20c7d001..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include - -namespace mallocMC::CreationPolicies::FlatterScatterAlloc -{ - /** - * @class DataPage - * @brief Raw piece of memory of size T_pageSize - */ - template - struct DataPage - { - char data[T_pageSize]{}; - }; -} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp deleted file mode 100644 index 3f0bf82c4c..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp +++ /dev/null @@ -1,343 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz, Rene Widera - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" -#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" -#include "mallocMC/mallocMC_utils.hpp" - -#include - -#include -#include -#include - -namespace mallocMC::CreationPolicies::FlatterScatterAlloc -{ - /** - * @class PageInterpretation - * @brief Represent our interpretation of a raw data page. - * - * This class takes a reference to a raw data page and a chunk size and provides an interface to this raw memory to - * use is as a data page filled with chunks and corresponding bit masks indicating their filling. It furthermore - * provides static helper functions that implement formulae not tied to a particular piece of memory like the - * number of chunks given a chunk sizes (and the implicit page size). - * - * @param data Raw data page reference. - * @param chunkSize Chunk sizes to interpret this memory with. - */ - template - struct PageInterpretation - { - private: - DataPage& data; - uint32_t const chunkSize; - - public: - ALPAKA_FN_INLINE ALPAKA_FN_ACC PageInterpretation(DataPage& givenData, uint32_t givenChunkSize) - : data(givenData) - , chunkSize(givenChunkSize) - { - } - - /** - * @brief Compute the number of chunks of the given size that would fit onto a page. - * - * This is not quite a trivial calculation because we have to take into account the size of the bit field at - * the end which itself depends on the number of chunks. Due to the quantisation into fixed-size bit masks we - * are in the realm of integer divisions and remainders here. - * - * This is a static version of the algorithm because there's no reference to the data at all. Convenience - * version of that uses the chunk size of an instance is provided below. - * - * @param chunkSize The chunk size to use for the calculation. - * @return Number of chunks that would fit on a page. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto numChunks(uint32_t const chunkSize) -> uint32_t - { - constexpr auto b = static_cast>(sizeof(BitMask)); - auto const numFull = T_pageSize / (BitMaskSize * chunkSize + b); - auto const leftOverSpace = T_pageSize - numFull * (BitMaskSize * chunkSize + b); - auto const numInRemainder = leftOverSpace > b ? (leftOverSpace - b) / chunkSize : 0U; - return numFull * BitMaskSize + numInRemainder; - } - - /** - * @brief Convenience method calling numChunks(chunkSize) with the currently set chunkSize. See there for - * details. - * - * @return Number of chunks that fit on this page. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numChunks() const -> uint32_t - { - return numChunks(chunkSize); - } - - /** - * @brief Convert a chunk index into a pointer to that piece of memory. - * - * @param index Chunk index < numChunks(). - * @return Pointer to that chunk. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto chunkPointer(uint32_t index) const -> void* - { - return reinterpret_cast(&data.data[index * chunkSize]); - } - - /** - * @brief Lightweight mangling of the hash into a start point for searching in the bit field. 
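For concreteness, evaluating numChunks above with example numbers, T_pageSize = 4096, chunkSize = 32 and 32-bit masks (so sizeof(BitMask) = 4), gives:

    // numFull        = 4096 / (32 * 32 + 4)  = 3    full mask-segments
    // leftOverSpace  = 4096 - 3 * 1028       = 1012 bytes
    // numInRemainder = (1012 - 4) / 32       = 31   chunks
    // numChunks      = 3 * 32 + 31           = 127  chunks
    //
    // check: 127 * 32 B of chunks + 4 * 4 B of masks = 4080 B <= 4096 B,
    // while a 128th chunk would need 4096 + 16 B and no longer fit.
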
- * - * It is important to stress that this returns an index of a bit mask, not an individual bit's index. So, if - * the BitMaskSize is 32 and I have 64 chunks on the page, there are two bit masks and the return value is - * either 0 or 1, i.e. the search would start at the 0th or 32nd bit. - * - * @param hashValue Number providing some entropy for scattering memory accesses. - * @return Index of a bit mask to start searching at. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto startBitMaskIndex(uint32_t const hashValue) const - { - return (hashValue >> 16); - } - - /** - * @brief Main allocation algorithm searching a free bit in the bit mask and returning the corresponding - * pointer to a chunk. - * - * @param hashValue Number providing some entropy for scattering memory accesses. - * @return Pointer to a valid piece of memory or nullptr if none was found. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create(TAcc const& acc, uint32_t const hashValue = 0U) -> void* - { - auto field = bitField(); - auto const index = field.firstFreeBit(acc, numChunks(), startBitMaskIndex(hashValue)); - return (index < field.noFreeBitFound()) ? chunkPointer(index) : nullptr; - } - - /** - * @brief Counterpart to create, freeing an allocated pointer's memory. - * - * In production, this does not check the validity of the pointer and providing an invalid pointer is undefined - * behaviour. This includes valid pointers to outside the range of this page, obviously. - * - * @param pointer Pointer to a piece of memory created from the create method. - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* pointer) -> void - { - if(chunkSize == 0) - { -#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) - throw std::runtime_error{ - "Attempted to destroy a pointer with chunkSize==0. Likely this page was recently " - "(and potentially pre-maturely) freed."}; -#endif // NDEBUG - return; - } - auto chunkIndex = chunkNumberOf(pointer); -#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) - if(not isValid(acc, chunkIndex)) - { - throw std::runtime_error{"Attempted to destroy an invalid pointer! Either the pointer does not point " - "to a valid chunk or it is not marked as allocated."}; - } -#endif // NDEBUG - bitField().unset(acc, chunkIndex); - } - - /** - * @brief Convenience method to retrieve the configured minimal chunk size. - * - * @return Minimal possible chunk size of the page. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto minimalChunkSize() -> uint32_t - { - return T_minimalChunkSize; - } - - /** - * @brief Clean up the full bit field region. - * - * This method is supposed to be used on raw memory and cleans up the maximal possible bit field region without - * assuming anything about its previous content. It is supposed to be used during initialisation of raw memory - * and after leaving a page in multi-page mode when arbitrary data is potentially found in that region. There - * is a further optimised version of clean-up for cases where this page was in use in chunked mode before. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanupFull() -> void - { - PageInterpretation(data, minimalChunkSize()).resetBitField(); - } - - /** - * @brief Clean up previously unused parts of the bit field region. - * - * This method is supposed to have the same effect as cleanupFull but only on pages that are already in use in - * chunked mode. 
Due to this additional assumption we can conclude that the part that currently acts as bit - * field is already nulled (because we're the last ones on the page about to clean up, so all bits are unset). - * This significantly reduces the size of the region that needs cleaning if a small chunk size was set - * previously. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanupUnused() -> void - { - auto worstCasePage = PageInterpretation(data, minimalChunkSize()); - memset( - static_cast(worstCasePage.bitFieldStart()), - 0U, - worstCasePage.bitFieldSize() - bitFieldSize()); - } - - /** - * @brief Reset the currently used bit field to 0. - * - * This was introduced to be called on pages interpreted with the minimal chunk size to fully clean up the bit - * field region. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto resetBitField() -> void - { - // This method is not thread-safe by itself. But it is supposed to be called after acquiring a "lock" in - // the form of setting the filling level, so that's fine. - - memset(static_cast(bitFieldStart()), 0U, bitFieldSize()); - } - - /** - * @brief Checks if a pointer points to an allocated chunk of memory on this page. - * - * This is not used in production and is not thread-safe in the sense that the information is stale as soon as - * it's returned. It is used in debug mode and can be used for (single-threaded) tests. - * - * @param pointer The pointer in question. - * @return true if the pointer points to an allocated chunk of memory, false otherwise - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, void* const pointer) const -> bool - { - // This function is neither thread-safe nor particularly performant. It is supposed to be used in tests and - // debug mode. - return isValid(acc, chunkNumberOf(pointer)); - } - - private: - /** - * @brief Helper method for isValid(pointer) that acts on the level of the chunk's index which translates to - * the bit field position easier than the pointer. - * - * @param chunkIndex Index to a chunk to check. - * @return true if the chunk with this index is allocated, false otherwise - */ - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, int32_t const chunkIndex) const -> bool - { - return chunkIndex >= 0 and chunkIndex < static_cast(numChunks()) and isAllocated(acc, chunkIndex); - } - - template - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isAllocated(TAcc const& acc, uint32_t const chunkIndex) const -> bool - { - return bitField().get(acc, chunkIndex); - } - - public: - /** - * @brief Return the bit field of this page. - * - * @return Bit field of this page. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitField() const -> BitFieldFlat - { - return BitFieldFlat{{bitFieldStart(), ceilingDivision(numChunks(), BitMaskSize)}}; - } - - /** - * @brief Return a pointer to the first bit mask. - * - * @return Pointer to the first bit mask. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitFieldStart() const -> BitMask* - { - return reinterpret_cast(&data.data[T_pageSize - bitFieldSize()]); - } - - /** - * @brief Convenience method to compute the bit field size of the current page. Forwards to its static version. - * See there for details. - * - * @return Size of this pages bit field in number of bytes. - */ - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitFieldSize() const -> uint32_t - { - return bitFieldSize(chunkSize); - } - - /** - * @brief Compute the size of the bit field region in number of bytes for a page with the given chunk size. 
- *
- * There is an instance method using the instance's chunk size for convenience.
- *
- * @param chunkSize Chunk size of the would-be page.
- * @return Size of this page's bit field in number of bytes.
- */
- ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto bitFieldSize(uint32_t const chunkSize) -> uint32_t
- {
- return sizeof(BitMask) * ceilingDivision(numChunks(chunkSize), BitMaskSize);
- }
-
- /**
- * @brief Compute the maximal possible size of the bit field in number of bytes.
- *
- * This is practically the bit field size of an instance with the minimalChunkSize().
- *
- * @return Maximal possible size of the bit field in number of bytes.
- */
- ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto maxBitFieldSize() -> uint32_t
- {
- return PageInterpretation::bitFieldSize(minimalChunkSize());
- }
-
- /**
- * @brief Compute a chunk index given a pointer.
- *
- * Please note that this will return invalid indices for invalid input pointers. Be sure to guard against this
- * if you don't want to risk messing up your memory.
- *
- * @param pointer A pointer interpreted to be pointing to a chunk of the current page.
- * @return A valid index to a chunk on this page if the pointer was valid. A potentially negative number
- * outside the valid range of chunk indices otherwise.
- */
- ALPAKA_FN_INLINE ALPAKA_FN_ACC auto chunkNumberOf(void* pointer) const -> int32_t
- {
- return indexOf(pointer, &data, chunkSize);
- }
-
- // these are supposed to be temporary objects, don't start messing around with them:
- PageInterpretation(PageInterpretation const&) = delete;
- PageInterpretation(PageInterpretation&&) = delete;
- auto operator=(PageInterpretation const&) -> PageInterpretation& = delete;
- auto operator=(PageInterpretation&&) -> PageInterpretation& = delete;
- ~PageInterpretation() = default;
- };
-} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc
diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp
deleted file mode 100644
index d040bc128b..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- mallocMC: Memory Allocator for Many Core Architectures.
-
- Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf
-
- Author(s): Julian Johannes Lenz, Rene Widera
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/ - - -#include - -#include - -/** - * @brief Abstraction of a short-circuiting loop that wraps around from an arbitrary starting point within the range. - * - * This implements a re-occuring pattern in the code: Due to the scattering approach taken, we're often in a position - * where we want to run a simple loop except for the fact that we start in an arbitrary position within the range and - * complete it by wrapping around to the start of the range continuing from there. Furthermore, these loops are all - * searches, so it's advantageous to implement short-circuiting by early exit in case of finding another value than the - * provided failureValue. - * - * @tparam T_size Type of size-like arguments. This function is used in various contexts where this can either be - * size_t or uint32_t. - * @tparam TFunctor Type of the function representing the loop body (typically a lambda function). - * @tparam TArgs Types of additional arguments provided to the function. - * @param startIndex Index to start the loop at. - * @param size Size of the range which equals the number of iterations to be performed in total. - * @param failureValue Return value of the function indicating a failure of the current iteration and triggering the - * next iteration. - * @param func Function of type TFunctor representing the loop body. It is supposed to return a value of - * decltype(failureValue) and indicate failure by returning the latter. Any other value is interpreted as success - * triggering early exit of the loop. - * @param args Additional arguments to be provided to the function on each iteration. - * @return The return value of func which might be failureValue in case all iterations failed. - */ -template -ALPAKA_FN_INLINE ALPAKA_FN_ACC auto wrappingLoop( - TAcc const& acc, - T_size const startIndex, - T_size const size, - auto failureValue, - TFunctor func, - TArgs... args) -{ - for(uint32_t i = 0; i < size; ++i) - { - auto result = func(acc, (i + startIndex) % size, args...); - if(result != failureValue) - { - return result; - } - } - return failureValue; -} diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp deleted file mode 100644 index 2b78526e56..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "OldMalloc.hpp" - -#include - -#include - -namespace mallocMC -{ - namespace CreationPolicies - { - /** - * @brief classic malloc/free behaviour known from CUDA - * - * This CreationPolicy implements the classic device-side malloc and - * free system calls that is offered by CUDA-capable accelerator of - * compute capability 2.0 and higher - */ - class OldMalloc - { - using uint32 = std::uint32_t; - - public: - template - using AlignmentAwarePolicy = OldMalloc; - - static constexpr auto providesAvailableSlots = false; - - template - ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) const -> void* - { - return ::malloc(static_cast(bytes)); - } - - template - ALPAKA_FN_ACC void destroy(AlpakaAcc const& /*acc*/, void* mem) const - { - ::free(mem); - } - - ALPAKA_FN_ACC auto isOOM(void* p, size_t s) const -> bool - { - return s != 0 && (p == nullptr); - } - - template - static void initHeap( - AlpakaDevice& dev, - AlpakaQueue& queue, - T_DeviceAllocator* heap, - void* pool, - size_t memsize) - { - } - - static auto classname() -> std::string - { - return "OldMalloc"; - } - }; - - } // namespace CreationPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp deleted file mode 100644 index db9e4dc57e..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp +++ /dev/null @@ -1,1404 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - http://www.icg.tugraz.at/project/mvp - - Copyright (C) 2012 Institute for Computer Graphics and Vision, - Graz University of Technology - Copyright (C) 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at - Rene Widera - r.widera ( at ) hzdr.de - Axel Huebl - a.huebl ( at ) hzdr.de - Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#pragma once - -#include "../mallocMC_utils.hpp" - -#include -#include -#include - -#include -#include -#include /* uint32_t */ -#include -#include -#include -#include -#include -#include - -namespace mallocMC -{ - namespace CreationPolicies - { - namespace ScatterConf - { - struct DefaultScatterConfig - { - //! Size in byte of a page. - static constexpr auto pagesize = 4096; - /** Size in byte of an access block. - * - * Scatter alloc will keep allocations within an access block to reduce the translation lookaside - * buffer (tlb) pressure. accessblocksize can be used to optimize for the tlb of a device. - */ - static constexpr auto accessblocksize = 2u * 1024u * 1024u * 1024u; - //! Number of pages per region. - static constexpr auto regionsize = 16; - //! Factor used to calculate maximal allowed wast depending on the byte. - static constexpr auto wastefactor = 2; - /** Defines if a fully freed pages chunk size should be reset. - * - * true = Chunk size of a page will be reset if free. - * false = A page will keep the chunk size selected during the first page usage over - * the full application runtime. - */ - static constexpr auto resetfreedpages = false; - }; - - struct DefaultScatterHashingParams - { - static constexpr auto hashingK = 38183; - static constexpr auto hashingDistMP = 17497; - static constexpr auto hashingDistWP = 1; - static constexpr auto hashingDistWPRel = 1; - }; - } // namespace ScatterConf - - /** - * @brief fast memory allocation based on ScatterAlloc - * - * This CreationPolicy implements a fast memory allocator that trades - * speed for fragmentation of memory. This is based on the memory - * allocator "ScatterAlloc" - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604), - * and is extended to report free memory slots of a given size (both on - * host and accelerator). To work properly, this policy class requires a - * pre-allocated heap on the accelerator and works only with Nvidia CUDA - * capable accelerators that have at least compute capability 2.0. - * - * @tparam T_Config (optional) configure the heap layout. The - * default can be obtained through Scatter<>::HeapProperties - * @tparam T_Hashing (optional) configure the parameters for - * the hashing formula. The default can be obtained through - * Scatter<>::HashingProperties - */ - template< - class T_Config = ScatterConf::DefaultScatterConfig, - class T_Hashing = ScatterConf::DefaultScatterHashingParams, - class T_AlignmentPolicy = void> - class ScatterImpl - { - public: - // TODO(lenz): This is a bit of a round trip due to a change of interface. A larger refactoring should - // remove this again. - template - using AlignmentAwarePolicy = ScatterImpl; - - using HeapProperties = T_Config; - using HashingProperties = T_Hashing; - - struct Properties - : HeapProperties - , HashingProperties - { - }; - - static constexpr auto providesAvailableSlots = true; - - private: - using uint32 = std::uint32_t; - -/** Allow for a hierarchical validation of parameters: - * - * shipped default-parameters (in the inherited struct) have lowest precedence. - * They will be overridden by a given configuration struct. However, even the - * given configuration struct can be overridden by compile-time command line - * parameters (e.g. 
-D MALLOCMC_CP_SCATTER_PAGESIZE 1024) - * - * default-struct < template-struct < command-line parameter - */ -#ifndef MALLOCMC_CP_SCATTER_PAGESIZE -# define MALLOCMC_CP_SCATTER_PAGESIZE (HeapProperties::pagesize) -#endif - static constexpr uint32 pagesize = MALLOCMC_CP_SCATTER_PAGESIZE; - -#ifndef MALLOCMC_CP_SCATTER_ACCESSBLOCKSIZE -# define MALLOCMC_CP_SCATTER_ACCESSBLOCKSIZE (HeapProperties::accessblocksize) -#endif - static constexpr size_t accessblocksize = MALLOCMC_CP_SCATTER_ACCESSBLOCKSIZE; - -#ifndef MALLOCMC_CP_SCATTER_REGIONSIZE -# define MALLOCMC_CP_SCATTER_REGIONSIZE (HeapProperties::regionsize) -#endif - static constexpr uint32 regionsize = MALLOCMC_CP_SCATTER_REGIONSIZE; - -#ifndef MALLOCMC_CP_SCATTER_WASTEFACTOR -# define MALLOCMC_CP_SCATTER_WASTEFACTOR (HeapProperties::wastefactor) -#endif - static constexpr uint32 wastefactor = MALLOCMC_CP_SCATTER_WASTEFACTOR; - -#ifndef MALLOCMC_CP_SCATTER_RESETFREEDPAGES -# define MALLOCMC_CP_SCATTER_RESETFREEDPAGES (HeapProperties::resetfreedpages) -#endif - static constexpr bool resetfreedpages = MALLOCMC_CP_SCATTER_RESETFREEDPAGES; - - public: - static constexpr uint32 _pagesize = pagesize; - static constexpr size_t _accessblocksize = accessblocksize; - static constexpr uint32 _regionsize = regionsize; - static constexpr uint32 _wastefactor = wastefactor; - static constexpr bool _resetfreedpages = resetfreedpages; - - private: -#if _DEBUG || ANALYSEHEAP - - public: -#endif - /* HierarchyThreshold defines the largest chunk size which can be stored in a segment with hierarchy. - * 32 chunks can be stored without an on page bitmask, therefore a hierarchy is only useful if we store at - * least 33 chunks. For 33 chunks we need two bitmasks, each 32bit. - */ - static constexpr uint32 HierarchyThreshold = (pagesize - 2u * sizeof(uint32)) / 33u; - /* Calculate minimal chunk size which can fill a page, this avoids that small allocations - * fragment the heap and increases the possibility that a small allocation can reuse an - * existing chunk. - * Each page can have 32x32 chunks. To maintain 32 chunks we need 32 bitmask on the page (each 32bit) - * - * @note: There is no requirement that minChunksSize is a power of two. - */ - static constexpr uint32 minChunkSize = (pagesize - 32u * sizeof(uint32)) / (32u * 32u); - static constexpr uint32 minSegmentSize = 32u * minChunkSize + sizeof(uint32); - // Number of possible on page masks without taking the limit of 32 masks into account. - static constexpr uint32 onPageMasks - = minChunkSize > HierarchyThreshold ? 0u : (pagesize + (minSegmentSize - 1u)) / minSegmentSize; - // The scatter malloc hierarchy design allows only 32 on page bit masks. 
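Plugging the default pagesize of 4096 into the formulae above gives (a quick numeric cross-check; the last value is what the declaration right below caps to 32):

    // HierarchyThreshold = (4096 - 2 * 4) / 33         = 123 bytes
    // minChunkSize       = (4096 - 32 * 4) / (32 * 32) = 3 bytes
    // minSegmentSize     = 32 * 3 + 4                  = 100 bytes
    // onPageMasks        = (4096 + 99) / 100           = 41
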
- static constexpr uint32 maxOnPageMasks = std::min(32u, onPageMasks); - -#ifndef MALLOCMC_CP_SCATTER_HASHINGK -# define MALLOCMC_CP_SCATTER_HASHINGK (HashingProperties::hashingK) -#endif - static constexpr uint32 hashingK = MALLOCMC_CP_SCATTER_HASHINGK; - -#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTMP -# define MALLOCMC_CP_SCATTER_HASHINGDISTMP (HashingProperties::hashingDistMP) -#endif - static constexpr uint32 hashingDistMP = MALLOCMC_CP_SCATTER_HASHINGDISTMP; - -#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWP -# define MALLOCMC_CP_SCATTER_HASHINGDISTWP (HashingProperties::hashingDistWP) -#endif - static constexpr uint32 hashingDistWP = MALLOCMC_CP_SCATTER_HASHINGDISTWP; - -#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWPREL -# define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL (HashingProperties::hashingDistWPRel) -#endif - static constexpr uint32 hashingDistWPRel = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL; - - /** Page Table Entry struct - * - * The PTE holds basic information about each page - */ - struct PTE - { - uint32 chunksize; - /** Counter for how many page table entries are used. - * - * This counter is used internally as lock, to guard a full PTE the value must be set to pagesize via - * atomic CAS. - */ - uint32 count; - uint32 bitmask; - - ALPAKA_FN_ACC void init() - { - chunksize = 0; - count = 0; - bitmask = 0; - } - }; - - /** - * Page struct - * The page struct is used to access the data on the page more - * efficiently and to clear the area on the page, which might hold - * bitsfields later one - */ - struct Page - { - char data[pagesize]; - - /** - * The pages init method - * This method initializes the region on the page which might - * hold bit fields when the page is used for a small chunk size - * @param previous_chunksize the chunksize which was uses for - * the page before - */ - ALPAKA_FN_ACC void init() - { - /* Clear the entire data which can hold bitfields. - * volatile avoids that the data is changed within L1 Cache and therefore is hidden for other - * threads. - */ - uint32 volatile* write = (uint32*) (data + pagesize - (int) (sizeof(uint32) * maxOnPageMasks)); - while(write < (uint32*) (data + pagesize)) - *write++ = 0; - } - }; - - // the data used by the allocator - - volatile PTE* _ptes; - uint32 volatile* _regions; - Page* _page; - size_t _memsize; - uint32 _numpages; - uint32 _accessblocks; - uint32 _pagebasedMutex; - uint32 volatile _firstFreePageBased; - uint32 volatile _firstfreeblock; - - /** - * randInit should create an random offset which can be used - * as the initial position in a bitfield - */ - static ALPAKA_FN_ACC inline auto randInit() -> uint32 - { - // start with the laneid offset - return laneid(); - } - - /** - * randInextspot delivers the next free spot in a bitfield - * it searches for the next unset bit to the left of spot and - * returns its offset. if there are no unset bits to the left - * then it wraps around - * @param bitfield the bitfield to be searched for - * @param spot the spot from which to search to the left, range [0,spots) - * @param spots number of bits that can be used - * @return next free spot in the bitfield - */ - static ALPAKA_FN_ACC inline auto nextspot(auto const& acc, uint32 bitfield, uint32 spot, uint32 spots) - -> uint32 - { - uint32 const low_part = (spot + 1) == sizeof(uint32) * CHAR_BIT ? 0u : (bitfield >> (spot + 1)); - uint32 const high_part = (bitfield << (spots - (spot + 1))); - uint32 const selection_mask = spots == sizeof(uint32) * CHAR_BIT ? 
~0 : ((1u << spots) - 1); - // wrap around the bitfields from the current spot to the left - bitfield = (high_part | low_part) & selection_mask; - // compute the step from the current spot in the bitfield - uint32 const step = alpaka::ffs(acc, static_cast>(~bitfield)); - // and return the new spot - return (spot + step) % spots; - } - - /** - * onPageMasksPosition returns a pointer to the beginning of the - * onpagemasks inside a page. - * @param page the page that holds the masks - * @param the number of hierarchical page tables (bitfields) that - * are used inside this mask. - * @return pointer to the first address inside the page that holds - * metadata bitfields. - */ - ALPAKA_FN_ACC inline auto onPageMasksPosition(uint32 page, uint32 nMasks) -> uint32* - { - return (uint32*) (_page[page].data + pagesize - (int) sizeof(uint32) * nMasks); - } - - /** - * usespot marks finds one free spot in the bitfield, marks it and - * returns its offset - * @param bitfield pointer to the bitfield to use - * @param spots overall number of spots the bitfield is responsible - * for - * @return if there is a free spot it returns the spot'S offset, - * otherwise -1 - */ - template - static ALPAKA_FN_ACC inline auto usespot(AlpakaAcc const& acc, uint32* bitfield, uint32 spots) -> int - { - // get first spot - uint32 spot = randInit() % spots; - for(;;) - { - uint32 const mask = 1u << spot; - uint32 const old = alpaka::atomicOp(acc, bitfield, mask); - if((old & mask) == 0) - return spot; - // note: popc(old) == spots should be sufficient, - // but if someone corrupts the memory we end up in an - // endless loop in here... - if(alpaka::popcount(acc, old) >= static_cast(spots)) - return -1; - spot = nextspot(acc, old, spot, spots); - } - } - - /** - * calcAdditionalChunks determines the number of chunks that are - * contained in the last segment of a hierarchical page - * - * The additional checks are necessary to ensure correct results for - * very large pages and small chunksizes - * - * @param fullsegments the number of segments that can be completely - * filled in a page. This may NEVER be bigger than 32! - * @param segmentsize the number of bytes that are contained in a - * completely filled segment (32 chunks) - * @param chunksize the chosen allocation size within the page - * @return the number of additional chunks that will not fit in one - * of the fullsegments. 
For any correct input, this number is - * smaller than 32 - */ - template - static ALPAKA_FN_ACC inline auto calcAdditionalChunks( - AlpakaAcc const& acc, - uint32 fullsegments, - uint32 segmentsize, - uint32 chunksize) -> uint32 - { - if(fullsegments != 32) - return alpaka::math::min( - acc, - 31U, - alpaka::math::max( - acc, - 0U, - (int) pagesize - (int) fullsegments * segmentsize - (int) sizeof(uint32)) - / chunksize); - else - return 0; - } - - /** - * addChunkHierarchy finds a free chunk on a page which uses bit - * fields on the page - * @param chunksize the chunksize of the page - * @param fullsegments the number of full segments on the page (a 32 - * bits on the page) - * @param additional_chunks the number of additional chunks in last - * segment (less than 32 bits on the page) - * @param page the page to use - * @return pointer to a free chunk on the page, 0 if we were unable - * to obtain a free chunk - */ - template - ALPAKA_FN_ACC inline auto addChunkHierarchy( - AlpakaAcc const& acc, - uint32 chunksize, - uint32 fullsegments, - uint32 additional_chunks, - uint32 page) -> void* - { - uint32 const segments = fullsegments + (additional_chunks > 0 ? 1 : 0); - uint32 spot = randInit() % segments; - uint32 const mask = _ptes[page].bitmask; - if((mask & (1u << spot)) != 0) - spot = nextspot(acc, mask, spot, segments); - uint32 const tries = segments - alpaka::popcount(acc, mask); - uint32* onpagemasks = onPageMasksPosition(page, segments); - for(uint32 i = 0; i < tries; ++i) - { - int const hspot = usespot(acc, &onpagemasks[spot], spot < fullsegments ? 32 : additional_chunks); - if(hspot != -1) - return _page[page].data + (32 * spot + hspot) * chunksize; - alpaka::atomicOp(acc, (uint32*) &_ptes[page].bitmask, 1u << spot); - spot = nextspot(acc, mask, spot, segments); - } - return 0; - } - - /** - * addChunkNoHierarchy finds a free chunk on a page which uses the - * bit fields of the pte only - * @param chunksize the chunksize of the page - * @param page the page to use - * @param spots the number of chunks which fit on the page - * @return pointer to a free chunk on the page, 0 if we were unable - * to obtain a free chunk - */ - template - ALPAKA_FN_ACC inline auto addChunkNoHierarchy( - AlpakaAcc const& acc, - uint32 chunksize, - uint32 page, - uint32 spots) -> void* - { - int const spot = usespot(acc, (uint32*) &_ptes[page].bitmask, spots); - if(spot == -1) - return 0; // that should be impossible :) - return _page[page].data + spot * chunksize; - } - - /** - * tryUsePage tries to use the page for the allocation request - * @param page the page to use - * @param chunksize the chunksize of the page - * @param isChunkSizeInRange functor to validate if a given chunk size can be used even if the size is - * different to the parameter chunksize. 
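As a worked example for calcAdditionalChunks above, with pagesize = 4096 and chunksize = 100:

    // segmentsize       = 32 * 100 + 4            = 3204 bytes
    // fullsegments      = min(32, 4096 / 3204)    = 1
    // additional_chunks = (4096 - 3204 - 4) / 100 = 8
    //
    // The page thus holds 32 + 8 = 40 chunks of 100 bytes each.
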
Required interface: `bool operator()(uint32_t)` returning true if
- * the chunk size is usable, false otherwise
- * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk
- */
- template<typename AlpakaAcc, typename T_ChunkSizeRangeCheck>
- ALPAKA_FN_ACC inline auto tryUsePage(
- AlpakaAcc const& acc,
- uint32 page,
- uint32 chunksize,
- T_ChunkSizeRangeCheck&& isChunkSizeInRange) -> void*
- {
- void* chunk_ptr = nullptr;
-
- // increase the fill level
- uint32 const filllevel = alpaka::atomicOp<alpaka::AtomicAdd>(acc, (uint32*) &(_ptes[page].count), 1u);
-
- // if resetfreedpages == false we do not need to re-check chunksize
- bool tryAllocMem = !resetfreedpages;
-
- if(filllevel < pagesize)
- {
- if constexpr(resetfreedpages)
- {
- /* Re-check chunk size (it could be that the page got freed in the meanwhile...)
- * Use atomic to guarantee that no other thread deleted the page and reinitialized
- * it with another chunk size.
- *
- * In case the page is now free (chunksize == 0) we acquire the new chunk size.
- * In cases where the page already has a chunksize we test if the chunksize fits our needs.
- */
- uint32 const oldChunksize = alpaka::atomicOp<alpaka::AtomicCas>(
- acc,
- (uint32*) &_ptes[page].chunksize,
- 0u,
- chunksize);
- if(oldChunksize == 0u || isChunkSizeInRange(oldChunksize))
- tryAllocMem = true;
- // update the chunk size used for the allocation if the PTE was not empty before.
- if(oldChunksize != 0)
- chunksize = oldChunksize;
- }
- }
- else
- {
- // note: if filllevel >= pagesize then the page is currently being freed by another thread
- tryAllocMem = false;
- }
-
- if(tryAllocMem)
- {
- if(chunksize <= HierarchyThreshold)
- {
- // more chunks than can be covered by the pte's single
- // bitfield can be used
- uint32 const segmentsize = chunksize * 32 + sizeof(uint32);
- uint32 const fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize);
- uint32 const additional_chunks
- = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize);
- if(filllevel < fullsegments * 32 + additional_chunks)
- chunk_ptr = addChunkHierarchy(acc, chunksize, fullsegments, additional_chunks, page);
- }
- else
- {
- uint32 const chunksinpage = alpaka::math::min(acc, pagesize / chunksize, 32u);
- if(filllevel < chunksinpage)
- chunk_ptr = addChunkNoHierarchy(acc, chunksize, page, chunksinpage);
- }
- }
-
- // this page is full or not usable
- if(chunk_ptr == nullptr)
- {
- uint32_t oldFillLevel
- = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &(_ptes[page].count), 1u);
- if(oldFillLevel == 1u)
- {
- // our decrement brought the fill level back to zero, so try to clean the page
- tryCleanPage(acc, page);
- }
- }
-
- return chunk_ptr;
- }
-
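The core of `tryUsePage` above is an optimistic reservation protocol: the fill counter is incremented before any bit field is touched, a failed attempt is rolled back with a decrement, and the thread whose decrement returns the counter to zero becomes responsible for the page cleanup. A minimal host-side model of that protocol, with `std::atomic` standing in for the alpaka atomics and the bit-field logic abstracted into a callback:

```c++
#include <atomic>
#include <cstdint>

struct PageCounter
{
    std::atomic<std::uint32_t> count{0}; // fill level, as in PTE::count
};

// TryGrab is any callable that attempts to take a chunk and returns nullptr on failure.
template<typename TryGrab>
void* reserveOnPage(PageCounter& pte, std::uint32_t capacity, TryGrab&& tryGrab)
{
    std::uint32_t const fillLevel = pte.count.fetch_add(1u); // optimistic reservation
    void* chunk = (fillLevel < capacity) ? tryGrab() : nullptr;
    if(chunk == nullptr)
    {
        // roll back the reservation; the thread that hits zero cleans the page
        if(pte.count.fetch_sub(1u) == 1u)
        {
            /* tryCleanPage(...) would run here */
        }
    }
    return chunk;
}

int main()
{
    PageCounter pte;
    int chunk = 0;
    void* p = reserveOnPage(pte, 4u, [&]() -> void* { return &chunk; });
    return p == &chunk ? 0 : 1;
}
```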
- /**
- * allocChunked tries to allocate the demanded number of bytes on
- * one of the pages
- * @param bytes the number of bytes to allocate, must be <=pagesize
- * @return pointer to a free chunk on a page, 0 if we were unable to
- * obtain a free chunk
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC auto allocChunked(AlpakaAcc const& acc, uint32 bytes) -> void*
- {
- // use the minimal allocation size to increase the hit rate for small allocations.
- uint32 const paddedMinChunkSize = T_AlignmentPolicy::applyPadding(minChunkSize);
- uint32 const minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize);
- uint32 const numpages = _numpages;
- uint32 const pagesperblock = numpages / _accessblocks;
- uint32 const reloff = warpSize<AlpakaAcc> * minAllocation / pagesize;
- uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid(acc)
- + (hashingDistWP + hashingDistWPRel * reloff) * warpid(acc))
- % pagesperblock;
- uint32 const maxchunksize = alpaka::math::min(
- acc,
- +pagesize,
- /* this clamping means that allocations of paddedMinChunkSize could have a waste exceeding the
- * wastefactor
- */
- alpaka::math::max(acc, wastefactor * bytes, paddedMinChunkSize));
-
- /* global page index
- * - different for each thread to reduce memory read/write conflicts
- * - index calculated by the hash function
- */
- uint32 const global_start_page = start_page_in_block + _firstfreeblock * pagesperblock;
-
- uint32 checklevel = regionsize * 3 / 4;
- /* Finding a free segment uses a two-step approach.
- * In both cases each thread will start on a different region and page based on the hash function
- * result; this scatters the memory accesses and reduces access conflicts. Both steps will in the worst
- * case iterate over all heap access blocks and pages.
- * - step I: search for a region that is filled less than 3/4
- * - if a free segment is found, return
- * - step II: go to any region independent of the fill level
- * - if a free segment is found, return
- */
- for(uint32 finder = 0; finder < 2; ++finder)
- {
- uint32 global_page = global_start_page;
- /* Loop over all pages until we have found a free one or arrived at global_start_page again.
- * This and the following loop are done as do-while to potentially save registers by avoiding an
- * extra loop counter variable
- */
- do
- {
- uint32 const region = global_page / regionsize;
- uint32 const regionfilllevel = _regions[region];
- uint32 const region_offset = region * regionsize;
- if(regionfilllevel < checklevel)
- {
- uint32 page_in_region = global_page;
- // loop over pages within a region
- do
- {
- // Set the chunk size to our needs. If the old chunk size is not zero we check if we
- // can still use the chunk even if memory is wasted.
- uint32 beforeChunkSize = alpaka::atomicOp<alpaka::AtomicCas>(
- acc,
- (uint32*) &_ptes[page_in_region].chunksize,
- 0u,
- minAllocation);
- // Check if the chunk size can be used even if the size is not an exact match.
- auto const isChunkSizeInRange = [&](uint32_t currentChunkSize)
- { return currentChunkSize >= bytes && currentChunkSize <= maxchunksize; };
- uint32_t useChunkSize = 0u;
- if(beforeChunkSize == 0u)
- {
- useChunkSize = minAllocation;
- }
- else if(isChunkSizeInRange(beforeChunkSize))
- {
- // someone else acquired the page, but we can also use it
- useChunkSize = beforeChunkSize;
- }
- if(useChunkSize != 0u)
- {
- void* res = tryUsePage(acc, page_in_region, useChunkSize, isChunkSizeInRange);
- if(res != nullptr)
- return res;
- }
- page_in_region = region_offset + ((page_in_region + 1) % regionsize);
- } while(page_in_region != global_page);
-
- // could not alloc in region, record that
- if(regionfilllevel + 1 <= regionsize)
- alpaka::atomicOp<alpaka::AtomicCas>(
- acc,
- (uint32*) (_regions + region),
- regionfilllevel,
- regionfilllevel + 1);
- }
- // goto next region
- global_page = (global_page + regionsize) % numpages;
- // check if we jumped into the next access block
- if(global_page % pagesperblock == 0u)
- {
- uint32 const access_block_id = global_page / pagesperblock;
- // randomize the thread writing the info
- // Data races are not critical.
- if(access_block_id > _firstfreeblock)
- _firstfreeblock = access_block_id;
- }
-
- } while(global_page != global_start_page);
-
- // we are really full :/ so let's search every page for a segment!
- checklevel = regionsize + 1;
- }
- return nullptr;
- }
-
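The scattering hash at the top of `allocChunked` above picks a different start page per multiprocessor and per warp so that concurrent allocations rarely contend on the same page. Written out as a standalone function; the constants play the role of `hashingK`/`hashingDistMP`/`hashingDistWP`/`hashingDistWPRel`, with values chosen for illustration only, not necessarily the library defaults:

```c++
#include <cstdint>
#include <iostream>

// Computes the page a warp starts its search on, mirroring allocChunked() above.
std::uint32_t startPageInBlock(
    std::uint32_t minAllocation, // padded request size in bytes
    std::uint32_t smId,          // multiprocessor the warp runs on
    std::uint32_t warpId,        // warp id within the multiprocessor
    std::uint32_t warpSize,
    std::uint32_t pageSize,
    std::uint32_t pagesPerBlock)
{
    constexpr std::uint32_t hashingK = 38183u;      // placeholder value
    constexpr std::uint32_t hashingDistMP = 17497u; // placeholder value
    constexpr std::uint32_t hashingDistWP = 1u;     // placeholder value
    constexpr std::uint32_t hashingDistWPRel = 1u;  // placeholder value
    std::uint32_t const relOff = warpSize * minAllocation / pageSize;
    return (minAllocation * hashingK + hashingDistMP * smId
            + (hashingDistWP + hashingDistWPRel * relOff) * warpId)
        % pagesPerBlock;
}

int main()
{
    // two warps on different SMs start on different pages, scattering contention
    std::cout << startPageInBlock(16, 0, 3, 32, 4096, 1024) << ' '
              << startPageInBlock(16, 1, 3, 32, 4096, 1024) << '\n';
}
```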
- /** tries to clean up the page
- *
- * The last thread reducing the page count to zero should call this method.
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC void tryCleanPage(AlpakaAcc const& acc, uint32 page)
- {
- if constexpr(resetfreedpages)
- {
- /* Workaround for nvcc because the static constexpr variable defined in the class can not be
- * passed into functions taking a constant reference.
- */
- constexpr auto pageSize = pagesize;
- /* Try to lock the PTE to clean up the meta data.
- * Only the last allocation within the PTE will successfully lock the PTE.
- * In case it is the last allocation on the page, the new pagesize will signal full and nobody else
- * is allowed to touch the meta data anymore.
- */
- auto oldfilllevel
- = alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].count, 0u, pageSize);
-
- if(oldfilllevel == 0)
- {
- uint32 const chunksize
- = alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u);
- // if chunksize == 0 then another thread has already cleaned the page
- if(chunksize != 0)
- {
- // clean meta data bits on the PTE
- _page[page].init();
-
- /** Take care that the meta data changes where we did not use atomics are propagated to all
- * other threads.
- */
- alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
- /* Remove chunk information.
- * It is important that this call happens after page init is called because scatter malloc
- * is updating the chunksize without announcing the action by increasing the page count
- * beforehand.
- */
- auto oldChunkSize = alpaka::atomicOp<alpaka::AtomicCas>(
- acc,
- (uint32*) &_ptes[page].chunksize,
- chunksize,
- 0u);
- if(oldChunkSize != chunksize)
- {
- // The chunksize can only be changed if it was zero in between. Therefore this code
- // should never be reached, or we started this method with an outdated chunksize.
- printf(
- "%u != %u, %u unexpected behaviour during deallocation\n",
- oldChunkSize,
- chunksize,
- page);
- }
- }
- /* Unlock the PTE by reducing the counter.
- * In case another allocation is at the same moment trying to allocate memory in tryUsePage(),
- * the counter can be larger than zero after this dealloc has reduced the counter; this is no
- * problem because if the chunk size in tryUsePage() does not fit, the counter is reduced and
- * the page is marked as free.
- */
- alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, pageSize);
- }
- }
- }
-
- /**
- * deallocChunked frees the chunk on the page and updates all data
- * accordingly
- * @param mem pointer to the chunk
- * @param page the page the chunk is on
- * @param chunksize the chunksize used for the page
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC void deallocChunked(AlpakaAcc const& acc, void* mem, uint32 page, uint32 chunksize)
- {
- auto const inpage_offset = static_cast<uint32>((char*) mem - _page[page].data);
- if(chunksize <= HierarchyThreshold)
- {
- // one more level in hierarchy
- uint32 const segmentsize = chunksize * 32 + sizeof(uint32);
- uint32 const fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize);
- uint32 const additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize);
- uint32 const segment = inpage_offset / (chunksize * 32);
- uint32 const withinsegment = (inpage_offset - segment * (chunksize * 32)) / chunksize;
- // mark it as free
- uint32 const nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0);
- uint32* onpagemasks = onPageMasksPosition(page, nMasks);
- /* currently unchecked:
- * uint32 old = */
- alpaka::atomicOp<alpaka::AtomicAnd>(acc, &onpagemasks[segment], ~(1u << withinsegment));
-
- // always do this, since it might fail due to a
- // race condition with addChunkHierarchy
- alpaka::atomicOp<alpaka::AtomicAnd>(acc, (uint32*) &_ptes[page].bitmask, ~(1u << segment));
- }
- else
- {
- uint32 const segment = inpage_offset / chunksize;
- alpaka::atomicOp<alpaka::AtomicAnd>(acc, (uint32*) &_ptes[page].bitmask, ~(1u << segment));
- }
-
- uint32 oldfilllevel = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, 1u);
-
- if(oldfilllevel == 1u)
- tryCleanPage(acc, page);
-
- // meta information counters ... should not be changed by too
- // many threads, so this is cheap
- if(oldfilllevel == pagesize / 2 / chunksize)
- {
- uint32 const region = page / regionsize;
- alpaka::atomicOp<alpaka::AtomicExch>(acc, (uint32*) (_regions + region), 0u);
- uint32 const pagesperblock = _numpages / _accessblocks;
- uint32 const block = page / pagesperblock;
- if(warpid(acc) + laneid() == 0)
- alpaka::atomicOp<alpaka::AtomicMin>(acc, (uint32*) &_firstfreeblock, block);
- }
- }
-
- /**
- * markpages marks a fixed number of pages as used
- * @param startpage first page to mark
- * @param pages number of pages to mark
- * @param bytes number of overall bytes to mark pages for
- * @return true on success, false if one of the pages is not free
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC auto markpages(AlpakaAcc const& acc, uint32 startpage, uint32 pages, uint32 bytes) -> bool
- {
- uint32 abord = std::numeric_limits<uint32>::max();
- for(uint32 trypage = startpage; trypage < startpage + pages; ++trypage)
- {
- uint32 const old
- = alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[trypage].chunksize, 0u, bytes);
- if(old != 0)
- {
- abord = trypage;
- break;
- }
- }
- if(abord == std::numeric_limits<uint32>::max())
- return true;
- for(uint32 trypage = startpage; trypage < abord; ++trypage)
- alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[trypage].chunksize, bytes, 0u);
- return false;
- }
-
- /**
- * allocPageBasedSingleRegion tries to allocate the demanded number
- * of bytes on a contiguous sequence of pages
- * @param startpage first page to be used
- * @param endpage last page to be used
- * @param bytes number of overall bytes to mark pages for
- * @return pointer to the first page to use, 0 if we were unable to
- * use all the requested pages
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC auto allocPageBasedSingleRegion(
- AlpakaAcc const& acc,
- uint32 startpage,
- uint32 endpage,
- uint32 bytes) -> void*
- {
- uint32 const pagestoalloc = ceilingDivision(bytes, pagesize);
- uint32 freecount = 0;
- bool left_free = false;
- for(uint32 search_page = startpage + 1; search_page > endpage;)
- {
- --search_page;
- if(_ptes[search_page].chunksize == 0)
- {
- if(++freecount == pagestoalloc)
- {
- // try filling it up
- if(markpages(acc, search_page, pagestoalloc, bytes))
- {
- // mark that we filled up everything up to here
- if(!left_free)
- alpaka::atomicOp<alpaka::AtomicCas>(
- acc,
- (uint32*) &_firstFreePageBased,
- startpage,
- search_page - 1);
- return _page[search_page].data;
- }
- }
- }
- else
- {
- left_free = true;
- freecount = 0;
- }
- }
- return 0;
- }
-
- /**
- * allocPageBasedSingle tries to allocate the demanded number of
- * bytes on a contiguous sequence of pages
- * @param bytes number of overall bytes to mark pages for
- * @return pointer to the first page to use, 0 if we were unable to
- * use all the requested pages
- * @pre only a single thread of a warp is allowed to call the
- * function concurrently
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC auto allocPageBasedSingle(AlpakaAcc const& acc, uint32 bytes) -> void*
- {
- // acquire mutex
- while(alpaka::atomicOp<alpaka::AtomicExch>(acc, &_pagebasedMutex, 1u) != 0)
- ;
- // search for a free spot from the back
- uint32 const spage = _firstFreePageBased;
- void* res = allocPageBasedSingleRegion(acc, spage, 0, bytes);
- if(res == 0)
- // also check the rest of the pages
- res = allocPageBasedSingleRegion(acc, _numpages, spage, bytes);
-
- // free mutex
- alpaka::atomicOp<alpaka::AtomicExch>(acc, &_pagebasedMutex, 0u);
- return res;
- }
-
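`markpages` above is an all-or-nothing claim: each page's chunksize word is taken from 0 to `bytes` via compare-exchange, and the already-claimed prefix is rolled back on the first conflict. A host-side sketch with `std::atomic`; the array of chunk sizes is a simplified stand-in for the PTE table:

```c++
#include <array>
#include <atomic>
#include <cstdint>

// All-or-nothing claim of `pages` consecutive pages, as in markpages() above.
template<std::size_t N>
bool markPages(std::array<std::atomic<std::uint32_t>, N>& chunkSizes,
               std::uint32_t startPage, std::uint32_t pages, std::uint32_t bytes)
{
    for(std::uint32_t p = startPage; p < startPage + pages; ++p)
    {
        std::uint32_t expected = 0u;
        if(!chunkSizes[p].compare_exchange_strong(expected, bytes))
        {
            for(std::uint32_t q = startPage; q < p; ++q)
                chunkSizes[q].store(0u); // roll back our own claims
            return false;
        }
    }
    return true;
}

int main()
{
    std::array<std::atomic<std::uint32_t>, 8> sizes{}; // all pages free
    sizes[5].store(128u);                              // ...except page 5
    bool const ok1 = markPages(sizes, 0, 4, 4096 * 4); // succeeds
    bool const ok2 = markPages(sizes, 4, 2, 4096 * 2); // fails on page 5, rolls back page 4
    return (ok1 && !ok2) ? 0 : 1;
}
```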
all the requested pages
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC auto allocPageBased(AlpakaAcc const& acc, uint32 bytes) -> void*
- {
- // this is rather slow, but we don't expect that to happen often
- // anyway
-
- // only one thread per warp can acquire the mutex
- void* res = 0;
- // based on the alpaka backend the lanemask type can be 64 bit
- auto const mask = alpaka::warp::activemask(acc);
- uint32_t const num = alpaka::popcount(acc, mask);
- // based on the alpaka backend the lanemask type can be 64 bit
- auto const lanemask = lanemask_lt(acc);
- uint32_t const local_id = alpaka::popcount(acc, lanemask & mask);
- for(unsigned int active = 0; active < num; ++active)
- if(active == local_id)
- res = allocPageBasedSingle(acc, bytes);
- return res;
- }
-
- /**
- * deallocPageBased frees the memory placed on a sequence of pages
- * @param mem pointer to the first page
- * @param page the first page
- * @param bytes the number of bytes to be freed
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC void deallocPageBased(AlpakaAcc const& acc, void* mem, uint32 page, uint32 bytes)
- {
- uint32 const pages = ceilingDivision(bytes, pagesize);
- for(uint32 p = page; p < page + pages; ++p)
- _page[p].init();
-
- alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
-
- for(uint32 p = page; p < page + pages; ++p)
- alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[p].chunksize, bytes, 0u);
- alpaka::atomicOp<alpaka::AtomicMax>(acc, (uint32*) &_firstFreePageBased, page + pages - 1);
- }
-
- public:
- /**
- * create allocates the requested number of bytes via the heap.
- * Coalescing has to be done before by another policy.
- * @param bytes number of bytes to allocate
- * @return pointer to the allocated memory
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) -> void*
- {
- if(bytes == 0)
- return 0;
- /* Take care of padding
- * bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1);
- * in the alignment policy.
- * bytes == pagesize must be handled by allocChunked(), else the maxchunksize calculation based
- * on the waste factor collides with the allocation schema in allocPageBased().
- */
- if(bytes <= pagesize)
- // chunk based
- return allocChunked(acc, bytes);
- else
- // allocate a range of pages
- return allocPageBased(acc, bytes);
- }
-
- /**
- * destroy frees the memory regions previously allocated via create
- * @param mem pointer to the memory region to free
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC void destroy(AlpakaAcc const& acc, void* mem)
- {
- if(mem == 0)
- return;
- // let's see which page we are on
- auto const page = static_cast<uint32>(((char*) mem - (char*) _page) / pagesize);
- /* Emulate an atomic read.
- * In older implementations we read the chunksize without atomics, which can result in data races.
- */
- uint32 const chunksize
- = alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u);
-
- // is the pointer the beginning of a chunk?
- auto const inpage_offset = static_cast<uint32>((char*) mem - _page[page].data);
- uint32 const block = inpage_offset / chunksize;
- uint32 const inblockoffset = inpage_offset - block * chunksize;
- if(inblockoffset != 0)
- {
- uint32* counter = (uint32*) (_page[page].data + block * chunksize);
- // coalesced mem free
-
- uint32 const old = alpaka::atomicOp<alpaka::AtomicSub>(acc, counter, 1u);
- if(old != 1)
- return;
- mem = (void*) counter;
- }
-
- if(chunksize <= pagesize)
- deallocChunked(acc, mem, page, chunksize);
- else
- deallocPageBased(acc, mem, page, chunksize);
- }
-
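The `initDeviceFunction` that follows carves one raw buffer into the three heap arrays: pages first, then one page-table entry (PTE) per page, then one fill counter per region. The same pointer math as a host-side sketch; `Page` and `PTE` are simplified stand-ins for mallocMC's real structs and the sizes are illustrative:

```c++
#include <cstddef>
#include <cstdint>
#include <vector>

constexpr std::size_t pagesize = 4096; // bytes per page (illustrative)
constexpr std::size_t regionsize = 16; // pages per region (illustrative)

struct Page { char data[pagesize]; };
struct PTE { std::uint32_t chunksize, count, bitmask; };
struct HeapLayout { Page* pages; PTE* ptes; std::uint32_t* regions; std::size_t numpages; };

HeapLayout carveHeap(void* memory, std::size_t memsize)
{
    // one region costs regionsize pages, regionsize PTEs and one uint32 counter
    std::size_t const numregions
        = memsize / (regionsize * (sizeof(Page) + sizeof(PTE)) + sizeof(std::uint32_t));
    std::size_t const numpages = numregions * regionsize;
    auto* pages = static_cast<Page*>(memory);
    auto* ptes = reinterpret_cast<PTE*>(pages + numpages);             // PTEs follow the pages
    auto* regions = reinterpret_cast<std::uint32_t*>(ptes + numpages); // counters follow the PTEs
    return {pages, ptes, regions, numpages};
}

int main()
{
    std::vector<char> pool(1 << 20); // a 1 MiB pool
    HeapLayout const heap = carveHeap(pool.data(), pool.size());
    return heap.numpages > 0 ? 0 : 1;
}
```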
- /**
- * init initializes the heap data structures.
- * The init method must be called before the heap can be used. The
- * method can be called with an arbitrary number of threads, which
- * will increase the init's efficiency
- * @param memory pointer to the memory used for the heap
- * @param memsize size of the memory in bytes
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC void initDeviceFunction(AlpakaAcc const& acc, void* memory, size_t memsize)
- {
- auto const linid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc).sum();
- auto const totalThreads = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc).prod();
-
- uint32 numregions = ((unsigned long long) memsize)
- / (((unsigned long long) regionsize) * (sizeof(PTE) + pagesize) + sizeof(uint32));
-
- uint32 numpages = numregions * regionsize;
- // pointer is copied (copy is called page)
- Page* page = (Page*) memory;
-
- // We have to calculate these values here, before using them for other things.
- // First calculate how many blocks of the given size fit our memory pages in principle.
- // However, we do not have to use the exact requested block size.
- // So we redistribute actual memory between the chosen number of blocks
- // and ensure that all blocks have the same number of regions.
- auto const memorysize = static_cast<size_t>(numpages) * pagesize;
- auto const numblocks = memorysize / accessblocksize;
- auto const memoryperblock = memorysize / numblocks;
- auto const pagesperblock = memoryperblock / pagesize;
- auto const regionsperblock = pagesperblock / regionsize;
- numregions = numblocks * regionsperblock;
- numpages = numregions * regionsize;
-
- PTE* ptes = (PTE*) (page + numpages);
- uint32* regions = (uint32*) (ptes + numpages);
- // sanity check for the memory size
- // this check refers to the original memory pointer, which was
- // not adjusted!
- if((char*) (regions + numregions) > (((char*) memory) + memsize))
- {
- --numregions;
- numpages = alpaka::math::min(acc, numregions * regionsize, numpages);
- if(linid == 0)
- printf("c Heap Warning: needed to reduce number of "
- "regions to stay within memory limit\n");
- }
- // Recalculate since numpages could have changed
- ptes = (PTE*) (page + numpages);
- regions = (uint32*) (ptes + numpages);
-
- for(uint32 i = linid; i < numpages; i += totalThreads)
- {
- ptes[i].init();
- page[i].init();
- }
- for(uint32 i = linid; i < numregions; i += totalThreads)
- regions[i] = 0;
-
- if(linid == 0)
- {
- _memsize = memsize;
- _numpages = numpages;
- _accessblocks = numblocks;
- _ptes = (volatile PTE*) ptes;
- _page = page;
- _regions = regions;
- _firstfreeblock = 0;
- _pagebasedMutex = 0;
- _firstFreePageBased = numpages - 1;
-
- if((char*) &_page[numpages] > (char*) memory + memsize)
- printf("error in heap alloc: numpages too high\n");
- }
- }
-
- static ALPAKA_FN_ACC auto isOOM(void* p, size_t s) -> bool
- {
- // one thread that requested memory got null back
- return s && (p == nullptr);
- }
-
- template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
- static void initHeap(
- AlpakaDevice& dev,
- AlpakaQueue& queue,
- T_DeviceAllocator* heap,
- void* pool,
- size_t memsize)
- {
- if(pool == nullptr && memsize != 0)
- {
- throw std::invalid_argument("Scatter policy cannot use nullptr for non-empty "
- "memory pools.
" - "Maybe you are using an incompatible ReservePoolPolicy " - "or AlignmentPolicy."); - } - auto initKernel = [] ALPAKA_FN_ACC( - AlpakaAcc const& m_acc, - T_DeviceAllocator* m_heap, - void* m_heapmem, - size_t m_memsize) { m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize); }; - using Dim = typename alpaka::trait::DimType::type; - using Idx = typename alpaka::trait::IdxType::type; - using VecType = alpaka::Vec; - - auto threadsPerBlock = VecType::ones(); - - auto const devProps = alpaka::getAccDevProps(dev); - - threadsPerBlock[Dim::value - 1] - = std::min(static_cast(256u), static_cast(devProps.m_blockThreadCountMax)); - - auto const workDiv = alpaka::WorkDivMembers{ - VecType::ones(), - threadsPerBlock, - VecType::ones()}; // Dim may be any dimension, but workDiv is 1D - alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, initKernel, heap, pool, memsize)); - } - - /** counts how many elements of a size fit inside a given page - * - * Examines a (potentially already used) page to find how many - * elements of size chunksize still fit on the page. This includes - * hierarchically organized pages and empty pages. The algorithm - * determines the number of chunks in the page in a manner similar - * to the allocation algorithm of CreationPolicies::Scatter. - * - * @param page the number of the page to examine. The page needs to - * be formatted with a chunksize and potentially a hierarchy. - * @param chunksize the size of element that should be placed inside - * the page. This size must be appropriate to the formatting of the - * page. - */ - template - ALPAKA_FN_ACC auto countFreeChunksInPage(AlpakaAcc const& acc, uint32 page, uint32 chunksize) -> unsigned - { - uint32 const filledChunks = _ptes[page].count; - if(chunksize <= HierarchyThreshold) - { - uint32 const segmentsize = chunksize * 32 + sizeof(uint32); // each segment can hold 32 - // 2nd-level chunks - uint32 const fullsegments = alpaka::math::min( - acc, - 32u, - pagesize / segmentsize); // there might be space for - // more than 32 segments - // with 32 2nd-level chunks - uint32 const additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize); - uint32 const level2Chunks = fullsegments * 32 + additional_chunks; - return level2Chunks - filledChunks; - } - else - { - uint32 const chunksinpage = alpaka::math::min( - acc, - pagesize / chunksize, - 32u); // without hierarchy, there can not be more than - // 32 chunks - return chunksinpage - filledChunks; - } - } - - /** counts the number of available slots inside the heap - * - * Searches the heap for all possible locations of an element with - * size slotSize. The used traversal algorithms are similar to the - * allocation strategy of CreationPolicies::Scatter, to ensure - * comparable results. There are 3 different algorithms, based on - * the size of the requested slot: 1 slot spans over multiple pages, - * 1 slot fits in one chunk within a page, 1 slot fits in a fraction - * of a chunk. - * - * @param slotSize the amount of bytes that a single slot accounts - * for - * @param gid the id of the thread. 
this id does not have to
- * correspond with threadId.x, but there must be a continuous range
- * @param stride the stride should be equal to the number of
- * different gids (and therefore of value max(gid)+1)
- */
- template<typename AlpakaAcc>
- ALPAKA_FN_ACC auto getAvailaibleSlotsDeviceFunction(
- AlpakaAcc const& acc,
- size_t slotSize,
- uint32 gid,
- uint32 stride) -> unsigned
- {
- unsigned slotcount = 0;
- if(slotSize < pagesize)
- { // multiple slots per page
- for(uint32 currentpage = gid; currentpage < _numpages; currentpage += stride)
- {
- uint32 const maxchunksize = alpaka::math::min(acc, +pagesize, wastefactor * (uint32) slotSize);
-
- uint32 chunksize = _ptes[currentpage].chunksize;
- if(chunksize >= slotSize && chunksize <= maxchunksize)
- { // how many chunks are left? (each chunk is big enough)
- slotcount += countFreeChunksInPage(acc, currentpage, chunksize);
- }
- else if(chunksize == 0)
- {
- chunksize = alpaka::math::max(
- acc,
- (uint32) slotSize,
- T_AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size
- slotcount += countFreeChunksInPage(
- acc,
- currentpage,
- chunksize); // how many chunks fit in one page?
- }
- else
- {
- continue; // the chunks on this page are too small
- // for the request :(
- }
- }
- }
- else
- { // 1 slot needs multiple pages
- if(gid > 0)
- return 0; // do this serially
- uint32 const pagestoalloc = ceilingDivision((uint32) slotSize, pagesize);
- uint32 freecount = 0;
- for(uint32 currentpage = _numpages; currentpage > 0;)
- { // this already includes all superblocks
- --currentpage;
- if(_ptes[currentpage].chunksize == 0)
- {
- if(++freecount == pagestoalloc)
- {
- freecount = 0;
- ++slotcount;
- }
- }
- else
- { // the sequence of free pages was interrupted
- freecount = 0;
- }
- }
- }
- return slotcount;
- }
-
- /** Count how many elements can be allocated at maximum
- *
- * Takes an input size and determines how many elements of this
- * size can be allocated with the CreationPolicy Scatter. This will
- * return the maximum number of free slots of the indicated size. It
- * is not guaranteed where these slots are (regarding
- * fragmentation). Therefore, the practically usable number of slots
- * might be smaller. This function is executed in parallel. Speedup
- * can possibly be increased by a higher number of parallel workers.
- * - * @param slotSize the size of allocatable elements to count - * @param obj a reference to the allocator instance (host-side) - */ - - public: - template - static auto getAvailableSlotsHost( - AlpakaDevice& dev, - AlpakaQueue& queue, - size_t const slotSize, - T_DeviceAllocator* heap) -> unsigned - { - auto d_slots = alpaka::allocBuf(dev, 1); - alpaka::memset(queue, d_slots, 0, 1); - - auto getAvailableSlotsKernel = [] ALPAKA_FN_ACC( - AlpakaAcc const& acc, - T_DeviceAllocator* heapPtr, - size_t numBytes, - unsigned* slots) -> void - { - auto const gid = alpaka::getIdx(acc).sum(); - - auto const nWorker = alpaka::getWorkDiv(acc).prod(); - unsigned const temp - = heapPtr->template getAvailaibleSlotsDeviceFunction(acc, numBytes, gid, nWorker); - if(temp) - alpaka::atomicOp(acc, slots, temp); - }; - - using Dim = typename alpaka::trait::DimType::type; - using Idx = typename alpaka::trait::IdxType::type; - - using VecType = alpaka::Vec; - - auto numBlocks = VecType::ones(); - numBlocks[Dim::value - 1] = 64u; - auto threadsPerBlock = VecType::ones(); - - auto const devProps = alpaka::getAccDevProps(dev); - - threadsPerBlock[Dim::value - 1] - = std::min(static_cast(256u), static_cast(devProps.m_blockThreadCountMax)); - - auto const workDiv = alpaka::WorkDivMembers{ - numBlocks, - threadsPerBlock, - VecType::ones()}; // Dim may be any dimension, but workDiv is 1D - - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv, - getAvailableSlotsKernel, - heap, - slotSize, - alpaka::getPtrNative(d_slots))); - - auto const platform = alpaka::Platform{}; - auto const hostDev = alpaka::getDevByIdx(platform, 0); - - auto h_slots = alpaka::allocBuf(hostDev, 1); - alpaka::memcpy(queue, h_slots, d_slots, 1); - alpaka::wait(queue); - - return *alpaka::getPtrNative(h_slots); - } - - /** Count, how many elements can be allocated at maximum - * - * Takes an input size and determines, how many elements of this - * size can be allocated with the CreationPolicy Scatter. This will - * return the maximum number of free slots of the indicated size. It - * is not guaranteed where these slots are (regarding - * fragmentation). Therefore, the practically usable number of slots - * might be smaller. This function is executed separately for each - * warp and does not cooperate with other warps. Maximum speed is - * expected if every thread in the warp executes the function. Uses - * 256 byte of shared memory. 
- * - * @param slotSize the size of allocatable elements to count - */ - template - ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(AlpakaAcc const& acc, size_t slotSize) -> unsigned - { - int const wId = warpid_withinblock(acc); // do not use warpid-function, since - // this value is not guaranteed to - // be stable across warp lifetime - - uint32 const activeThreads = alpaka::popcount(acc, alpaka::warp::activemask(acc)); - - constexpr auto warpsize = warpSize; - auto& activePerWarp = alpaka::declareSharedVar< - std::uint32_t[maxThreadsPerBlock / warpsize], - __COUNTER__>(acc); // maximum number of warps in a block - - auto& warpResults - = alpaka::declareSharedVar], __COUNTER__>(acc); - - warpResults[wId] = 0; - activePerWarp[wId] = 0; - - // wait that all shared memory is initialized - alpaka::syncBlockThreads(acc); - - // the active threads obtain an id from 0 to activeThreads-1 - if(slotSize == 0) - return 0; - auto const linearId = alpaka::atomicOp(acc, &activePerWarp[wId], 1u); - - // printf("Block %d, id %d: activeThreads=%d - // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); - unsigned const temp - = this->getAvailaibleSlotsDeviceFunction(acc, slotSize, linearId, activeThreads); - if(temp) - alpaka::atomicOp(acc, &warpResults[wId], temp); - - alpaka::syncBlockThreads(acc); - alpaka::mem_fence(acc, alpaka::memory_scope::Block{}); - - return warpResults[wId]; - } - - static auto classname() -> std::string - { - std::stringstream ss; - ss << "Scatter["; - ss << "pagesize=" << pagesize << ","; - ss << "accessblocksize=" << accessblocksize << ","; - ss << "regionsize=" << regionsize << ","; - ss << "wastefactor=" << wastefactor << ","; - ss << "resetfreedpages=" << resetfreedpages << ","; - ss << "minChunkSize=" << minChunkSize << ","; - ss << "HierarchyThreshold=" << HierarchyThreshold << ","; - ss << "hashingK=" << hashingK << ","; - ss << "hashingDistMP=" << hashingDistMP << ","; - ss << "hashingDistWP=" << hashingDistWP << ","; - ss << "hashingDistWPRel=" << hashingDistWPRel << "]"; - return ss.str(); - } - }; - - template - struct Scatter - { - template - using AlignmentAwarePolicy = ScatterImpl; - }; - - } // namespace CreationPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp b/thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp deleted file mode 100644 index 0f6fe090d0..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 - 2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian J. Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
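The `Scatter` wrapper defined at the end of Scatter.hpp above exists so the host class can inject its `AlignmentPolicy` into the creation policy after the fact. The same indirection in miniature, with illustrative types that are not mallocMC's real interfaces:

```c++
#include <iostream>

template<typename T_Config, typename T_AlignmentPolicy>
struct ScatterImplDemo
{
    static void describe() { std::cout << "Scatter + injected alignment policy\n"; }
};

template<typename T_Config>
struct ScatterDemo
{
    // the allocator instantiates CreationPolicy::template AlignmentAwarePolicy<Alignment>
    template<typename T_AlignmentPolicy>
    using AlignmentAwarePolicy = ScatterImplDemo<T_Config, T_AlignmentPolicy>;
};

struct MyConfig {};
struct MyAlignment {};

int main()
{
    ScatterDemo<MyConfig>::AlignmentAwarePolicy<MyAlignment>::describe();
}
```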
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "mallocMC_traits.hpp" - -#include - -#include -#include - -namespace mallocMC -{ - /** - * @brief "HostClass" that combines all policies to a useful allocator - * - * This class implements the necessary glue-logic to form an actual - * allocator from the provided policies. It implements the public interface - * and executes some constraint checking based on an instance of the class - * PolicyConstraints. - * - * @tparam T_CreationPolicy The desired type of a CreationPolicy - * @tparam T_DistributionPolicy The desired type of a DistributionPolicy - * @tparam T_OOMPolicy The desired type of a OOMPolicy - * @tparam T_ReservePoolPolicy The desired type of a ReservePoolPolicy - * @tparam T_AlignmentPolicy The desired type of a AlignmentPolicy - */ - template< - typename T_CreationPolicy, - typename T_DistributionPolicy, - typename T_OOMPolicy, - typename T_AlignmentPolicy> - class DeviceAllocator : public T_CreationPolicy::template AlignmentAwarePolicy - { - using uint32 = std::uint32_t; - - public: - using CreationPolicy = T_CreationPolicy; - using DistributionPolicy = T_DistributionPolicy; - using OOMPolicy = T_OOMPolicy; - using AlignmentPolicy = T_AlignmentPolicy; - - template - ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t bytes) -> void* - { - if(bytes == 0U) - { - return nullptr; - } - bytes = AlignmentPolicy::applyPadding(bytes); - DistributionPolicy distributionPolicy(acc); - uint32 const req_size = distributionPolicy.collect(acc, bytes); - void* memBlock = CreationPolicy::template AlignmentAwarePolicy::create(acc, req_size); - if(CreationPolicy::isOOM(memBlock, req_size)) - { - memBlock = OOMPolicy::handleOOM(memBlock); - } - return distributionPolicy.distribute(acc, memBlock); - } - - template - ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* pointer) - { - if(pointer != nullptr) - { - CreationPolicy::template AlignmentAwarePolicy::destroy(acc, pointer); - } - } - - /** Provide the number of available free slots. - * - * @tparam AlpakaAcc The type of the Allocator to be used - * @param acc alpaka accelerator - * @param slotSize assumed allocation size in bytes - * @return number of free slots of the given size, if creation policy is not providing the information on the - * device side 0 will be returned. - */ - template - ALPAKA_FN_ACC auto getAvailableSlots(AlpakaAcc const& acc, size_t slotSize) -> unsigned - { - slotSize = AlignmentPolicy::applyPadding(slotSize); - if constexpr(Traits::providesAvailableSlots) - { - return CreationPolicy::template AlignmentAwarePolicy::getAvailableSlotsAccelerator( - acc, - slotSize); - } - else - { - return 0U; - } - } - }; - -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp deleted file mode 100644 index 98b2968e4d..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. 
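`DeviceAllocator::malloc` above is pure glue over the four policies. The pipeline restated as a host-side sketch over hypothetical stand-in policies; none of these are mallocMC's real signatures:

```c++
#include <cstddef>
#include <cstdlib>

struct AlignTo16 { static std::size_t applyPadding(std::size_t n) { return (n + 15) & ~std::size_t{15}; } };
struct HostCreate
{
    static void* create(std::size_t n) { return std::malloc(n); }
    static bool isOOM(void* p, std::size_t n) { return n != 0 && p == nullptr; }
};
struct NoopDist // identity "distribution": no coalescing of thread requests
{
    std::size_t collect(std::size_t n) const { return n; }
    void* distribute(void* p) const { return p; }
};
struct ReturnNullOOM { static void* handleOOM(void* p) { return p; } };

template<typename Creation, typename Distribution, typename OOM, typename Alignment>
void* policyMalloc(Distribution const& dist, std::size_t bytes)
{
    if(bytes == 0) return nullptr;
    bytes = Alignment::applyPadding(bytes);      // alignment policy
    std::size_t const req = dist.collect(bytes); // distribution policy: combine requests
    void* block = Creation::create(req);         // creation policy: grab memory
    if(Creation::isOOM(block, req))
        block = OOM::handleOOM(block);           // OOM policy: throw, return null, ...
    return dist.distribute(block);               // distribution policy: hand out offsets
}

int main()
{
    void* p = policyMalloc<HostCreate, NoopDist, ReturnNullOOM, AlignTo16>(NoopDist{}, 100);
    std::free(p);
}
```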
- - Copyright 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "Noop.hpp" - -#include - -#include -#include - -namespace mallocMC -{ - namespace DistributionPolicies - { - /** - * @brief a policy that does nothing - * - * This DistributionPolicy will not perform any distribution, but only - * return its input (identity function) - */ - class Noop - { - using uint32 = std::uint32_t; - - public: - template - ALPAKA_FN_ACC Noop(AlpakaAcc const& /*acc*/) - { - } - - template - ALPAKA_FN_ACC auto collect(AlpakaAcc const& /*acc*/, uint32 bytes) const -> uint32 - { - return bytes; - } - - template - ALPAKA_FN_ACC auto distribute(AlpakaAcc const& /*acc*/, void* allocatedMem) const -> void* - { - return allocatedMem; - } - - static auto classname() -> std::string - { - return "Noop"; - } - }; - - } // namespace DistributionPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp deleted file mode 100644 index cdcc822c8f..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp +++ /dev/null @@ -1,194 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - http://www.icg.tugraz.at/project/mvp - - Copyright (C) 2012 Institute for Computer Graphics and Vision, - Graz University of Technology - Copyright (C) 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at - Rene Widera - r.widera ( at ) hzdr.de - Axel Huebl - a.huebl ( at ) hzdr.de - Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "../mallocMC_utils.hpp" -#include "XMallocSIMD.hpp" - -#include -#include - -#include -#include -#include -#include - -namespace mallocMC -{ - namespace DistributionPolicies - { - namespace XMallocSIMDConf - { - struct DefaultXMallocConfig - { - static constexpr auto pagesize = 4096; - }; - } // namespace XMallocSIMDConf - - /** - * @brief SIMD optimized chunk resizing in the style of XMalloc - * - * This DistributionPolicy can take the memory requests from a group of - * worker threads and combine them, so that only one of the workers will - * allocate the whole request. Later, each worker gets an appropriate - * offset into the allocated chunk. This is beneficial for SIMD - * architectures since only one of the workers has to compete for the - * resource. This algorithm is inspired by the XMalloc memory allocator - * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1) - * and its implementation in ScatterAlloc - * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604) - * XMallocSIMD is inteded to be used with Nvidia CUDA capable - * accelerators that support at least compute capability 2.0 - * - * @tparam T_Config (optional) The configuration struct to overwrite - * default configuration. The default can be obtained through - * XMallocSIMD<>::Properties - */ - template - class XMallocSIMD - { - private: - using uint32 = std::uint32_t; - bool can_use_coalescing; - uint32 warpid; - uint32 myoffset; - uint32 threadcount; - uint32 req_size; - - public: - using Properties = T_Config; - - template - ALPAKA_FN_ACC XMallocSIMD(AlpakaAcc const& acc) - : can_use_coalescing(false) - , warpid(warpid_withinblock(acc)) - , myoffset(0) - , threadcount(0) - , req_size(0) - { - } - - private: -/** Allow for a hierarchical validation of parameters: - * - * shipped default-parameters (in the inherited struct) have lowest precedence. - * They will be overridden by a given configuration struct. However, even the - * given configuration struct can be overridden by compile-time command line - * parameters (e.g. 
-D MALLOCMC_DP_XMALLOCSIMD_PAGESIZE 1024) - * - * default-struct < template-struct < command-line parameter - */ -#ifndef MALLOCMC_DP_XMALLOCSIMD_PAGESIZE -# define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE (Properties::pagesize) -#endif - static constexpr uint32 pagesize = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE; - - public: - static constexpr uint32 _pagesize = pagesize; - - template - ALPAKA_FN_ACC auto collect(AlpakaAcc const& acc, uint32 bytes) -> uint32 - { - can_use_coalescing = false; - myoffset = 0; - threadcount = 0; - - // init with initial counter - auto& warp_sizecounter - = alpaka::declareSharedVar()], __COUNTER__>( - acc); - warp_sizecounter[warpid] = 16; - - // second half: make sure that all coalesced allocations can fit - // within one page necessary for offset calculation - bool const coalescible = bytes > 0 && bytes < (pagesize / 32); - -#if(MALLOCMC_DEVICE_COMPILE) - threadcount = alpaka::popcount(alpaka::warp::ballot(acc, coalescible)); -#else - threadcount = 1; // TODO -#endif - if(coalescible && threadcount > 1) - { - myoffset = alpaka::atomicOp(acc, &warp_sizecounter[warpid], bytes); - can_use_coalescing = true; - } - - req_size = bytes; - if(can_use_coalescing) - req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0; - - return req_size; - } - - template - ALPAKA_FN_ACC auto distribute(AlpakaAcc const& acc, void* allocatedMem) -> void* - { - auto& warp_res - = alpaka::declareSharedVar()], __COUNTER__>(acc); - - char* myalloc = (char*) allocatedMem; - if(req_size && can_use_coalescing) - { - warp_res[warpid] = myalloc; - if(myalloc != 0) - *(uint32*) myalloc = threadcount; - } - - threadfenceBlock(acc); - - void* myres = myalloc; - if(can_use_coalescing) - { - if(warp_res[warpid] != 0) - myres = warp_res[warpid] + myoffset; - else - myres = 0; - } - return myres; - } - - ALPAKA_FN_HOST - static auto classname() -> std::string - { - std::stringstream ss; - ss << "XMallocSIMD[" << pagesize << "]"; - return ss.str(); - } - }; - - } // namespace DistributionPolicies - -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp deleted file mode 100644 index de96b7e00c..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - http://www.icg.tugraz.at/project/mvp - https://www.hzdr.de/crp - - Copyright (C) 2012 Institute for Computer Graphics and Vision, - Graz University of Technology - Copyright (C) 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at - Bernhard Kainz - kainz ( at ) icg.tugraz.at - Michael Kenzel - kenzel ( at ) icg.tugraz.at - Rene Widera - r.widera ( at ) hzdr.de - Axel Huebl - a.huebl ( at ) hzdr.de - Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
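The idea behind XMallocSIMD's `collect()`/`distribute()` pair, simulated for one "warp" on the host: every lane reserves its offset in a shared size counter, one designated lane allocates the combined block, and the others add their offsets. Offsets start at 16 bytes because the first word stores the thread count consulted when freeing. All names and values are illustrative:

```c++
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <vector>

int main()
{
    constexpr std::uint32_t lanes = 4;
    std::vector<std::uint32_t> const request = {24, 8, 16, 40};
    std::atomic<std::uint32_t> warpCounter{16}; // first 16 bytes: header for the free-counter
    std::vector<std::uint32_t> offset(lanes);
    for(std::uint32_t lane = 0; lane < lanes; ++lane)
        offset[lane] = warpCounter.fetch_add(request[lane]); // "collect": reserve an offset
    char* block = static_cast<char*>(std::malloc(warpCounter.load())); // leader allocates once
    *reinterpret_cast<std::uint32_t*>(block) = lanes; // free() later decrements this count
    std::vector<void*> result(lanes);
    for(std::uint32_t lane = 0; lane < lanes; ++lane)
        result[lane] = block + offset[lane]; // "distribute": everyone adds its offset
    bool const ok = static_cast<char*>(result[0]) == block + 16;
    std::free(block);
    return ok ? 0 : 1;
}
```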
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -// generic stuff -#include "version.hpp" - -// core functionality -#include "mallocMC_hostclass.hpp" - -// all the policies -#include "alignmentPolicies/Noop.hpp" -#include "alignmentPolicies/Shrink.hpp" -#include "creationPolicies/FlatterScatter.hpp" -#include "creationPolicies/OldMalloc.hpp" -#include "creationPolicies/Scatter.hpp" -#include "distributionPolicies/Noop.hpp" -#include "distributionPolicies/XMallocSIMD.hpp" -#include "oOMPolicies/BadAllocException.hpp" -#include "oOMPolicies/ReturnNull.hpp" -#include "reservePoolPolicies/AlpakaBuf.hpp" -#include "reservePoolPolicies/CudaSetLimits.hpp" diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp deleted file mode 100644 index 1da222fa4d..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 - 2015 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#pragma once - -#include - -namespace mallocMC -{ - template - struct AllocatorHandleImpl - { - using DevAllocator = typename T_HostAllocator::DevAllocator; - - DevAllocator* devAllocator; - - explicit AllocatorHandleImpl(DevAllocator* p) : devAllocator(p) - { - } - - template - ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t size) -> void* - { - return devAllocator->malloc(acc, size); - } - - template - ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* p) - { - devAllocator->free(acc, p); - } - - template - ALPAKA_FN_ACC auto getAvailableSlots(AlpakaAcc const& acc, size_t slotSize) -> unsigned - { - return devAllocator->getAvailableSlots(acc, slotSize); - } - }; - -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp deleted file mode 100644 index 0fb2099dfe..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "creationPolicies/Scatter.hpp" -#include "distributionPolicies/XMallocSIMD.hpp" - -namespace mallocMC -{ - /** The default PolicyCheckers (do always succeed) - */ - template - class PolicyCheck1 - { - }; - - template - class PolicyCheck2 - { - }; - - template - class PolicyCheck3 - { - }; - - template - class PolicyCheck4 - { - }; - - template - class PolicyCheck5 - { - }; - - /** Enforces constraints on policies or combinations of polices - * - * Uses template specialization of PolicyChecker - */ - template< - typename T_CreationPolicy, - typename T_DistributionPolicy, - typename T_OOMPolicy, - typename T_GetHeapPolicy, - typename T_AlignmentPolicy> - - class PolicyConstraints : PolicyCheck2 - { - }; - - /** Scatter and XMallocSIMD need the same pagesize! - * - * This constraint ensures that if the CreationPolicy "Scatter" and the - * DistributionPolicy "XMallocSIMD" are selected, they are configured to use - * the same value for their "pagesize"-parameter. 
- */ - template - class PolicyCheck2, typename DistributionPolicies::XMallocSIMD> - { - static_assert(x::pagesize == z::pagesize, "Pagesize must be the same when combining Scatter and XMallocSIMD"); - }; - -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp deleted file mode 100644 index 48bc1f748b..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 - 2015 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "allocator.hpp" -#include "device_allocator.hpp" -#include "mallocMC_traits.hpp" diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp deleted file mode 100644 index 091687e149..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 - 2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
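The `PolicyCheck2` specialization above shows the constraint-checking pattern: the primary template accepts any policy combination, and a partial specialization for one concrete pairing carries the `static_assert`. The same pattern in miniature, with illustrative stand-in types:

```c++
#include <cstdint>

template<std::uint32_t T_pagesize> struct CreationDemo { static constexpr auto pagesize = T_pagesize; };
template<std::uint32_t T_pagesize> struct DistributionDemo { static constexpr auto pagesize = T_pagesize; };

template<typename T_Creation, typename T_Distribution>
struct PagesizeCheck {}; // default: no constraint to enforce

template<std::uint32_t A, std::uint32_t B>
struct PagesizeCheck<CreationDemo<A>, DistributionDemo<B>>
{
    static_assert(A == B, "Pagesize must be the same when combining these policies");
};

int main()
{
    PagesizeCheck<CreationDemo<4096>, DistributionDemo<4096>> ok{}; // compiles
    // PagesizeCheck<CreationDemo<4096>, DistributionDemo<2048>> bad{}; // would fire the static_assert
    (void) ok;
}
```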
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -namespace mallocMC -{ - template - struct Traits - { - static constexpr bool providesAvailableSlots = T_Allocator::CreationPolicy::providesAvailableSlots; - }; -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp deleted file mode 100644 index ad43eb49eb..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - http://www.icg.tugraz.at/project/mvp - https://www.hzdr.de/crp - - Copyright (C) 2012 Institute for Computer Graphics and Vision, - Graz University of Technology - Copyright (C) 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at - Michael Kenzel - kenzel ( at ) icg.tugraz.at - Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include -#include - -#include - -#ifdef _MSC_VER -# include -#endif - -#include -#include - -/* HIP-clang is doing something wrong and uses the host path of the code when __HIP_DEVICE_COMPILE__ - * only is used to detect the device compile path. - * Since we require devices with support for ballot we can high-jack __HIP_ARCH_HAS_WARP_BALLOT__. 
- */ -#if(defined(__HIP_ARCH_HAS_WARP_BALLOT__) || defined(__CUDA_ARCH__) || __HIP_DEVICE_COMPILE__ == 1) -# define MALLOCMC_DEVICE_COMPILE 1 -#endif - -namespace mallocMC -{ - - template - constexpr uint32_t warpSize = 1U; - -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - template - constexpr uint32_t warpSize> = 32U; -#endif - -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED -# if(HIP_VERSION_MAJOR >= 4) - template - constexpr uint32_t warpSize> = __AMDGCN_WAVEFRONT_SIZE; -# else - template - constexpr uint32_t warpSize> = 64; -# endif -#endif - - ALPAKA_FN_ACC inline auto laneid() - { -#if defined(__CUDA_ARCH__) - std::uint32_t mylaneid; - asm("mov.u32 %0, %%laneid;" : "=r"(mylaneid)); - return mylaneid; -#elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__) - return __lane_id(); -#else - return 0U; -#endif - } - - /** warp index within a multiprocessor - * - * Index of the warp within the multiprocessor at the moment of the query. - * The result is volatile and can be different with each query. - * - * @return current index of the warp - */ - template - ALPAKA_FN_ACC inline auto warpid(TAcc const& /*acc*/) -> uint32_t - { - return 0U; - } - -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - template - // ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is - // specific to the device and cannot be compiled on the host. So, we need an explicit `__device__` here.` - inline __device__ auto warpid(alpaka::AccGpuCudaRt const& /*acc*/) -> uint32_t - { - std::uint32_t mywarpid = 0; - asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid)); - return mywarpid; - } -#endif - -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - template - ALPAKA_FN_ACC inline auto warpid(alpaka::AccGpuHipRt const& /*acc*/) -> uint32_t - { - // get wave id - // https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024 - return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4)); - } -#endif - - template - ALPAKA_FN_ACC inline auto smid(TAcc const& /*acc*/) -> uint32_t - { - return 0U; - } - -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - template - ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuCudaRt const& /*acc*/) -> uint32_t - { - std::uint32_t mysmid = 0; - asm("mov.u32 %0, %%smid;" : "=r"(mysmid)); - return mysmid; - } -#endif - -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - template - ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuHipRt const& /*acc*/) -> uint32_t - { - return __smid(); - } -#endif - - template - ALPAKA_FN_ACC inline auto lanemask_lt(TAcc const& /*acc*/) - { - return 0U; - } -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - template - ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuCudaRt const& /*acc*/) - { - std::uint32_t lanemask; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask)); - return lanemask; - } -#endif - -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - template - ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuHipRt const& /*acc*/) - { - return __lanemask_lt(); - } -#endif - - - /** the maximal number threads per block, valid for sm_2.X - sm_7.5 - * - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities - */ - constexpr uint32_t maxThreadsPerBlock = 1024U; - - /** warp id within a cuda block - * - * The id is constant over the lifetime of the thread. - * The id is not equal to warpid(). 
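- * [Editor's note] (warpid() above reads a volatile hardware register, whereas this id is derived from the block-local thread index and therefore stays fixed over the thread's lifetime.)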
- * - * @return warp id within the block - */ - template<typename AlpakaAcc> - ALPAKA_FN_ACC inline auto warpid_withinblock(AlpakaAcc const& acc) -> std::uint32_t - { - auto const localId = alpaka::mapIdx<1>( - alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc), - alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc))[0]; - return localId / warpSize<AlpakaAcc>; - } - - template<typename T, typename U, typename = std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>> - ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto ceilingDivision(T const numerator, U const denominator) -> T - { - return (numerator + (denominator - 1)) / denominator; - } - - template<typename T_size> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto indexOf( - void const* const pointer, - void const* const start, - T_size const stepSize) -> std::make_signed_t<T_size> - { - return std::distance(reinterpret_cast<char const*>(start), reinterpret_cast<char const*>(pointer)) / stepSize; - } - - template<typename TAcc, typename T> - ALPAKA_FN_INLINE ALPAKA_FN_ACC auto atomicLoad(TAcc const& acc, T& target) - { - return alpaka::atomicCas(acc, &target, static_cast<T>(0U), static_cast<T>(0U)); - } -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp deleted file mode 100644 index 7d7dfcad3a..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "BadAllocException.hpp" - -#include - -#include -#include - -namespace mallocMC -{ - namespace OOMPolicies - { - /** - * @brief Throws a std::bad_alloc exception on OutOfMemory - * - * This OOMPolicy will throw a std::bad_alloc exception, if the - * accelerator supports it. Currently, Nvidia CUDA does not support any - * form of exception handling, therefore handleOOM() does not have any - * effect on these accelerators. Using this policy on other types of - * accelerators that do not support exceptions results in undefined - * behaviour.
- */ - struct BadAllocException - { - ALPAKA_FN_ACC - static auto handleOOM(void* mem) -> void* - { -#if BOOST_LANG_CUDA || BOOST_COMP_HIP -// #if __CUDA_ARCH__ < 350 -# define PM_EXCEPTIONS_NOT_SUPPORTED_HERE -// #endif -#endif - -#ifdef PM_EXCEPTIONS_NOT_SUPPORTED_HERE -# undef PM_EXCEPTIONS_NOT_SUPPORTED_HERE - assert(false); -#else - throw std::bad_alloc{}; -#endif - return mem; - } - - static auto classname() -> std::string - { - return "BadAllocException"; - } - }; - - } // namespace OOMPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp deleted file mode 100644 index dbea98e703..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include "ReturnNull.hpp" - -#include - -#include - -namespace mallocMC -{ - namespace OOMPolicies - { - /** - * @brief Returns a nullptr pointer on OutOfMemory conditions - * - * This OOMPolicy will return nullptr, if handleOOM() is called. - */ - class ReturnNull - { - public: - ALPAKA_FN_ACC - static auto handleOOM(void* mem) -> void* - { - return nullptr; - } - - static auto classname() -> std::string - { - return "ReturnNull"; - } - }; - - } // namespace OOMPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp deleted file mode 100644 index 4426b3c19e..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2020-2024 Helmholtz-Zentrum Dresden - Rossendorf, - CERN - - Author(s): Bernhard Manfred Gruber - Julian J. 
Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#pragma once - -#include - -#include -#include - -namespace mallocMC -{ - namespace ReservePoolPolicies - { - template - struct AlpakaBuf - { - template - auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void* - { - poolBuffer = std::make_unique(alpaka::allocBuf(dev, memsize)); - return alpaka::getPtrNative(*poolBuffer); - } - - void resetMemPool() - { - poolBuffer = {}; - } - - static auto classname() -> std::string - { - return "AlpakaBuf"; - } - - private: - using PoolBufferType = alpaka::Buf, unsigned char, alpaka::DimInt<1>, size_t>; - std::unique_ptr poolBuffer; // FIXME(bgruber): replace by std::optional<> - }; - } // namespace ReservePoolPolicies -} // namespace mallocMC diff --git a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp deleted file mode 100644 index 99bf4b86b4..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#pragma once - -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - -# include "CudaSetLimits.hpp" - -# include - -# include -# include - -namespace mallocMC -{ - namespace ReservePoolPolicies - { - /** - * @brief set CUDA internal heap for device-side malloc calls - * - * This ReservePoolPolicy is intended for use with CUDA capable - * accelerators that support at least compute capability 2.0. It should - * be used in conjunction with a CreationPolicy that actually requires - * the CUDA-internal heap to be sized by calls to cudaDeviceSetLimit(). - * - * This policy sets the cudaLimitMallocHeapSize device limit. This value - * can no longer be changed once a kernel using ::malloc()/::free() has - * been run. Subsequent attempts will result in errors unless the device - * is reset via cudaDeviceReset(). See: - * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g05956f16eaa47ef3a4efee84563ccb7d - */ - // TODO alpaka - struct CudaSetLimits - { - template - auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void* - { - cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize); - return nullptr; - } - - static void resetMemPool() - { - cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U); - cudaGetLastError(); // cudaDeviceSetLimit() usually fails if any - // kernel before used ::malloc(), so let's - // clear the error state - } - - static auto classname() -> std::string - { - return "CudaSetLimits"; - } - }; - - } // namespace ReservePoolPolicies -} // namespace mallocMC - -#endif diff --git a/thirdParty/mallocMC/src/include/mallocMC/version.hpp b/thirdParty/mallocMC/src/include/mallocMC/version.hpp deleted file mode 100644 index 89b9424047..0000000000 --- a/thirdParty/mallocMC/src/include/mallocMC/version.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - http://www.icg.tugraz.at/project/mvp - https://www.hzdr.de/crp - - Copyright (C) 2012 Institute for Computer Graphics and Vision, - Graz University of Technology - Copyright (C) 2014-2024 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at - Bernhard Kainz - kainz ( at ) icg.tugraz.at - Michael Kenzel - kenzel ( at ) icg.tugraz.at - Rene Widera - r.widera ( at ) hzdr.de - Axel Huebl - a.huebl ( at ) hzdr.de - Carlchristian Eckert - c.eckert ( at ) hzdr.de - Julian Lenz - j.lenz ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#pragma once - -/** the mallocMC version: major API changes should be reflected here */ -#define MALLOCMC_VERSION_MAJOR 3 -#define MALLOCMC_VERSION_MINOR 0 -#define MALLOCMC_VERSION_PATCH 0 - -/** the mallocMC flavor is used to differentiate the releases of the - * Computational Radiation Physics group (crp) from other releases - * This should be useful to avoid versioning conflicts */ -#define MALLOCMC_FLAVOR "crp" diff --git a/thirdParty/mallocMC/tests/thread-safety/AccessBlock.cpp b/thirdParty/mallocMC/tests/thread-safety/AccessBlock.cpp deleted file mode 100644 index 62811a4416..0000000000 --- a/thirdParty/mallocMC/tests/thread-safety/AccessBlock.cpp +++ /dev/null @@ -1,927 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - - -#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" - -#include "../unit/mocks.hpp" -#include "mallocMC/mallocMC_utils.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock; - -using Dim = alpaka::DimInt<1>; -using Idx = std::uint32_t; - - -constexpr uint32_t pageSize = 1024; -constexpr uint32_t numPages = 4; -// Page table entry size = sizeof(chunkSize) + sizeof(fillingLevel): -constexpr uint32_t pteSize = 4 + 4; -constexpr uint32_t blockSize = numPages * (pageSize + pteSize); - -using MyAccessBlock = AccessBlock, AlignmentPolicy>; -using std::span; - -// Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the -// behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen -// regardless of the underlying access optimisations etc. 
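-// [Editor's note] A minimal sketch of the retry idiom that FillWith (below) and
-// CreateUntilSuccess rely on, assuming only the create() interface used in this
-// file; illustrative, not part of the original sources:
-//     void* p = nullptr;
-//     while(p == nullptr)
-//     {
-//         p = accessBlock->create(acc, chunkSize); // nullptr signals a failed attempt
-//     }
-// Spinning until success makes the fill deterministic even when other threads
-// race for the same chunks.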
- -struct FillWith -{ - template - ALPAKA_FN_ACC auto operator()( - TAcc const& acc, - AccessBlock, AlignmentPolicy>* accessBlock, - uint32_t const chunkSize, - void** result, - uint32_t const size) const -> void - { - std::generate( - result, - result + size, - [&acc, accessBlock, chunkSize]() - { - void* pointer{nullptr}; - while(pointer == nullptr) - { - pointer = accessBlock->create(acc, chunkSize); - } - return pointer; - }); - } -}; - -struct ContentGenerator -{ - uint32_t counter{0U}; - - ALPAKA_FN_ACC auto operator()() -> uint32_t - { - return counter++; - } -}; - -ALPAKA_FN_ACC auto forAll(auto const& acc, auto size, auto functor) -{ - auto const idx0 = alpaka::getIdx(acc)[0]; - auto const numElements = alpaka::getWorkDiv(acc)[0]; - for(uint32_t i = 0; i < numElements; ++i) - { - auto idx = idx0 + i; - if(idx < size) - { - functor(idx); - } - } -} - -struct Create -{ - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSize); }); - } - - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto* chunkSizes) const - { - forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSizes[idx]); }); - } -}; - -struct CreateUntilSuccess -{ - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - }); - } -}; - -struct Destroy -{ - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers) const - { - forAll(acc, pointers.size(), [&](auto idx) { accessBlock->destroy(acc, pointers[idx]); }); - } -}; - -struct IsValid -{ - template - ALPAKA_FN_ACC auto operator()( - TAcc const& acc, - auto* accessBlock, - void** pointers, - bool* results, - uint32_t const size) const - { - std::span tmpPointers(pointers, size); - std::span tmpResults(results, size); - std::transform( - std::begin(tmpPointers), - std::end(tmpPointers), - std::begin(tmpResults), - [&acc, accessBlock](auto pointer) { return accessBlock->isValid(acc, pointer); }); - } -}; - -using Host = alpaka::AccCpuSerial; - -template -struct Buffer -{ - TDevAcc m_devAcc; - TDevHost m_devHost; - - alpaka::Vec m_extents; - - alpaka::Buf m_onDevice; - alpaka::Buf m_onHost; - - Buffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) - : m_devAcc{devAcc} - , m_devHost{devHost} - , m_extents{extents} - , m_onDevice(alpaka::allocBuf(devAcc, m_extents)) - , m_onHost(alpaka::allocBuf(devHost, m_extents)) - { - } -}; - -template -auto makeBuffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) -{ - return Buffer{devHost, devAcc, extents}; -} - -auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) -{ - auto chunkSizes = makeBuffer(devHost, devAcc, 2U); - chunkSizes.m_onHost[0] = 32U; - chunkSizes.m_onHost[1] = 512U; - alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); - return chunkSizes; -} - -auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) -{ - auto pointers = makeBuffer(devHost, devAcc, size); - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); - std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); - alpaka::memcpy(queue, 
pointers.m_onDevice, pointers.m_onHost); - return pointers; -} - -template -auto setup() -{ - alpaka::Platform const platformAcc = {}; - alpaka::Platform> const platformHost = {}; - alpaka::Dev> const devAcc(alpaka::getDevByIdx(platformAcc, 0)); - alpaka::Dev> const devHost(alpaka::getDevByIdx(platformHost, 0)); - alpaka::Queue queue{devAcc}; - return std::make_tuple(platformAcc, platformHost, devAcc, devHost, queue); -} - -template -auto createWorkDiv(auto const& devAcc, auto const numElements, auto... args) -> alpaka::WorkDivMembers -{ - if constexpr(std::is_same_v, alpaka::TagCpuSerial>) - { - return {{1U}, {1U}, {numElements}}; - } - else - { - alpaka::KernelCfg const kernelCfg - = {numElements, 1, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; - return alpaka::getValidWorkDiv(kernelCfg, devAcc, args...); - } -} - -template -auto fillWith(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) -{ - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - alpaka::exec( - queue, - workDivSingleThread, - FillWith{}, - accessBlock, - chunkSize, - alpaka::getPtrNative(pointers.m_onDevice), - pointers.m_extents[0]); - alpaka::wait(queue); - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); -} - -template -auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) -{ - fillWith(queue, accessBlock, chunkSize, pointers); - auto* pointer1 = pointers.m_onHost[0]; - - // Destroy exactly one pointer (i.e. the first). This is non-destructive on the actual values in - // devPointers, so we don't need to wait for the copy before to finish. - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - alpaka::exec( - queue, - workDivSingleThread, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 1U)); - alpaka::wait(queue); - return pointer1; -} - -template -auto freeAllButOneOnFirstPage( - auto& queue, - AccessBlock, AlignmentPolicy>* accessBlock, - auto& pointers) -{ - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); - std::sort(std::begin(tmp), std::end(tmp)); - // This points to the first chunk of page 0. - auto* pointer1 = tmp[0]; - alpaka::wait(queue); - alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); - alpaka::wait(queue); - auto size - = pointers.m_extents[0] / AccessBlock, AlignmentPolicy>::numPages() - 1; - // Delete all other chunks on page 0. 
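- // [Editor's note] The host copy was sorted above, so the first m_extents[0] / numPages entries all lie on page 0; `size` is one less than that, hence the kernel below frees every chunk on page 0 except tmp[0].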
- customExec( - queue, - pointers.m_devAcc, - size, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice) + 1U, size)); - alpaka::wait(queue); - return pointer1; -} - -struct CheckContent -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* content, span pointers, auto* results, auto chunkSize) - const - { - auto const idx0 = alpaka::getIdx(acc)[0]; - auto const numElements = alpaka::getWorkDiv(acc)[0]; - for(uint32_t i = 0; i < numElements; ++i) - { - auto idx = idx0 + i; - if(idx < pointers.size()) - { - auto* begin = reinterpret_cast(pointers[idx]); - auto* end = begin + chunkSize / sizeof(uint32_t); - results[idx] = std::all_of(begin, end, [idx, content](auto val) { return val == content[idx]; }); - } - } - } -}; - -template -auto checkContent( - auto& devHost, - auto& devAcc, - auto& queue, - auto& pointers, - auto& content, - auto& workDiv, - auto const chunkSize) -{ - auto results = makeBuffer(devHost, devAcc, pointers.m_extents[0]); - alpaka::exec( - queue, - workDiv, - CheckContent{}, - alpaka::getPtrNative(content.m_onDevice), - span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), - alpaka::getPtrNative(results.m_onDevice), - chunkSize); - alpaka::wait(queue); - alpaka::memcpy(queue, results.m_onHost, results.m_onDevice); - alpaka::wait(queue); - - - std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); - auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); - - return writtenCorrectly; -} - -struct GetAvailableSlots -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, auto chunkSize, auto* result) const - { - *result = accessBlock->getAvailableSlots(acc, chunkSize); - }; -}; - -template -auto getAvailableSlots(auto* accessBlock, auto& queue, auto const& devHost, auto const& devAcc, auto chunkSize) -{ - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - alpaka::wait(queue); - auto result = makeBuffer(devHost, devAcc, 1U); - alpaka::wait(queue); - alpaka::exec( - queue, - workDivSingleThread, - GetAvailableSlots{}, - accessBlock, - chunkSize, - alpaka::getPtrNative(result.m_onDevice)); - alpaka::wait(queue); - alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); - alpaka::wait(queue); - auto tmp = result.m_onHost[0]; - alpaka::wait(queue); - return tmp; -} - -template -auto pageIndex(AccessBlock, AlignmentPolicy>* accessBlock, auto* pointer) -{ - // This is a bit dirty: What we should do here is enqueue a kernel that calls accessBlock->pageIndex(). - // But we assume that the access block starts with the first page, so the pointer to the first page equals the - // pointer to the access block. Not sure if this is reliable if the pointers are device pointers. 
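- // [Editor's note] Illustrative arithmetic: indexOf divides the byte distance (pointer - start) by the step size, so with T_pageSize == 1024 a pointer 2048 bytes past the block's start maps to page index 2.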
- return mallocMC::indexOf(pointer, accessBlock, T_pageSize); -} - -struct FillAllUpAndWriteToThem -{ - ALPAKA_FN_ACC auto operator()( - auto const& acc, - auto* accessBlock, - auto* content, - span pointers, - auto chunkSize) const - { - auto const idx0 = alpaka::getIdx(acc)[0]; - auto const numElements = alpaka::getWorkDiv(acc)[0]; - for(uint32_t i = 0; i < numElements; ++i) - { - auto idx = idx0 + i; - if(idx < pointers.size()) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - auto* begin = reinterpret_cast(pointers[idx]); - auto* end = begin + chunkSize / sizeof(uint32_t); - std::fill(begin, end, content[idx]); - } - } - } -}; - -struct CreateAndDestroMultipleTimes -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - pointers[idx] = nullptr; - for(uint32_t j = 0; j < idx; ++j) - { - // `.isValid()` is not thread-safe, so we use this direct assessment: - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - accessBlock->destroy(acc, pointers[idx]); - pointers[idx] = nullptr; - } - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - }); - } -}; - -struct OversubscribedCreation -{ - uint32_t oversubscriptionFactor{}; - uint32_t availableSlots{}; - - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - pointers[idx] = nullptr; - for(uint32_t j = 0; j < idx + 1; ++j) - { - // `.isValid()` is not thread-safe, so we use this direct assessment: - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - - // CAUTION: The following lines have cost us more than a working day of debugging! - // If the hardware you're running on has a single program counter for the whole warp, the whole - // warp can't exit the while loop in case of even a single thread requesting another round. - // This implies that if we move the `.destroy()` out of the while loop, all the slots get - // filled up but the owning threads run idle instead of freeing them up again because they are - // waiting for their last companions to give their okay for exiting the loop. This is, of - // course, a hopeless endeavour because all slots are filled (we are vastly oversubscribed in - // this scenario). So, this loop deadlocks and no thread ever exits. - // - // ... at least that's what we believe. If you're reading this comment, we might have been - // wrong about this. - if(pointers[idx] != nullptr) - { - accessBlock->destroy(acc, pointers[idx]); - } - } - pointers[idx] = nullptr; - } - - // We only keep some of the memory. In particular, we keep one chunk less than is available, - // such that threads looking for memory after we've finished can still find some. 
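- // [Editor's note] Worked example: with oversubscriptionFactor == 2, only threads with idx > availableSlots + 1 keep spinning here until success, so at most availableSlots - 2 chunks remain allocated at the end.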
- while(pointers[idx] == nullptr and idx > (oversubscriptionFactor - 1) * availableSlots + 1) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - }); - } -}; - -struct CreateAllChunkSizes -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, span chunkSizes) - const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - pointers[idx] = accessBlock->create(acc, 1U); - - for(auto chunkSize : chunkSizes) - { - accessBlock->destroy(acc, pointers[idx]); - pointers[idx] = nullptr; - - // `.isValid()` is not thread-safe, so we use this direct assessment: - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - } - }); - } -}; - -template -auto customExec(auto& queue, auto const& devAcc, auto const numElements, auto... args) -{ - auto workDiv = createWorkDiv(devAcc, numElements, args...); - alpaka::exec(queue, workDiv, args...); - return workDiv; -} - -TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) -{ - using Acc = alpaka::TagToAcc; - auto [platformAcc, platformHost, devAcc, devHost, queue] = setup(); - auto accessBlockBuf = alpaka::allocBuf(devAcc, alpaka::Vec{1U}); - alpaka::memset(queue, accessBlockBuf, 0x00); - alpaka::wait(queue); - auto* accessBlock = alpaka::getPtrNative(accessBlockBuf); - auto const chunkSizes = createChunkSizes(devHost, devAcc, queue); - auto pointers = createPointers( - devHost, - devAcc, - queue, - getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); - alpaka::wait(queue); - - SECTION("creates second memory somewhere else.") - { - uint32_t const size = 2U; - customExec( - queue, - devAcc, - size, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), size), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); - } - - SECTION("creates memory of different chunk size in different pages.") - { - customExec( - queue, - devAcc, - 2U, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U), - alpaka::getPtrNative(chunkSizes.m_onDevice)); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pageIndex(accessBlock, pointers.m_onHost[0]) != pageIndex(accessBlock, pointers.m_onHost[1])); - } - - SECTION("creates partly for insufficient memory with same chunk size.") - { - uint32_t const size = 2U; - auto* lastFreeChunk = fillAllButOne(queue, accessBlock, chunkSizes.m_onHost[0], pointers); - - // Okay, so here we start the actual test. The situation is the following: - // There is a single chunk available. - // We try to do two allocations. - // So, we expect one to succeed and one to fail. 
- customExec( - queue, - devAcc, - size, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), size), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK( - ((pointers.m_onHost[0] == lastFreeChunk and pointers.m_onHost[1] == nullptr) - or (pointers.m_onHost[1] == lastFreeChunk and pointers.m_onHost[0] == nullptr))); - } - - SECTION("does not race between clean up and create.") - { - fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); - auto freePage = pageIndex(accessBlock, freeAllButOneOnFirstPage(queue, accessBlock, pointers)); - - // Now, pointer1 is the last valid pointer to page 0. Destroying it will clean up the page. - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - - alpaka::exec( - queue, - workDivSingleThread, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); - - alpaka::exec( - queue, - workDivSingleThread, - CreateUntilSuccess{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 1U), - chunkSizes.m_onHost[0]); - - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pageIndex(accessBlock, pointers.m_onHost[0]) == freePage); - } - - SECTION("destroys two pointers of different size.") - { - customExec( - queue, - devAcc, - 2U, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U), - alpaka::getPtrNative(chunkSizes.m_onDevice)); - alpaka::wait(queue); - - customExec( - queue, - devAcc, - 2U, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); - alpaka::wait(queue); - - auto result = makeBuffer(devHost, devAcc, 2U); - customExec( - queue, - devAcc, - 1U, - IsValid{}, - accessBlock, - alpaka::getPtrNative(pointers.m_onDevice), - alpaka::getPtrNative(result.m_onDevice), - result.m_extents[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); - alpaka::wait(queue); - - CHECK(not result.m_onHost[0]); - CHECK(not result.m_onHost[1]); - } - - SECTION("destroys two pointers of same size.") - { - customExec( - queue, - devAcc, - 2U, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - - customExec( - queue, - devAcc, - 2U, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); - alpaka::wait(queue); - - auto result = makeBuffer(devHost, devAcc, 2U); - result.m_onHost[0] = true; - result.m_onHost[1] = true; - alpaka::memcpy(queue, result.m_onDevice, result.m_onHost); - alpaka::wait(queue); - customExec( - queue, - devAcc, - 1U, - IsValid{}, - accessBlock, - alpaka::getPtrNative(pointers.m_onDevice), - alpaka::getPtrNative(result.m_onDevice), - result.m_extents[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); - alpaka::wait(queue); - - CHECK(not result.m_onHost[0]); - CHECK(not result.m_onHost[1]); - } - - SECTION("fills up all chunks in parallel and writes to them.") - { - auto content = makeBuffer( - devHost, - devAcc, - getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); - std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); - std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); - alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); - alpaka::wait(queue); - - auto workDiv = 
customExec( - queue, - devAcc, - pointers.m_extents[0], - FillAllUpAndWriteToThem{}, - accessBlock, - alpaka::getPtrNative(content.m_onDevice), - span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), - chunkSizes.m_onHost[0]); - - alpaka::wait(queue); - - auto writtenCorrectly - = checkContent(devHost, devAcc, queue, pointers, content, workDiv, chunkSizes.m_onHost[0]); - CHECK(writtenCorrectly); - } - - SECTION("destroys all pointers simultaneously.") - { - auto const allSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); - auto const allSlotsOfDifferentSize - = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); - fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); - - customExec( - queue, - devAcc, - pointers.m_extents[0], - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - auto result = makeBuffer(devHost, devAcc, pointers.m_extents[0]); - customExec( - queue, - devAcc, - 1U, - IsValid{}, - accessBlock, - alpaka::getPtrNative(pointers.m_onDevice), - alpaka::getPtrNative(result.m_onDevice), - result.m_extents[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); - alpaka::wait(queue); - - std::span tmpResults(alpaka::getPtrNative(result.m_onHost), result.m_extents[0]); - CHECK(std::none_of(std::cbegin(tmpResults), std::cend(tmpResults), [](auto const val) { return val; })); - - CHECK(getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots); - CHECK( - getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]) - == allSlotsOfDifferentSize); - } - - SECTION("creates and destroys multiple times.") - { - customExec( - queue, - devAcc, - pointers.m_extents[0], - CreateAndDestroMultipleTimes{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); - std::sort(std::begin(tmpPointers), std::end(tmpPointers)); - CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); - } - - SECTION("can handle oversubscription.") - { - uint32_t oversubscriptionFactor = 2U; - auto availableSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); - - // This is oversubscribed but we will only hold keep less than 1/oversubscriptionFactor of the memory in the - // end. - auto manyPointers = makeBuffer(devHost, devAcc, oversubscriptionFactor * availableSlots); - customExec( - queue, - devAcc, - manyPointers.m_extents[0], - OversubscribedCreation{oversubscriptionFactor, availableSlots}, - accessBlock, - span(alpaka::getPtrNative(manyPointers.m_onDevice), manyPointers.m_extents[0]), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, manyPointers.m_onHost, manyPointers.m_onDevice); - alpaka::wait(queue); - - // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a - // nullptr. 
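- // [Editor's note] beginNonNull sits at offset (oversubscriptionFactor - 1) * availableSlots + 1: everything before it must still be nullptr, and everything from it onwards is sorted and checked for uniqueness.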
- std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); - auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1; - - CHECK(std::all_of( - std::begin(tmpManyPointers), - beginNonNull, - [](auto const pointer) { return pointer == nullptr; })); - - std::sort(beginNonNull, std::end(tmpManyPointers)); - CHECK(std::unique(beginNonNull, std::end(tmpManyPointers)) == std::end(tmpManyPointers)); - } - - SECTION("can handle many different chunk sizes.") - { - auto chunkSizes = makeBuffer(devHost, devAcc, pageSize); - std::span chunkSizesSpan(alpaka::getPtrNative(chunkSizes.m_onHost), chunkSizes.m_extents[0]); - std::iota(std::begin(chunkSizesSpan), std::end(chunkSizesSpan), 1U); - alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); - alpaka::wait(queue); - - customExec( - queue, - devAcc, - MyAccessBlock::numPages(), - CreateAllChunkSizes{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), MyAccessBlock::numPages()), - std::span(alpaka::getPtrNative(chunkSizes.m_onDevice), chunkSizes.m_extents[0])); - - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), MyAccessBlock::numPages()); - std::sort(std::begin(tmpPointers), std::end(tmpPointers)); - CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); - } - - SECTION("creates second memory somewhere in multi-page mode.") - { - uint32_t const size = 2U; - customExec( - queue, - devAcc, - size, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), size), - pageSize); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); - } - - alpaka::wait(queue); -} diff --git a/thirdParty/mallocMC/tests/thread-safety/BitField.cpp b/thirdParty/mallocMC/tests/thread-safety/BitField.cpp deleted file mode 100644 index f31decc5f7..0000000000 --- a/thirdParty/mallocMC/tests/thread-safety/BitField.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#include "../unit/mocks.hpp" - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMask; -using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskSize; -using namespace std::chrono_literals; - -// The following test is a particular regression test which (in its current form) requires to be able to stop a -// thread from the outside. This is not possible through the alpaka interface. Thus, we resort to running this with -// `std::jthread` but we have to ensure that the alpaka atomics work. Thus, the ifdef. -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED - -TEST_CASE("Threaded BitMask") -{ - BitMask mask{}; - - SECTION("finds first free bit despite noise.") - { - // This is a regression test. An earlier version of this algorithm used to fail when other parts of the bit - // mask experienced frequent change during the search. We simulate this by letting a "noise thread" toggle - // unimportant bits while a "search thread" tries to find the first free bit. While the noise does not affect - // the result, a previous version of the algorithm does fail under these conditions (as verified by - // experiment). - - uint32_t const firstFreeIndex = GENERATE(0U, 1U, 10U); - for(uint32_t i = 0; i < firstFreeIndex; ++i) - { - mask.set(accSerial, i); - } - - uint32_t result = BitMaskSize; - auto noiseThread = std::jthread( - [&mask, firstFreeIndex](std::stop_token const& stopToken) - { - while(not stopToken.stop_requested()) - { - for(uint32_t i = firstFreeIndex + 1; i < BitMaskSize; ++i) - { - mask.flip(accSerial, i); - } - } - }); - std::thread([&mask, &result]() { result = mask.firstFreeBit(accSerial); }).join(); - std::this_thread::sleep_for(20ms); - CHECK(result == firstFreeIndex); - noiseThread.request_stop(); - } -} -#else -TEST_CASE("Threaded BitMask", "[!shouldfail]") -{ - FAIL("The Threaded BitMask regression test could not run because it is only available with the std::threads " - "backend enabled."); -} -#endif // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED diff --git a/thirdParty/mallocMC/tests/thread-safety/Scatter.cpp b/thirdParty/mallocMC/tests/thread-safety/Scatter.cpp deleted file mode 100644 index 95fc5a7ac6..0000000000 --- a/thirdParty/mallocMC/tests/thread-safety/Scatter.cpp +++ /dev/null @@ -1,859 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - - -#include "mallocMC/creationPolicies/Scatter.hpp" - -#include "../unit/mocks.hpp" -#include "mallocMC/alignmentPolicies/Shrink.hpp" -#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" -#include "mallocMC/device_allocator.hpp" -#include "mallocMC/distributionPolicies/Noop.hpp" -#include "mallocMC/mallocMC_utils.hpp" -#include "mallocMC/oOMPolicies/ReturnNull.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using Dim = alpaka::DimInt<1>; -using Idx = std::uint32_t; - - -constexpr uint32_t pageSize = 1024; -constexpr uint32_t numPages = 4; -// Page table entry size = sizeof(chunkSize) + sizeof(fillingLevel): -constexpr uint32_t pteSize = 8 + 4 + 4; -constexpr uint32_t blockSize = numPages * (pageSize + pteSize); - -template -struct ScatterHeapConfig -{ - static constexpr uint32_t const accessblocksize = T_blockSize; - static constexpr uint32_t const pagesize = T_pageSize; - static constexpr uint32_t const wastefactor = T_wasteFactor; - static constexpr uint32_t const regionsize = 1U; - static constexpr bool const resetfreedpages = true; -}; - -using MyScatter = mallocMC::CreationPolicies::Scatter< - ScatterHeapConfig>::AlignmentAwarePolicy>; -using MyDeviceAllocator = mallocMC::DeviceAllocator< - MyScatter, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::AlignmentPolicies::Shrink<>>; - -using std::span; - -// Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the -// behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen -// regardless of the underlying access optimisations etc. 
- -struct FillWith -{ - template - ALPAKA_FN_ACC auto operator()( - TAcc const& acc, - auto* accessBlock, - uint32_t const chunkSize, - void** result, - uint32_t const size) const -> void - { - std::generate( - result, - result + size, - [&acc, accessBlock, chunkSize]() - { - void* pointer{nullptr}; - while(pointer == nullptr) - { - pointer = accessBlock->create(acc, chunkSize); - } - return pointer; - }); - } -}; - -struct ContentGenerator -{ - uint32_t counter{0U}; - - ALPAKA_FN_ACC auto operator()() -> uint32_t - { - return counter++; - } -}; - -ALPAKA_FN_ACC auto forAll(auto const& acc, auto size, auto functor) -{ - auto const idx0 = alpaka::getIdx(acc)[0]; - auto const numElements = alpaka::getWorkDiv(acc)[0]; - for(uint32_t i = 0; i < numElements; ++i) - { - auto idx = idx0 + i; - if(idx < size) - { - functor(idx); - } - } -} - -struct Create -{ - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSize); }); - } - - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto* chunkSizes) const - { - forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSizes[idx]); }); - } -}; - -struct CreateUntilSuccess -{ - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - }); - } -}; - -struct Destroy -{ - template - ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers) const - { - forAll(acc, pointers.size(), [&](auto idx) { accessBlock->destroy(acc, pointers[idx]); }); - } -}; - -using Host = alpaka::AccCpuSerial; - -template -struct Buffer -{ - TDevAcc m_devAcc; - TDevHost m_devHost; - - alpaka::Vec m_extents; - - alpaka::Buf m_onDevice; - alpaka::Buf m_onHost; - - Buffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) - : m_devAcc{devAcc} - , m_devHost{devHost} - , m_extents{extents} - , m_onDevice(alpaka::allocBuf(devAcc, m_extents)) - , m_onHost(alpaka::allocBuf(devHost, m_extents)) - { - } -}; - -template -auto makeBuffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) -{ - return Buffer{devHost, devAcc, extents}; -} - -auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) -{ - auto chunkSizes = makeBuffer(devHost, devAcc, 2U); - chunkSizes.m_onHost[0] = 32U; - chunkSizes.m_onHost[1] = 512U; - alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); - return chunkSizes; -} - -auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) -{ - auto pointers = makeBuffer(devHost, devAcc, size); - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); - std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); - alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); - return pointers; -} - -template -auto setup() -{ - alpaka::Platform const platformAcc = {}; - alpaka::Platform> const platformHost = {}; - alpaka::Dev> const devAcc(alpaka::getDevByIdx(platformAcc, 0)); - alpaka::Dev> const devHost(alpaka::getDevByIdx(platformHost, 0)); - alpaka::Queue queue{devAcc}; - return std::make_tuple(platformAcc, platformHost, devAcc, devHost, queue); -} - -template -auto createWorkDiv(auto const& 
devAcc, auto const numElements, auto... args) -> alpaka::WorkDivMembers -{ - if constexpr(std::is_same_v, alpaka::TagCpuSerial>) - { - return {{1U}, {1U}, {numElements}}; - } - else - { - alpaka::KernelCfg const kernelCfg - = {numElements, 1, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; - return alpaka::getValidWorkDiv(kernelCfg, devAcc, args...); - } -} - -template -auto fillWith(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) -{ - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - alpaka::exec( - queue, - workDivSingleThread, - FillWith{}, - accessBlock, - chunkSize, - alpaka::getPtrNative(pointers.m_onDevice), - pointers.m_extents[0]); - alpaka::wait(queue); - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); -} - -template -auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) -{ - fillWith(queue, accessBlock, chunkSize, pointers); - auto* pointer1 = pointers.m_onHost[0]; - - // Destroy exactly one pointer (i.e. the first). This is non-destructive on the actual values in - // devPointers, so we don't need to wait for the copy before to finish. - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - alpaka::exec( - queue, - workDivSingleThread, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 1U)); - alpaka::wait(queue); - return pointer1; -} - -template -auto freeAllButOneOnFirstPage(auto& queue, auto* accessBlock, auto& pointers) -{ - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); - std::sort(std::begin(tmp), std::end(tmp)); - // This points to the first chunk of page 0. - auto* pointer1 = tmp[0]; - alpaka::wait(queue); - alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); - alpaka::wait(queue); - auto size = pointers.m_extents[0] / numPages - 1; - // Delete all other chunks on page 0. 
- customExec( - queue, - pointers.m_devAcc, - size, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice) + 1U, size)); - alpaka::wait(queue); - return pointer1; -} - -struct CheckContent -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* content, span pointers, auto* results, auto chunkSize) - const - { - auto const idx0 = alpaka::getIdx(acc)[0]; - auto const numElements = alpaka::getWorkDiv(acc)[0]; - for(uint32_t i = 0; i < numElements; ++i) - { - auto idx = idx0 + i; - if(idx < pointers.size()) - { - auto* begin = reinterpret_cast(pointers[idx]); - auto* end = begin + chunkSize / sizeof(uint32_t); - results[idx] = std::all_of(begin, end, [idx, content](auto val) { return val == content[idx]; }); - } - } - } -}; - -template -auto checkContent( - auto& devHost, - auto& devAcc, - auto& queue, - auto& pointers, - auto& content, - auto& workDiv, - auto const chunkSize) -{ - auto results = makeBuffer(devHost, devAcc, pointers.m_extents[0]); - alpaka::exec( - queue, - workDiv, - CheckContent{}, - alpaka::getPtrNative(content.m_onDevice), - span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), - alpaka::getPtrNative(results.m_onDevice), - chunkSize); - alpaka::wait(queue); - alpaka::memcpy(queue, results.m_onHost, results.m_onDevice); - alpaka::wait(queue); - - - std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); - auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); - - return writtenCorrectly; -} - -struct GetAvailableSlots -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, auto chunkSize, auto* result) const - { - *result = accessBlock->getAvailableSlots(acc, chunkSize); - }; -}; - -template -auto getAvailableSlots(auto* accessBlock, auto& queue, auto const& devHost, auto const& devAcc, auto chunkSize) -{ - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - alpaka::wait(queue); - auto result = makeBuffer(devHost, devAcc, 1U); - alpaka::wait(queue); - alpaka::exec( - queue, - workDivSingleThread, - GetAvailableSlots{}, - accessBlock, - chunkSize, - alpaka::getPtrNative(result.m_onDevice)); - alpaka::wait(queue); - alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); - alpaka::wait(queue); - auto tmp = result.m_onHost[0]; - alpaka::wait(queue); - return tmp; -} - -auto pageIndex(auto accessBlock, auto* pointer) -{ - // This is a bit dirty: What we should do here is enqueue a kernel that calls accessBlock->pageIndex(). - // But we assume that the access block starts with the first page, so the pointer to the first page equals the - // pointer to the access block. Not sure if this is reliable if the pointers are device pointers. 
- return mallocMC::indexOf(pointer, alpaka::getPtrNative(accessBlock), pageSize); -} - -struct FillAllUpAndWriteToThem -{ - ALPAKA_FN_ACC auto operator()( - auto const& acc, - auto* accessBlock, - auto* content, - span pointers, - auto chunkSize) const - { - auto const idx0 = alpaka::getIdx(acc)[0]; - auto const numElements = alpaka::getWorkDiv(acc)[0]; - for(uint32_t i = 0; i < numElements; ++i) - { - auto idx = idx0 + i; - if(idx < pointers.size()) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - auto* begin = reinterpret_cast(pointers[idx]); - auto* end = begin + chunkSize / sizeof(uint32_t); - std::fill(begin, end, content[idx]); - } - } - } -}; - -struct CreateAndDestroMultipleTimes -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - pointers[idx] = nullptr; - for(uint32_t j = 0; j < idx; ++j) - { - // `.isValid()` is not thread-safe, so we use this direct assessment: - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - accessBlock->destroy(acc, pointers[idx]); - pointers[idx] = nullptr; - } - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - }); - } -}; - -struct OversubscribedCreation -{ - uint32_t oversubscriptionFactor{}; - uint32_t availableSlots{}; - - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - pointers[idx] = nullptr; - for(uint32_t j = 0; j < idx + 1; ++j) - { - // `.isValid()` is not thread-safe, so we use this direct assessment: - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - - // CAUTION: The following lines have cost us more than a working day of debugging! - // If the hardware you're running on has a single program counter for the whole warp, the whole - // warp can't exit the while loop in case of even a single thread requesting another round. - // This implies that if we move the `.destroy()` out of the while loop, all the slots get - // filled up but the owning threads run idle instead of freeing them up again because they are - // waiting for their last companions to give their okay for exiting the loop. This is, of - // course, a hopeless endeavour because all slots are filled (we are vastly oversubscribed in - // this scenario). So, this loop deadlocks and no thread ever exits. - // - // ... at least that's what we believe. If you're reading this comment, we might have been - // wrong about this. - if(pointers[idx] != nullptr) - { - accessBlock->destroy(acc, pointers[idx]); - } - } - pointers[idx] = nullptr; - } - - // We only keep some of the memory. In particular, we keep one chunk less than is available, - // such that threads looking for memory after we've finished can still find some. 
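- // Concretely: only indices above (oversubscriptionFactor - 1) * availableSlots + 1 pass the
- // condition below and keep retrying until they succeed; every other entry remains nullptr.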
- while(pointers[idx] == nullptr and idx > (oversubscriptionFactor - 1) * availableSlots + 1) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - }); - } -}; - -struct CreateAllChunkSizes -{ - ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, span chunkSizes) - const - { - forAll( - acc, - pointers.size(), - [&](auto idx) - { - pointers[idx] = accessBlock->create(acc, 1U); - - for(auto chunkSize : chunkSizes) - { - accessBlock->destroy(acc, pointers[idx]); - pointers[idx] = nullptr; - - // `.isValid()` is not thread-safe, so we use this direct assessment: - while(pointers[idx] == nullptr) - { - pointers[idx] = accessBlock->create(acc, chunkSize); - } - } - }); - } -}; - -template -auto customExec(auto& queue, auto const& devAcc, auto const numElements, auto... args) -{ - auto workDiv = createWorkDiv(devAcc, numElements, args...); - alpaka::exec(queue, workDiv, args...); - return workDiv; -} - -TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags) -{ - using Acc = alpaka::TagToAcc; - auto [platformAcc, platformHost, devAcc, devHost, queue] = setup(); - auto accessBlockBuf = alpaka::allocBuf(devAcc, alpaka::Vec{1U}); - auto dataBuf = alpaka::allocBuf, Idx>( - devAcc, - alpaka::Vec{1U}); - MyScatter::initHeap( - devAcc, - queue, - alpaka::getPtrNative(accessBlockBuf), - static_cast(alpaka::getPtrNative(dataBuf)), - blockSize); - alpaka::wait(queue); - auto* accessBlock = alpaka::getPtrNative(accessBlockBuf); - auto const chunkSizes = createChunkSizes(devHost, devAcc, queue); - auto pointers = createPointers( - devHost, - devAcc, - queue, - getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); - alpaka::wait(queue); - - SECTION("creates second memory somewhere else.") - { - uint32_t const size = 2U; - customExec( - queue, - devAcc, - size, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), size), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); - } - - SECTION("creates memory of different chunk size in different pages.") - { - customExec( - queue, - devAcc, - 2U, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U), - alpaka::getPtrNative(chunkSizes.m_onDevice)); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pageIndex(dataBuf, pointers.m_onHost[0]) != pageIndex(dataBuf, pointers.m_onHost[1])); - } - - SECTION("creates partly for insufficient memory with same chunk size.") - { - uint32_t const size = 2U; - auto* lastFreeChunk = fillAllButOne(queue, accessBlock, chunkSizes.m_onHost[0], pointers); - - // Okay, so here we start the actual test. The situation is the following: - // There is a single chunk available. - // We try to do two allocations. - // So, we expect one to succeed and one to fail. 
- customExec( - queue, - devAcc, - size, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), size), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK( - ((pointers.m_onHost[0] == lastFreeChunk and pointers.m_onHost[1] == nullptr) - or (pointers.m_onHost[1] == lastFreeChunk and pointers.m_onHost[0] == nullptr))); - } - - SECTION("does not race between clean up and create.") - { - fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); - auto freePage = pageIndex(dataBuf, freeAllButOneOnFirstPage(queue, accessBlock, pointers)); - - // Now, pointer1 is the last valid pointer to page 0. Destroying it will clean up the page. - alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; - - alpaka::exec( - queue, - workDivSingleThread, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); - - alpaka::exec( - queue, - workDivSingleThread, - CreateUntilSuccess{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 1U), - chunkSizes.m_onHost[0]); - - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pageIndex(dataBuf, pointers.m_onHost[0]) == freePage); - } - - SECTION("destroys two pointers of different size.") - { - auto workDiv = customExec( - queue, - devAcc, - 2U, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U), - alpaka::getPtrNative(chunkSizes.m_onDevice)); - alpaka::wait(queue); - - auto const beforeDestroy0 - = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); - auto const beforeDestroy1 - = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); - - alpaka::exec( - queue, - workDiv, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); - alpaka::wait(queue); - - auto const afterDestroy0 = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); - auto const afterDestroy1 = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); - - CHECK(beforeDestroy0 < afterDestroy0); - CHECK(beforeDestroy1 < afterDestroy1); - } - - SECTION("destroys two pointers of same size.") - { - auto workDiv = customExec( - queue, - devAcc, - 2U, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U), - chunkSizes.m_onHost[0]); - alpaka::wait(queue); - - auto const beforeDestroy = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); - - alpaka::exec( - queue, - workDiv, - Destroy{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); - alpaka::wait(queue); - - auto const afterDestroy = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); - CHECK(beforeDestroy == afterDestroy - 2U); - } - - SECTION("fills up all chunks in parallel and writes to them.") - { - auto content = makeBuffer( - devHost, - devAcc, - getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); - std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); - std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); - alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); - alpaka::wait(queue); - - auto workDiv = customExec( - queue, - devAcc, - pointers.m_extents[0], - FillAllUpAndWriteToThem{}, - accessBlock, - alpaka::getPtrNative(content.m_onDevice), - 
span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]),
- chunkSizes.m_onHost[0]);
-
- alpaka::wait(queue);
-
- auto writtenCorrectly
- = checkContent(devHost, devAcc, queue, pointers, content, workDiv, chunkSizes.m_onHost[0]);
- CHECK(writtenCorrectly);
- }
-
- SECTION("destroys all pointers simultaneously.")
- {
- auto const allSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]);
- auto const allSlotsOfDifferentSize
- = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]);
- fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers);
-
- customExec(
- queue,
- devAcc,
- pointers.m_extents[0],
- Destroy{},
- accessBlock,
- span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]));
- alpaka::wait(queue);
-
- alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
- alpaka::wait(queue);
-
- CHECK(getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots);
- CHECK(
- getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1])
- == allSlotsOfDifferentSize);
- }
-
- SECTION("creates and destroys multiple times.")
- {
- customExec(
- queue,
- devAcc,
- pointers.m_extents[0],
- CreateAndDestroMultipleTimes{},
- accessBlock,
- span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]),
- chunkSizes.m_onHost[0]);
- alpaka::wait(queue);
- alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
- alpaka::wait(queue);
-
- std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
- std::sort(std::begin(tmpPointers), std::end(tmpPointers));
- CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers));
- }
-
- SECTION("can handle oversubscription.")
- {
- uint32_t oversubscriptionFactor = 2U;
- auto availableSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]);
-
- // This is oversubscribed but we will only keep less than 1/oversubscriptionFactor of the memory in the
- // end.
- auto manyPointers = makeBuffer(devHost, devAcc, oversubscriptionFactor * availableSlots);
- customExec(
- queue,
- devAcc,
- manyPointers.m_extents[0],
- OversubscribedCreation{oversubscriptionFactor, availableSlots},
- accessBlock,
- span(alpaka::getPtrNative(manyPointers.m_onDevice), manyPointers.m_extents[0]),
- chunkSizes.m_onHost[0]);
- alpaka::wait(queue);
-
- alpaka::memcpy(queue, manyPointers.m_onHost, manyPointers.m_onDevice);
- alpaka::wait(queue);
-
- // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a
- // nullptr.
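- // That is, all entries before index (oversubscriptionFactor - 1) * availableSlots + 1 should end
- // up as nullptr, and everything from there on should be a unique, valid pointer.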
- std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); - auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1; - - CHECK(std::all_of( - std::begin(tmpManyPointers), - beginNonNull, - [](auto const pointer) { return pointer == nullptr; })); - - std::sort(beginNonNull, std::end(tmpManyPointers)); - CHECK(std::unique(beginNonNull, std::end(tmpManyPointers)) == std::end(tmpManyPointers)); - } - - SECTION("creates second memory somewhere in multi-page mode.") - { - uint32_t const size = 2U; - customExec( - queue, - devAcc, - size, - Create{}, - accessBlock, - span(alpaka::getPtrNative(pointers.m_onDevice), size), - pageSize); - alpaka::wait(queue); - - alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); - alpaka::wait(queue); - - CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); - } - - alpaka::wait(queue); -} diff --git a/thirdParty/mallocMC/tests/unit/AccessBlock.cpp b/thirdParty/mallocMC/tests/unit/AccessBlock.cpp deleted file mode 100644 index d44a5fb4ef..0000000000 --- a/thirdParty/mallocMC/tests/unit/AccessBlock.cpp +++ /dev/null @@ -1,532 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz, Rene Widera - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" - -#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" -#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" -#include "mallocMC/mallocMC_utils.hpp" -#include "mocks.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -template -struct TestableAccessBlock - : mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock -{ -public: - TestableAccessBlock() = default; - using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::blockSize; - using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::pageSize; - using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::wasteFactor; - using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock:: - resetfreedpages; -}; - -using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskStorageType; -using mallocMC::CreationPolicies::FlatterScatterAlloc::PageInterpretation; - -constexpr uint32_t const pageTableEntrySize = 8U; -constexpr uint32_t const pageSize1 = 1024U; -constexpr uint32_t const pageSize2 = 4096U; - -using AccessBlocks = std::tuple< - TestableAccessBlock, AlignmentPolicy>, - TestableAccessBlock, AlignmentPolicy>, - TestableAccessBlock, AlignmentPolicy>, - TestableAccessBlock, AlignmentPolicy>>; - -template -auto fillWith(TestableAccessBlock& accessBlock, uint32_t const chunkSize) - -> std::vector -{ - std::vector pointers(accessBlock.getAvailableSlots(accSerial, chunkSize)); - std::generate( - std::begin(pointers), - std::end(pointers), - [&accessBlock, chunkSize]() - { - void* pointer = accessBlock.create(accSerial, chunkSize); - REQUIRE(pointer != nullptr); - return pointer; - }); - return pointers; -} - -template -struct SelectivelyWastedHeapConfig : HeapConfig -{ - ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( - auto const& /*acc*/, - uint32_t const chunkSize, - uint32_t const numBytes) - { - auto currentWasteFactor = (numBytes == T_allowedToWasteNumBytes) ? T_wasteFactor : 1U; - return (chunkSize >= numBytes && chunkSize <= currentWasteFactor * numBytes); - } -}; - -TEMPLATE_LIST_TEST_CASE("AccessBlock", "", AccessBlocks) -{ - using AccessBlock = TestType; - constexpr auto const blockSize = AccessBlock::blockSize; - constexpr auto const pageSize = AccessBlock::pageSize; - - AccessBlock accessBlock{}; - - SECTION("knows its number of pages.") - { - // The overhead from the metadata is small enough that this just happens to round down to the correct values. - // If you choose weird numbers, it might no longer. - CHECK(accessBlock.numPages() == blockSize / pageSize); - } - - SECTION("knows its available slots.") - { - uint32_t const chunkSize = GENERATE(1U, 2U, 32U, 57U, 1024U); - // This is not exactly true. It is only true because the largest chunk size we chose above is exactly the size - // of one page. In general, this number would be fractional for larger than page size chunks but I don't want - // to bother right now: - uint32_t slotsPerPage = chunkSize < pageSize ? 
PageInterpretation::numChunks(chunkSize) : 1U; - - uint32_t numOccupied = GENERATE(0U, 1U, 10U); - uint32_t actualNumOccupied = numOccupied; - for(uint32_t i = 0; i < numOccupied; ++i) - { - if(accessBlock.create(accSerial, chunkSize) == nullptr) - { - actualNumOccupied--; - } - } - - auto totalSlots = accessBlock.numPages() * slotsPerPage; - if(totalSlots > actualNumOccupied) - { - CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == totalSlots - actualNumOccupied); - } - else - { - CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == 0U); - } - } - - constexpr uint32_t const chunkSize = 32U; - - SECTION("creates") - { - SECTION("no nullptr if memory is available.") - { - // This is not a particularly hard thing to do because any uninitialised pointer that could be returned is - // most likely not exactly the nullptr. We just leave this in as it currently doesn't hurt anybody to keep - // it. - CHECK(accessBlock.create(accSerial, chunkSize) != nullptr); - } - - SECTION("memory that can be written to and read from.") - { - uint32_t const arbitraryValue = 42; - auto* ptr = static_cast(accessBlock.create(accSerial, chunkSize)); - REQUIRE(ptr != nullptr); - *ptr = arbitraryValue; - CHECK(*ptr == arbitraryValue); - } - - SECTION("second memory somewhere else.") - { - CHECK(accessBlock.create(accSerial, chunkSize) != accessBlock.create(accSerial, chunkSize)); - } - - SECTION("memory of different chunk size in different pages.") - { - constexpr uint32_t const chunkSize2 = 512U; - REQUIRE(chunkSize != chunkSize2); - // To be precise, the second call will actually return a nullptr if there is only a single page (which is - // one of the test cases at the time of writing). But that technically passes this test, too. - - CHECK( - accessBlock.pageIndex(accessBlock.create(accSerial, chunkSize)) - != accessBlock.pageIndex(accessBlock.create(accSerial, chunkSize2))); - } - - SECTION("nullptr if there's no page with fitting chunk size") - { - // This requests one chunk of a different chunk size for each page. As a new page is required each time, - // all pages have a chunk size set at the end. And none of those is `chunkSize`. 
- for(uint32_t index = 0; index < accessBlock.numPages(); ++index) - { - auto const differentChunkSize = chunkSize + 1U + index; - REQUIRE(chunkSize != differentChunkSize); - accessBlock.create(accSerial, differentChunkSize); - } - - CHECK(accessBlock.create(accSerial, chunkSize) == nullptr); - } - - SECTION("nullptr if all pages have full filling level.") - { - fillWith(accessBlock, chunkSize); - CHECK(accessBlock.create(accSerial, chunkSize) == nullptr); - } - - SECTION("last remaining chunk.") - { - auto pointers = fillWith(accessBlock, chunkSize); - uint32_t const index = GENERATE(0U, 1U, 42U); - void* pointer = pointers[std::min(index, static_cast(pointers.size()) - 1)]; - accessBlock.destroy(accSerial, pointer); - CHECK(accessBlock.create(accSerial, chunkSize) == pointer); - } - - SECTION("memory larger than page size.") - { - if(accessBlock.numPages() >= 2U) - { - CHECK(accessBlock.isValid(accSerial, accessBlock.create(accSerial, 2U * pageSize))); - } - } - - SECTION("nullptr if chunkSize is larger than total available memory in pages.") - { - // larger than the available memory but in some cases smaller than the block size even after subtracting - // the space for the page table: - uint32_t const excessiveChunkSize = accessBlock.numPages() * pageSize + 1U; - CHECK(accessBlock.create(accSerial, excessiveChunkSize) == nullptr); - } - - SECTION("in the correct place for larger than page size.") - { - // we want to allocate 2 pages: - if(accessBlock.numPages() > 1U) - { - auto pointers = fillWith(accessBlock, pageSize); - std::sort(std::begin(pointers), std::end(pointers)); - - // Now, we free two contiguous chunks such that there is one deterministic spot wherefrom our request - // can be served. - uint32_t index = GENERATE(0U, 1U, 5U); - index = std::min(index, static_cast(pointers.size()) - 2U); - accessBlock.destroy(accSerial, pointers[index]); - accessBlock.destroy(accSerial, pointers[index + 1]); - - // Must be exactly where we free'd the pages: - CHECK( - accessBlock.pageIndex(accessBlock.create(accSerial, 2U * pageSize)) - == static_cast(index)); - } - } - - SECTION("a pointer and knows it's valid afterwards.") - { - void* pointer = accessBlock.create(accSerial, chunkSize); - CHECK(accessBlock.isValid(accSerial, pointer)); - } - - SECTION("the last pointer in page and its allocation does not reach into the bit field.") - { - auto slots = accessBlock.getAvailableSlots(accSerial, chunkSize); - // Find the last allocation on the first page: - auto pointers = fillWith(accessBlock, chunkSize); - std::sort(std::begin(pointers), std::end(pointers)); - auto lastOfPage0 = pointers[slots / accessBlock.numPages() - 1]; - - // Free the first bit of the bit field by destroying the first allocation in the first page: - accessBlock.destroy(accSerial, pointers[0]); - REQUIRE(not accessBlock.isValid(accSerial, pointers[0])); - - // Write all ones to the last of the first page: If there is an overlap between the region of the last - // chunk and the bit field, our recently free'd first chunk will have its bit set by this operation. - char* begin = reinterpret_cast(lastOfPage0); - auto* end = begin + chunkSize; - std::fill(begin, end, 255U); - - // Now, we try to allocate one more chunk. It must be the one we free'd before. - CHECK(accessBlock.create(accSerial, chunkSize) == pointers[0]); - REQUIRE(accessBlock.isValid(accSerial, pointers[0])); - } - - SECTION("and writes something very close to page size.") - { - // This is a regression test. 
The original version of the code started to use multi-page mode when numBytes
- // >= pageSize. That is too late because if we're not in multi-page mode, we need to leave some space for
- // the bit mask. Thus, the following test would corrupt the bit mask, if we were to allocate this in
- // chunked mode.
-
-#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
- REQUIRE(sizeof(BitMaskStorageType<>) > 1U);
- auto localChunkSize = pageSize - 1U;
- auto slots = accessBlock.getAvailableSlots(accSerial, localChunkSize);
- auto pointer = accessBlock.create(accSerial, localChunkSize);
- REQUIRE(slots == accessBlock.getAvailableSlots(accSerial, localChunkSize) + 1);
- memset(pointer, 0, localChunkSize);
- CHECK_NOTHROW(accessBlock.destroy(accSerial, pointer));
-#else
- SUCCEED("This bug actually never had any observable behaviour in NDEBUG mode because the corrupted bit "
- "mask is never read again.");
-#endif // NDEBUG
- }
-
- SECTION("with waste factor")
- {
- constexpr uint32_t const wastefactor = 3U;
- TestableAccessBlock, AlignmentPolicy> wastedAccessBlock{};
- auto pointers = fillWith(wastedAccessBlock, chunkSize);
-
- auto smallerChunkSize = chunkSize / (wastefactor - 1U);
- REQUIRE(smallerChunkSize < chunkSize);
-
- wastedAccessBlock.destroy(accSerial, pointers[0]);
-
- // Some consistency checks: Interpreting as an access block without waste factor, we'll surely have no
- // available memory for this chunk size.
- REQUIRE(
- reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, smallerChunkSize)
- == 0U);
- REQUIRE(
- reinterpret_cast(&wastedAccessBlock)->create(accSerial, smallerChunkSize) == nullptr);
-
- SECTION("knows its available slots.")
- {
- CHECK(wastedAccessBlock.getAvailableSlots(accSerial, smallerChunkSize) == 1U);
- }
-
- SECTION("creates a smaller chunk size.")
- {
- CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == pointers[0]);
- }
-
- SECTION("fails to create too many smaller chunks.")
- {
- CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == pointers[0]);
- CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == nullptr);
- }
-
- SECTION("is not misled by mixing above and below multi-page threshold.")
- {
- auto const aboveMultiPageThreshold = pageSize - 2 * sizeof(BitMaskStorageType<>);
- auto const belowMultiPageThreshold = aboveMultiPageThreshold / (wastefactor - 1U);
- for(auto const pointer : pointers)
- {
- // free one page we want to operate on
- if(wastedAccessBlock.isValid(accSerial, pointer) and wastedAccessBlock.pageIndex(pointer) == 0U)
- {
- wastedAccessBlock.destroy(accSerial, pointer);
- }
- }
- REQUIRE(wastedAccessBlock.getAvailableSlots(accSerial, belowMultiPageThreshold) == 2U);
- REQUIRE(wastedAccessBlock.getAvailableSlots(accSerial, aboveMultiPageThreshold) == 1U);
-
- // This allocates in multi-page mode.
- CHECK(wastedAccessBlock.pageIndex(wastedAccessBlock.create(accSerial, aboveMultiPageThreshold)) == 0U);
- // This tries to allocate in chunked mode but the waste factor would allow creating on the just
- CHECK(wastedAccessBlock.create(accSerial, aboveMultiPageThreshold) == nullptr); - } - } - - SECTION("with waste function") - { - constexpr uint32_t const wastefactor = 3U; - constexpr uint32_t const selectedNumBytes = mallocMC::ceilingDivision(chunkSize, wastefactor); - TestableAccessBlock< - SelectivelyWastedHeapConfig, - AlignmentPolicy> - wastedAccessBlock{}; - auto pointers = fillWith(wastedAccessBlock, chunkSize); - - auto notSelectedNumBytes = chunkSize / (wastefactor - 1U); - - // Okay, so we want a scenario where both selectedNumBytes and notSelectedNumBytes are within the range of - // the waste factor but only for selectedNumBytes we'll actually get a waste-factor-like behaviour. - REQUIRE(selectedNumBytes < chunkSize); - REQUIRE(selectedNumBytes * wastefactor >= chunkSize); - REQUIRE(selectedNumBytes < notSelectedNumBytes); - REQUIRE(notSelectedNumBytes < chunkSize); - - wastedAccessBlock.destroy(accSerial, pointers[0]); - - // Some consistency checks: Interpreting as an access block without waste factor, we'll surely have no - // available memory for these chunk sizes. - REQUIRE( - reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, notSelectedNumBytes) - == 0U); - REQUIRE( - reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, selectedNumBytes) - == 0U); - REQUIRE( - reinterpret_cast(&wastedAccessBlock)->create(accSerial, selectedNumBytes) == nullptr); - REQUIRE( - reinterpret_cast(&wastedAccessBlock)->create(accSerial, notSelectedNumBytes) == nullptr); - - SECTION("knows its available slots.") - { - CHECK(wastedAccessBlock.getAvailableSlots(accSerial, selectedNumBytes) == 1U); - CHECK(wastedAccessBlock.getAvailableSlots(accSerial, notSelectedNumBytes) == 0U); - } - - SECTION("creates a smaller chunk size.") - { - CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); - CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == pointers[0]); - } - - SECTION("fails to create too many smaller chunks.") - { - CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); - CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); - CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == pointers[0]); - CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == nullptr); - } - } - } - - SECTION("destroys") - { - void* pointer = accessBlock.create(accSerial, chunkSize); - REQUIRE(accessBlock.isValid(accSerial, pointer)); - - SECTION("a pointer thereby invalidating it.") - { - accessBlock.destroy(accSerial, pointer); - CHECK(not accessBlock.isValid(accSerial, pointer)); - } - - SECTION("the whole page if last pointer is destroyed.") - { - REQUIRE(chunkSize != pageSize); - REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages() - 1); - accessBlock.destroy(accSerial, pointer); - CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); - } - - SECTION("not the whole page if there still exists a valid pointer.") - { - REQUIRE(chunkSize != pageSize); - auto unOccupiedPages = accessBlock.numPages(); - void* newPointer{nullptr}; - // We can't be sure which page is used for any allocation, so we allocate again and again until we have hit - // a page that already has an allocation: - while(accessBlock.getAvailableSlots(accSerial, pageSize) != unOccupiedPages) - { - unOccupiedPages = accessBlock.getAvailableSlots(accSerial, pageSize); - newPointer = accessBlock.create(accSerial, chunkSize); - } - accessBlock.destroy(accSerial, newPointer); 
- CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == unOccupiedPages); - } - - SECTION("one slot without touching the others.") - { - // this won't be touched: - accessBlock.create(accSerial, chunkSize); - auto originalSlots = accessBlock.getAvailableSlots(accSerial, chunkSize); - accessBlock.destroy(accSerial, pointer); - CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == originalSlots + 1U); - } - - SECTION("no invalid pointer but throws instead.") - { -#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) - pointer = nullptr; - CHECK_THROWS( - accessBlock.destroy(accSerial, pointer), - std::runtime_error{"Attempted to destroy an invalid pointer!"}); -#endif // NDEBUG - } - - SECTION("pointer for larger than page size") - { - if(accessBlock.numPages() > 1U) - { - accessBlock.destroy(accSerial, pointer); - REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); - - pointer = accessBlock.create(accSerial, 2U * pageSize); - REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages() - 2); - REQUIRE(accessBlock.isValid(accSerial, pointer)); - - accessBlock.destroy(accSerial, pointer); - - SECTION("thereby invalidating it.") - { - CHECK(not accessBlock.isValid(accSerial, pointer)); - } - - SECTION("thereby freeing up their pages.") - { - CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); - } - } - } - - SECTION("and doesn't reset the page.") - { - auto& unresettingAccessBlock = *reinterpret_cast< - TestableAccessBlock, AlignmentPolicy>*>( - &accessBlock); - auto const differentChunkSize = GENERATE(17, 2048); - REQUIRE(differentChunkSize != chunkSize); - auto const slots = unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize); - - unresettingAccessBlock.destroy(accSerial, pointer); - CHECK(unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize) == slots); - } - - SECTION("and always resets the page for larger than page size.") - { - auto& unresettingAccessBlock = *reinterpret_cast< - TestableAccessBlock, AlignmentPolicy>*>( - &accessBlock); - auto const differentChunkSize = GENERATE(17, 2048); - auto const slots = unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize); - auto* largePointer = accessBlock.create(accSerial, pageSize); - if(largePointer != nullptr) - { - REQUIRE(differentChunkSize != chunkSize); - - unresettingAccessBlock.destroy(accSerial, largePointer); - CHECK(unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize) == slots); - } - } - } -} diff --git a/thirdParty/mallocMC/tests/unit/BitField.cpp b/thirdParty/mallocMC/tests/unit/BitField.cpp deleted file mode 100644 index e791289156..0000000000 --- a/thirdParty/mallocMC/tests/unit/BitField.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#include "mallocMC/mallocMC_utils.hpp" -#include "mocks.hpp" - -#include -#include -#include - -#include -#include -#include -#include - -#include - -using mallocMC::CreationPolicies::FlatterScatterAlloc::BitFieldFlatImpl; -using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskImpl; - -using BitMaskSizes = std::tuple< - std::integral_constant, // NOLINT(*magic-number*) - std::integral_constant, // NOLINT(*magic-number*) - std::integral_constant>; // NOLINT(*magic-number*) - -TEMPLATE_LIST_TEST_CASE("BitMask", "", BitMaskSizes) -{ - constexpr uint32_t const BitMaskSize = TestType::value; - using BitMask = BitMaskImpl; - BitMask mask{}; - - SECTION("is initialised to 0.") - { - CHECK(mask == 0U); - } - - SECTION("can have individual bits read.") - { - for(uint32_t i = 0; i < BitMaskSize; ++i) - { - CHECK(mask(accSerial, i) == false); - } - } - - SECTION("allows to write individual bits.") - { - for(uint32_t i = 0; i < BitMaskSize; ++i) - { - mask.set(accSerial, i); - CHECK(mask(accSerial, i)); - } - } - - SECTION("allows to unset individual bits afterwards.") - { - for(uint32_t i = 0; i < BitMaskSize; ++i) - { - mask.set(accSerial, i); - for(uint32_t j = 0; j < BitMaskSize; ++j) - { - CHECK(mask(accSerial, j) == (i == j)); - } - mask.unset(accSerial, i); - } - } - - - SECTION("knows the first free bit.") - { - mask.flip(accSerial); - uint32_t const index = GENERATE(0, 3); - mask.flip(accSerial, index); - CHECK(mask.firstFreeBit(accSerial, BitMaskSize) == index); - } - - SECTION("returns BitMaskSize as first free bit if there is none.") - { - mask.flip(accSerial); - CHECK(mask.firstFreeBit(accSerial, BitMaskSize) == BitMaskSize); - } - - SECTION("knows the first free bit with startIndex.") - { - mask.set(accSerial); - uint32_t index1 = GENERATE(0, 5); - uint32_t index2 = GENERATE(0, 11); - if(index1 > index2) - { - std::swap(index1, index2); - } - uint32_t const startIndex = GENERATE(0, 4, 5, 6); - mask.unset(accSerial, index1); - mask.unset(accSerial, index2); - // This is the currently implemented algorithm and could be considered overspecifying the result. - // The minimal requirement we should have is that firstFreeBit is an element of {index1, index2}. - CHECK(mask.firstFreeBit(accSerial, BitMaskSize, startIndex) == ((startIndex == index2) ? index2 : index1)); - } -} - -TEMPLATE_LIST_TEST_CASE("BitFieldFlat", "", BitMaskSizes) -{ - constexpr uint32_t const BitMaskSize = TestType::value; - using BitMask = BitMaskImpl; - using BitFieldFlat = BitFieldFlatImpl; - - // This is potentially larger than we actually need but that's okay: - constexpr uint32_t const numChunks = 256U; - constexpr uint32_t const numMasks = mallocMC::ceilingDivision(numChunks, BitMaskSize); - BitMask data[numMasks]; - - SECTION("knows its only free bit.") - { - uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); - for(auto& mask : data) - { - mask.set(accSerial); - } - data[index / BitMaskSize].unset(accSerial, index % BitMaskSize); - - // Just to be sure: The masks look as expected. 
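- // (Every bit is set except the one at `index`.)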
- for(uint32_t j = 0; j < numMasks; ++j)
- {
- for(uint32_t i = 0; i < BitMaskSize; ++i)
- {
- REQUIRE(data[j](accSerial, i) == (j * BitMaskSize + i != index));
- }
- }
-
- BitFieldFlat field{data};
-
- CHECK(field.firstFreeBit(accSerial, numChunks) == index);
- }
-
- SECTION("knows a free bit if later ones are free, too.")
- {
- uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1);
- for(auto& mask : std::span{static_cast(data), index / BitMaskSize})
- {
- mask.set(accSerial);
- }
- for(uint32_t i = 0; i < index % BitMaskSize; ++i)
- {
- data[index / BitMaskSize].set(accSerial, i);
- }
-
- BitFieldFlat field{data};
-
- CHECK(field.firstFreeBit(accSerial, numChunks) >= index);
- }
-
- SECTION("knows its first free bit for different numChunks.")
- {
- auto localNumChunks = numChunks / GENERATE(1, 2, 3);
- std::span localData{static_cast(data), mallocMC::ceilingDivision(localNumChunks, BitMaskSize)};
- uint32_t const index = GENERATE(0, 1, 10, 12);
- for(auto& mask : localData)
- {
- mask.set(accSerial);
- }
- localData[index / BitMaskSize].unset(accSerial, index % BitMaskSize);
-
- BitFieldFlat field{localData};
-
- CHECK(field.firstFreeBit(accSerial, numChunks) == index);
- }
-
- SECTION("sets a bit.")
- {
- BitFieldFlat field{data};
- uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1);
- field.set(accSerial, index);
- for(uint32_t i = 0; i < numChunks; ++i)
- {
- CHECK(field.get(accSerial, i) == (i == index));
- }
- }
-
- SECTION("sets two bits.")
- {
- BitFieldFlat field{data};
- uint32_t const firstIndex = GENERATE(0, 1, numChunks / 2, numChunks - 1);
- uint32_t const secondIndex = GENERATE(2, numChunks / 3, numChunks / 2, numChunks - 1);
- field.set(accSerial, firstIndex);
- field.set(accSerial, secondIndex);
- for(uint32_t i = 0; i < numChunks; ++i)
- {
- CHECK(field.get(accSerial, i) == (i == firstIndex || i == secondIndex));
- }
- }
-
- SECTION("returns numChunks if no free bit is found.")
- {
- BitFieldFlat field{data};
- for(uint32_t i = 0; i < numChunks; ++i)
- {
- field.set(accSerial, i);
- }
- CHECK(field.firstFreeBit(accSerial, numChunks) == numChunks);
- }
-
- SECTION("returns numChunks if free bit is not valid.")
- {
- BitFieldFlat field{data};
- uint32_t const numValidBits = GENERATE(1, numChunks / 2, numChunks - 1);
- for(uint32_t i = 0; i < numValidBits; ++i)
- {
- // We are filling up all valid bits.
- field.set(accSerial, i);
- }
- CHECK(field.firstFreeBit(accSerial, numValidBits) == numChunks);
- }
-}
diff --git a/thirdParty/mallocMC/tests/unit/PageInterpretation.cpp b/thirdParty/mallocMC/tests/unit/PageInterpretation.cpp
deleted file mode 100644
index 21d6c5983d..0000000000
--- a/thirdParty/mallocMC/tests/unit/PageInterpretation.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- mallocMC: Memory Allocator for Many Core Architectures.
- - Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Julian Johannes Lenz - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" - -#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" -#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" -#include "mallocMC/mallocMC_utils.hpp" -#include "mocks.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMask; -using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskSize; -using mallocMC::CreationPolicies::FlatterScatterAlloc::DataPage; -using mallocMC::CreationPolicies::FlatterScatterAlloc::PageInterpretation; -using std::distance; - -template -constexpr std::array const - chunkSizesForReportingTests{1, 2, 4, 5, 10, 11, 31, 32, 512}; // NOLINT(*magic-number*) - -template -constexpr std::array const expectedNumChunksForReportingTests{}; - -template<> -constexpr std::array const - expectedNumChunksForReportingTests<32U>{908, 480, 248, 199, 100, 92, 32, 31, 1}; // NOLINT(*magic-number*) - -template<> -constexpr std::array const - expectedNumChunksForReportingTests<64U>{904, 480, 248, 198, 100, 91, 32, 31, 1}; // NOLINT(*magic-number*) - -TEST_CASE("PageInterpretation") -{ - constexpr uint32_t const pageSize = 1024U; - constexpr uint32_t const chunkSize = 32U; - DataPage data{}; - PageInterpretation page{data, chunkSize}; - - SECTION("refers to the same data it was created with.") - { - CHECK(&data == page.chunkPointer(0)); - } - - SECTION("returns start of data as first chunk.") - { - CHECK(page.chunkPointer(0) == &data); - } - - SECTION("computes correct number of chunks.") - { - for(uint32_t i = 0U; i < chunkSizesForReportingTests.size(); ++i) - { - CHECK( - PageInterpretation::numChunks(chunkSizesForReportingTests[i]) - == expectedNumChunksForReportingTests[i]); - } - } - - SECTION("jumps by chunkSize between indices.") - { - for(auto i = 0U; i < (pageSize / chunkSize) - 1; ++i) - { - CHECK( - distance( - reinterpret_cast(page.chunkPointer(i)), - reinterpret_cast(page.chunkPointer(i + 1))) - == chunkSize); - } - } - - SECTION("knows the maximal bit field size.") - { - CHECK( - page.maxBitFieldSize() - == mallocMC::ceilingDivision(PageInterpretation::numChunks(1U), BitMaskSize) - * sizeof(BitMask)); - CHECK( - PageInterpretation::maxBitFieldSize() - 
== mallocMC::ceilingDivision(PageInterpretation::numChunks(32U), BitMaskSize) - * sizeof(BitMask)); - CHECK( - PageInterpretation::maxBitFieldSize() - == mallocMC::ceilingDivision(PageInterpretation::numChunks(16U), BitMaskSize) - * sizeof(BitMask)); - CHECK( - PageInterpretation::maxBitFieldSize() - == mallocMC::ceilingDivision(PageInterpretation::numChunks(17U), BitMaskSize) - * sizeof(BitMask)); - } - - SECTION("reports numChunks that fit the page.") - { - CHECK( - page.numChunks() * chunkSize - + static_cast(mallocMC::ceilingDivision(page.numChunks(), BitMaskSize) * sizeof(BitMask)) - <= pageSize); - } - - SECTION("knows correct bit field size.") - { - uint32_t const numChunks = GENERATE(2, BitMaskSize - 1, BitMaskSize, 2 * BitMaskSize); - uint32_t localChunkSize = pageSize / numChunks; - PageInterpretation localPage{data, localChunkSize}; - CHECK(localPage.bitFieldSize() == sizeof(BitMask) * mallocMC::ceilingDivision(numChunks, BitMaskSize)); - } -} - -TEST_CASE("PageInterpretation.create") -{ - // Such that we can fit up to four levels of hierarchy in there: - constexpr uint32_t const pageSize - = BitMaskSize * BitMaskSize * BitMaskSize + static_cast(BitMaskSize * sizeof(BitMask)); - // This might be a lot of memory up to a typical stack's size. Let's save us some trouble and create it on the - // heap. - auto actualData = std::make_unique>(); - DataPage& data{*actualData}; - - uint32_t numChunks = GENERATE(BitMaskSize, BitMaskSize * BitMaskSize); - // CAUTION: Only works for full bit masks: - uint32_t chunkSize = (pageSize - (numChunks / BitMaskSize) * sizeof(BitMask)) / numChunks; - PageInterpretation page{data, chunkSize}; - - SECTION("returns a pointer to within the data.") - { - auto* pointer = page.create(accSerial); - CHECK( - std::distance(reinterpret_cast(page.chunkPointer(0)), reinterpret_cast(pointer)) - < std::distance( - reinterpret_cast(page.chunkPointer(0)), - reinterpret_cast(page.bitFieldStart()))); - } - - SECTION("returns a pointer to the start of a chunk.") - { - auto* pointer = page.create(accSerial); - CHECK( - std::distance(reinterpret_cast(page.chunkPointer(0)), reinterpret_cast(pointer)) % chunkSize - == 0U); - } - - SECTION("returns nullptr if everything is full.") - { - for(auto& mask : page.bitField()) - { - mask.set(accSerial); - } - auto* pointer = page.create(accSerial); - CHECK(pointer == nullptr); - } - - SECTION("can provide numChunks pieces of memory and returns nullptr afterwards.") - { - for(uint32_t i = 0; i < page.numChunks(); ++i) - { - auto* pointer = page.create(accSerial); - CHECK(pointer != nullptr); - } - auto* pointer = page.create(accSerial); - CHECK(pointer == nullptr); - } - - SECTION("updates bit field.") - { - BitMask& mask{page.bitField().getMask(0)}; - REQUIRE(mask.none()); - auto* pointer = page.create(accSerial); - auto const index = page.chunkNumberOf(pointer); - CHECK(mask(accSerial, index)); - } -} - -TEST_CASE("PageInterpretation.destroy") -{ - // Such that we can fit up to four levels of hierarchy in there: - constexpr uint32_t const pageSize - = BitMaskSize * BitMaskSize * BitMaskSize * BitMaskSize - + BitMaskSize * BitMaskSize * BitMaskSize * static_cast(sizeof(BitMask)); - // This is more than 8MB which is a typical stack's size. Let's save us some trouble and create it on the heap. 
- std::unique_ptr> actualData{new DataPage}; - DataPage& data{*actualData}; - - uint32_t numChunks = GENERATE(BitMaskSize * BitMaskSize, BitMaskSize); - uint32_t chunkSize = pageSize / numChunks; - PageInterpretation page{data, chunkSize}; - auto* pointer = page.create(accSerial); - -#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) - SECTION("throws if given an invalid pointer.") - { - pointer = nullptr; - CHECK_THROWS( - page.destroy(accSerial, pointer), - throw std::runtime_error{"Attempted to destroy an invalid pointer! Either the pointer does not point " - "to a valid chunk or it is not marked as allocated."}); - } - - SECTION("allows pointers to anywhere in the chunk.") - { - // This test documents the state as is. We haven't defined this outcome as a requirement but if we change - // it, we might still want to be aware of this because users might want to be informed. - pointer = reinterpret_cast(reinterpret_cast(pointer) + chunkSize / 2); - CHECK_NOTHROW(page.destroy(accSerial, pointer)); - } -#endif // NDEBUG - - SECTION("only ever unsets (and never sets) bits in top-level bit mask.") - { - // We extract the position of the mask before destroying the pointer because technically speaking the whole - // concept of a mask doesn't apply anymore after that pointer was destroyed because that will automatically - // free the page. - auto mask = page.bitField().getMask(0); - auto value = mask; - page.destroy(accSerial, pointer); - CHECK(mask <= value); - } - - - SECTION("cleans up in bit field region of page") - { - // This is larger than any thread would be allowed to write. Threads would only write in the region up to - // `page.numChunks() * chunkSize` not up until `pageSize`. We still do that to have a better overview over - // what was actually deleted. - memset(std::begin(data.data), std::numeric_limits::max(), pageSize); - - uint32_t maxBitFieldSize = 0U; - uint32_t uncleanedSize = 0U; - SECTION("without explicit minimal chunk size") - { - maxBitFieldSize = page.maxBitFieldSize(); // NOLINT(*static*) - - SECTION("fully.") - { - uncleanedSize = 0U; - page.cleanupFull(); - } - - SECTION("only unused.") - { - uncleanedSize = page.bitFieldSize(); - page.cleanupUnused(); - } - } - - SECTION("with explicit minimal chunk size") - { - auto* localPage = reinterpret_cast*>(&page); // NOLINT(*magic-number*) - maxBitFieldSize = localPage->maxBitFieldSize(); // NOLINT(*static*) - - SECTION("fully.") - { - uncleanedSize = 0U; - localPage->cleanupFull(); - } - - SECTION("only unused.") - { - uncleanedSize = localPage->bitFieldSize(); - localPage->cleanupUnused(); - } - } - - for(uint32_t i = 0; i < pageSize; ++i) - { - CHECK( - data.data[i] - == ((i < pageSize - maxBitFieldSize) or (i >= pageSize - uncleanedSize) - ? std::numeric_limits::max() - : 0)); - } - } -} - -// NOLINTEND(*widening*) diff --git a/thirdParty/mallocMC/tests/unit/PageTable.cpp b/thirdParty/mallocMC/tests/unit/PageTable.cpp deleted file mode 100644 index b0ea806cd7..0000000000 --- a/thirdParty/mallocMC/tests/unit/PageTable.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. 
-
- Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf
-
- Author(s): Julian Johannes Lenz
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp"
-
-#include
-
-using mallocMC::CreationPolicies::FlatterScatterAlloc::PageTable;
-
-constexpr uint32_t const numPages = 3;
-
-TEST_CASE("PageTable")
-{
- PageTable pageTable{};
-
- SECTION("initialises chunk sizes to 0.")
- {
- for(auto const& chunkSize : pageTable.chunkSizes)
- {
- CHECK(chunkSize == 0U);
- }
- }
-
- SECTION("initialises filling levels to 0.")
- {
- for(auto const& fillingLevel : pageTable.fillingLevels)
- {
- CHECK(fillingLevel == 0U);
- }
- }
-}
diff --git a/thirdParty/mallocMC/tests/unit/mocks.hpp b/thirdParty/mallocMC/tests/unit/mocks.hpp
deleted file mode 100644
index b1764d1302..0000000000
--- a/thirdParty/mallocMC/tests/unit/mocks.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- mallocMC: Memory Allocator for Many Core Architectures.
-
- Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf
-
- Author(s): Julian Johannes Lenz
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-// This is very hacky: AccCpuSerial (and in general all Accelerators) is very reluctant to be instantiated, so we do
-// it the old-school way and simply malloc some memory pretending to be that accelerator. Let's hope that null-ing it
The final class only has one mutable data member, so that's probably not half bad but I -// didn't go through all those hundreds of base classes. Usually, we only need the time anyways. -inline auto constructAcc() -{ - using Acc = alpaka::AccCpuSerial, size_t>; - void* myPointer = malloc(sizeof(Acc)); - memset(myPointer, 0U, sizeof(Acc)); - return static_cast(myPointer); -} - -// -static inline auto const accPointer = constructAcc(); -static inline auto const& accSerial = *accPointer; - -template -struct HeapConfig -{ - static constexpr auto const accessblocksize = T_blockSize; - static constexpr auto const pagesize = T_pageSize; - static constexpr auto const wastefactor = T_wasteFactor; - static constexpr auto const resetfreedpages = T_resetfreedpages; - - ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( - auto const& /*acc*/, - uint32_t const chunkSize, - uint32_t const numBytes) - { - return (chunkSize >= numBytes && chunkSize <= T_wasteFactor * numBytes); - } -}; - -struct AlignmentPolicy -{ - struct Properties - { - static constexpr uint32_t const dataAlignment = 1U; - }; -}; From 6d77d8c48ea850516f4ffbf1160e42d56da390ea Mon Sep 17 00:00:00 2001 From: Third Party Date: Thu, 20 Feb 2025 16:13:14 +0100 Subject: [PATCH 2/4] Squashed 'thirdParty/mallocMC/' content from commit c63b20ba73 git-subtree-dir: thirdParty/mallocMC git-subtree-split: c63b20ba7388d5d0857259cb992179b01990db24 --- .clang-format | 165 ++ .clang-tidy | 13 + .github/workflows/ci.yml | 46 + .gitignore | 37 + .pre-commit-config.yaml | 40 + .yamllint | 6 + .zenodo.json | 64 + CHANGELOG.md | 220 +++ CMakeLists.txt | 121 ++ CONTRIBUTING.md | 20 + INSTALL.md | 71 + LICENSE | 40 + README.md | 89 ++ Usage.md | 162 ++ cmake/CPM_0.40.2.cmake | 1280 +++++++++++++++ cmake/add_controlled.cmake | 75 + cmake/package-lock.cmake | 47 + cmake/tools.cmake | 74 + examples/CMakeLists.txt | 33 + examples/getAvailableSlots/CMakeLists.txt | 35 + examples/getAvailableSlots/source/main.cpp | 154 ++ examples/native-cuda/CMakeLists.txt | 31 + examples/native-cuda/source/main.cu | 104 ++ examples/vectorAdd/CMakeLists.txt | 36 + examples/vectorAdd/source/main.cpp | 249 +++ include/mallocMC/alignmentPolicies/Noop.hpp | 69 + include/mallocMC/alignmentPolicies/Shrink.hpp | 151 ++ include/mallocMC/allocator.hpp | 242 +++ .../creationPolicies/FlatterScatter.hpp | 495 ++++++ .../FlatterScatter/AccessBlock.hpp | 858 ++++++++++ .../FlatterScatter/BitField.hpp | 533 ++++++ .../FlatterScatter/DataPage.hpp | 42 + .../FlatterScatter/PageInterpretation.hpp | 344 ++++ .../FlatterScatter/wrappingLoop.hpp | 73 + .../creationPolicies/GallatinCuda.hpp | 178 +++ .../mallocMC/creationPolicies/OldMalloc.hpp | 92 ++ include/mallocMC/creationPolicies/Scatter.hpp | 1422 +++++++++++++++++ include/mallocMC/device_allocator.hpp | 122 ++ .../mallocMC/distributionPolicies/Noop.hpp | 77 + .../distributionPolicies/XMallocSIMD.hpp | 194 +++ include/mallocMC/mallocMC.cuh | 184 +++ include/mallocMC/mallocMC.hpp | 59 + .../mallocMC/mallocMC_allocator_handle.hpp | 65 + include/mallocMC/mallocMC_constraints.hpp | 91 ++ include/mallocMC/mallocMC_hostclass.hpp | 33 + include/mallocMC/mallocMC_traits.hpp | 39 + include/mallocMC/mallocMC_utils.hpp | 216 +++ .../oOMPolicies/BadAllocException.hpp | 78 + include/mallocMC/oOMPolicies/ReturnNull.hpp | 61 + .../reservePoolPolicies/AlpakaBuf.hpp | 65 + .../reservePoolPolicies/CudaSetLimits.hpp | 85 + include/mallocMC/reservePoolPolicies/Noop.hpp | 60 + include/mallocMC/version.hpp | 48 + test/CMakeLists.txt | 19 + 
test/multithreaded/CMakeLists.txt | 72 + test/multithreaded/source/AccessBlock.cpp | 927 +++++++++++ test/multithreaded/source/BitField.cpp | 92 ++ test/multithreaded/source/Scatter.cpp | 859 ++++++++++ test/multithreaded/source/mocks.hpp | 76 + test/unit/CMakeLists.txt | 72 + test/unit/source/AccessBlock.cpp | 655 ++++++++ test/unit/source/Allocator.cpp | 62 + test/unit/source/BitField.cpp | 247 +++ test/unit/source/PageInterpretation.cpp | 316 ++++ test/unit/source/PageTable.cpp | 54 + test/unit/source/mocks.hpp | 76 + 66 files changed, 12715 insertions(+) create mode 100644 .clang-format create mode 100644 .clang-tidy create mode 100644 .github/workflows/ci.yml create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 .yamllint create mode 100644 .zenodo.json create mode 100644 CHANGELOG.md create mode 100644 CMakeLists.txt create mode 100644 CONTRIBUTING.md create mode 100644 INSTALL.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 Usage.md create mode 100644 cmake/CPM_0.40.2.cmake create mode 100644 cmake/add_controlled.cmake create mode 100644 cmake/package-lock.cmake create mode 100644 cmake/tools.cmake create mode 100644 examples/CMakeLists.txt create mode 100644 examples/getAvailableSlots/CMakeLists.txt create mode 100644 examples/getAvailableSlots/source/main.cpp create mode 100644 examples/native-cuda/CMakeLists.txt create mode 100644 examples/native-cuda/source/main.cu create mode 100644 examples/vectorAdd/CMakeLists.txt create mode 100644 examples/vectorAdd/source/main.cpp create mode 100644 include/mallocMC/alignmentPolicies/Noop.hpp create mode 100644 include/mallocMC/alignmentPolicies/Shrink.hpp create mode 100644 include/mallocMC/allocator.hpp create mode 100644 include/mallocMC/creationPolicies/FlatterScatter.hpp create mode 100644 include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp create mode 100644 include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp create mode 100644 include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp create mode 100644 include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp create mode 100644 include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp create mode 100644 include/mallocMC/creationPolicies/GallatinCuda.hpp create mode 100644 include/mallocMC/creationPolicies/OldMalloc.hpp create mode 100644 include/mallocMC/creationPolicies/Scatter.hpp create mode 100644 include/mallocMC/device_allocator.hpp create mode 100644 include/mallocMC/distributionPolicies/Noop.hpp create mode 100644 include/mallocMC/distributionPolicies/XMallocSIMD.hpp create mode 100644 include/mallocMC/mallocMC.cuh create mode 100644 include/mallocMC/mallocMC.hpp create mode 100644 include/mallocMC/mallocMC_allocator_handle.hpp create mode 100644 include/mallocMC/mallocMC_constraints.hpp create mode 100644 include/mallocMC/mallocMC_hostclass.hpp create mode 100644 include/mallocMC/mallocMC_traits.hpp create mode 100644 include/mallocMC/mallocMC_utils.hpp create mode 100644 include/mallocMC/oOMPolicies/BadAllocException.hpp create mode 100644 include/mallocMC/oOMPolicies/ReturnNull.hpp create mode 100644 include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp create mode 100644 include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp create mode 100644 include/mallocMC/reservePoolPolicies/Noop.hpp create mode 100644 include/mallocMC/version.hpp create mode 100644 test/CMakeLists.txt create mode 100644 test/multithreaded/CMakeLists.txt create mode 100644 
test/multithreaded/source/AccessBlock.cpp create mode 100644 test/multithreaded/source/BitField.cpp create mode 100644 test/multithreaded/source/Scatter.cpp create mode 100644 test/multithreaded/source/mocks.hpp create mode 100644 test/unit/CMakeLists.txt create mode 100644 test/unit/source/AccessBlock.cpp create mode 100644 test/unit/source/Allocator.cpp create mode 100644 test/unit/source/BitField.cpp create mode 100644 test/unit/source/PageInterpretation.cpp create mode 100644 test/unit/source/PageTable.cpp create mode 100644 test/unit/source/mocks.hpp diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..7249ac3a43 --- /dev/null +++ b/.clang-format @@ -0,0 +1,165 @@ +# General options +Language: Cpp +Standard: c++20 +DisableFormat: false +AccessModifierOffset: -4 +AlignAfterOpenBracket: AlwaysBreak +AlignArrayOfStructures: None +AlignConsecutiveAssignments: false +AlignConsecutiveBitFields: false +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Never +AllowAllArgumentsOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BitFieldColonSpacing: Both +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: All +BreakBeforeBraces: Allman +BreakBeforeConceptDeclarations: Always +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeComma +BreakInheritanceList: BeforeComma +BreakStringLiterals: true +ColumnLimit: 119 +CommentPragmas: '^ COMMENT pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: Always +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: false +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: AfterHash +IndentRequiresClause: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: true +IntegerLiteralSeparator: + Binary: 4 + Decimal: 3 + DecimalMinDigits: 7 + Hex: 4 +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 2 +NamespaceIndentation: All +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 # default made explicit here +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 # default made explicit here +PenaltyReturnTypeOnItsOwnLine: 1000 +PointerAlignment: Left +PPIndentWidth: -1 # follow IndentWidth +QualifierAlignment: Custom +QualifierOrder: ['friend', 'static', 'inline', 'constexpr', 'type', 
'const', 'volatile', 'restrict']
+ReferenceAlignment: Pointer # follow PointerAlignment
+ReflowComments: true
+RemoveBracesLLVM: false
+RemoveSemicolon: false
+RequiresClausePosition: WithPreceding
+RequiresExpressionIndentation: OuterScope
+ShortNamespaceLines: 0
+SortIncludes: true
+SortUsingDeclarations: Lexicographic
+SeparateDefinitionBlocks: Always
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: false
+SpaceAroundPointerQualifiers: Default # follow PointerAlignment
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: Never
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+TabWidth: 4
+UseCRLF: false
+UseTab: Never
+# Project specific options
+#AttributeMacros: []
+#ForEachMacros: []
+#IfMacros: []
+IncludeCategories:
+  # Local headers (in "") above all else
+  - Regex: '"([A-Za-z0-9.\/-_])+"'
+    Priority: 1
+  # "alpaka/foo.hpp" after local headers (occur inside alpaka)
+  - Regex: '"alpaka/([A-Za-z0-9.\/-_])+"'
+    Priority: 2
+  # after local headers (occur outside alpaka in examples and test)
+  - Regex: '<alpaka/([A-Za-z0-9.\/-_])+>'
+    Priority: 3
+  # C++ standard library headers are the last group to be included
+  - Regex: '<([A-Za-z0-9\/-_])+>'
+    Priority: 5
+  # Includes that made it this far are third-party headers and will be placed
+  # below alpaka's includes
+  - Regex: '<([A-Za-z0-9.\/-_])+>'
+    Priority: 4
+# Macros: []
+# NamespaceMacros: []
+StatementAttributeLikeMacros:
+  - 'ALPAKA_DEVICE_VOLATILE'
+  - 'ALPAKA_FN_ACC'
+  - 'ALPAKA_FN_EXTERN'
+  - 'ALPAKA_FN_HOST'
+  - 'ALPAKA_FN_HOST_ACC'
+  - 'ALPAKA_FN_INLINE'
+  - 'ALPAKA_STATIC_ACC_MEM_CONSTANT'
+  - 'ALPAKA_STATIC_ACC_MEM_GLOBAL'
+  - 'ALPAKA_UNROLL'
+  - 'ALPAKA_VECTORIZE_HINT'
+#StatementMacros: []
+#TypenameMacros: []
+#WhitespaceSensitiveMacros: []
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000000..4b599735bb
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,13 @@
+Checks: |
+  *,
+  -*-avoid-c-arrays,
+  -altera*,
+  -*avoid-do-while,
+  -*constant-array-index,
+  -*pointer*arithmetic*,
+  -llvmlibc*,
+  -llvm-header-guard,
+  -fuchsia*,
+  -misc-non-private-member-variables-in-classes,
+  -cppcoreguidelines-pro-type-reinterpret-cast
+HeaderFilterRegex: ".*"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000..afde91e781
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,46 @@
+name: Continuous Integration
+on: [push, pull_request]
+env:
+  CPM_SOURCE_CACHE: ${{ github.workspace }}/cpm_modules
+  CTEST_OUTPUT_ON_FAILURE: 1
+  SKIP: no-commit-to-branch
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.x
+      - uses: pre-commit/action@v3.0.1
+      - uses: pre-commit-ci/lite-action@v1.0.2
+        if: always()
+  cpu-tests:
+    # This action only runs on various CPU backends.
+    # As such, this is not a fully-fledged production-like test.
+    # Hopefully, it will still save us from a few stupid mistakes.
+ runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v3 + with: + path: "**/cpm_modules" + key: ${{ github.workflow }}-cpm-modules-${{ hashFiles('**/CMakeLists.txt', '**/*.cmake') }} + - run: sudo apt update && sudo apt install libboost-all-dev + - run: | + cmake -S. -Bbuild \ + -Dalpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE:BOOL=ON \ + -Dalpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE:BOOL=ON \ + -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE:BOOL=ON \ + -Dalpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE:BOOL=ON \ + -DmallocMC_BUILD_TESTING:BOOL=ON \ + -DmallocMC_BUILD_EXAMPLES:BOOL=ON + - working-directory: build + run: cmake --build . -j + - working-directory: build + run: ctest + - working-directory: build + run: | + for example in examples/[^C]*/mallocMCExample*; do + ./${example} + done diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..00e52ae2fe --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# tmp files +*~ + +# Compiled Object files +*.slo +*.lo +*.o +/build + +# Compiled Dynamic libraries +*.so +*.dylib + +# Compiled Static libraries +*.lai +*.la +*.a + +# netbeans project files +/nbproject/ + +# Code::Blocks project files +/*.cbp +/*.layout + +# Visual Studio Code configuration files +.vscode +.vs + +# JetBrains project files +.idea/ + +# original backup files +*.orig + +.cache +compile_commands.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..881f6bb4d5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +minimum_pre_commit_version: 3.2.0 # necessitated by Lucas-C's hooks +default_install_hook_types: [pre-commit, pre-push] +repos: + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v19.1.7 + hooks: + - id: clang-format + files: \.(cpp|hpp) + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: no-commit-to-branch + args: [-b, dev] + - id: check-merge-conflict + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-toml + - id: check-yaml + args: ["--allow-multiple-documents"] + - id: mixed-line-ending + - id: check-executables-have-shebangs + - id: check-shebang-scripts-are-executable + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.5.5 + hooks: + - id: forbid-tabs + - id: remove-tabs + - id: forbid-crlf + - id: remove-crlf + - repo: meta + hooks: + - id: check-useless-excludes + - repo: https://github.com/google/yamlfmt + rev: v0.15.0 + hooks: + - id: yamlfmt + - repo: https://github.com/adrienverge/yamllint + rev: v1.35.1 + hooks: + - id: yamllint diff --git a/.yamllint b/.yamllint new file mode 100644 index 0000000000..369e6ca9f4 --- /dev/null +++ b/.yamllint @@ -0,0 +1,6 @@ +extends: default +rules: + document-start: disable + truthy: disable + comments: disable + line-length: disable diff --git a/.zenodo.json b/.zenodo.json new file mode 100644 index 0000000000..be594b1e0b --- /dev/null +++ b/.zenodo.json @@ -0,0 +1,64 @@ +{ + "title": "mallocMC - Memory Allocator for Many Core Architectures", + "description": "This project provides a framework for fast memory managers on many core accelerators. 
It is based on alpaka to run on many different accelerators and implements multiple algorithms.",
+  "keywords": [
+    "mallocMC",
+    "CUDA",
+    "manycore",
+    "GPU",
+    "allocator"
+  ],
+  "language": "eng",
+  "access_right": "open",
+  "license": {
+    "id": "MIT"
+  },
+  "creators": [
+    {
+      "name": "Widera, René",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0003-1642-0459"
+    },
+    {
+      "name": "Lenz, Julian",
+      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0001-5250-0005"
+    }
+  ],
+  "contributors": [
+    {
+      "name": "Eckert, Carlchristian",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
+      "orcid": "0000-0002-6459-0842",
+      "type": "Other"
+    },
+    {
+      "name": "Worpitz, Benjamin",
+      "type": "Other"
+    },
+    {
+      "name": "Grund, Alexander",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0002-7196-8452",
+      "type": "Other"
+    },
+    {
+      "name": "Huebl, Axel",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0003-1943-7141",
+      "type": "Other"
+    },
+    {
+      "name": "Gruber, Bernhard Manfred",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, CASUS, CERN",
+      "orcid": "0000-0001-7848-1690",
+      "type": "Other"
+    },
+    {
+      "name": "Bastrakov, Sergei",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0003-3396-6154",
+      "type": "Other"
+    }
+  ]
+}
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000000..fbfd5fd71f
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,220 @@
+Change Log / Release Log for mallocMC
+================================================================
+
+2.5.0crp
+--------
+**Date:** 2021-02-18
+
+This release replaces the native usage of CUDA with alpaka.
+Attention: This release depends on an unreleased [alpaka 0.5.0dev](https://github.com/alpaka-group/alpaka/commit/34870a73ecf702069465aa030fbdf301c4d22c61)
+version before the heavy alpaka namespace refactoring.
+
+### Changes to mallocMC 2.4.0crp
+
+**Features**
+- Port to alpaka #173
+
+**Bug fixes**
+- fix HIP support (warpsize, activemask, compile issues) #182
+- fix data race and printf issue #189
+- fix data races in `Scatter.hpp` #190
+- fix clang cuda compile #192
+
+**Misc:**
+- Added alpaka subtree and switched to C++14 #176
+- Added 3rd party catch.hpp and made CMake find it #179
+- Update documentation after switch to alpaka #194
+- Update .clang-format and apply clang-format #197
+
+Thanks to Bernhard Manfred Gruber and Rene Widera for contributing to this release!
+
+2.4.0crp
+--------
+**Date:** 2020-05-28
+
+This release removes the Boost dependency and switches to C++11.
+
+### Changes to mallocMC 2.3.1crp
+
+**Features**
+ - Cleaning, remove Boost dependency & C++11 Migration #169
+
+**Bug fixes**
+ - Choose the value for the -arch nvcc flag depending on CUDA version #164 #165
+
+**Misc:**
+ - Travis CI: GCC 5.5.0 + CUDA 9.1.85 #170
+ - Adding headers to projects and applied clang-tidy #171
+ - clang-format #172
+
+Thanks to Sergei Bastrakov, Bernhard Manfred Gruber and Axel Huebl for contributing to this release!
+
+2.3.1crp
+--------
+**Date:** 2019-02-14
+
+A critical bug was fixed which could result in an illegal memory access.
+
+### Changes to mallocMC 2.3.0crp
+
+**Bug fixes**
+ - fix illegal memory access in `XMallocSIMD` #150
+
+**Misc:**
+ - CMake: Honor `_ROOT` Env Hints #154
+
+
+2.3.0crp
+--------
+**Date:** 2018-06-11
+
+This release adds support for CUDA 9 and clang's -x cuda frontend and fixes several bugs.
+Global objects have been refactored to separate objects on host and device.
+
+### Changes to mallocMC 2.2.0crp
+
+**Features**
+ - CUDA 9 support #144 #145
+ - clang++ -x cuda support #133
+ - add `destructiveResize` method #136
+ - heap as separate object on host and device, no more globals #116
+ - use `BOOST_STATIC_CONSTEXPR` where possible #109
+
+**Bug fixes**
+ - fix uninitialized pointers #110 #112
+ - fix crash in getAvailableSlots #106 #107
+ - Fix `uint32_t` cstdint #104 #105
+ - fix missing boost include #142
+ - fix includes from C headers #121
+ - fix missing local size change in `finalizeHeap()` #135
+ - check heap pointer in Scatter creation policy #126
+
+**Misc:**
+ - better link usage and install docs #141
+ - self consistent allocator #140
+ - rename some shadowed variables in C++11 mode #108
+ - properly enforce `-Werror` in Travis-CI #128
+ - update Travis-CI image #119
+ - improved docs #125 #127
+
+Thanks to Carlchristian Eckert, René Widera, Axel Huebl and Alexander Grund for contributing to this release!
+
+
+2.2.0crp
+-------------
+**Date:** 2015-09-25
+
+This release fixes some minor bugs that occurred after the release of 2.1.0crp, adds some documentation and improves the interoperability with other projects and build systems.
+We closed all issues documented in
+[Milestone *2.2.0crp: Stabilizing the release*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=5&state=closed)
+
+### Changes to mallocMC 2.1.0crp
+
+**Features**
+ - the interface now provides the host function `HeapInfoVector getHeapLocations()` to obtain information about the location and size of existing mallocMC-heaps #86
+
+**Bug fixes**
+ - the function `getAvailableSlots` was always required in the policy classes, although the implementations might not provide it #89
+
+**Misc:**
+ - the code relied on `__THROW` being defined, which is not available in all compilers #91
+ - the CMake dependency increased to CMake >= 2.8.12.2 #92
+ - a new FindmallocMC.cmake module file is provided at https://github.com/ComputationalRadiationPhysics/cmake-modules #85
+ - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.1.0crp...2.2.0crp
+
+
+2.1.0crp
+-------------
+**Date:** 2015-02-11
+
+This release fixes some bugs that occurred after the release of 2.0.1crp and reduces the interface to improve interoperability with the default CUDA allocator.
+We closed all issues documented in
+[Milestone *New Features*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=3&state=closed)
+
+### Changes to mallocMC 2.0.1crp
+
+**Features**
+ - the possibility to overwrite the default implementation of new/delete and malloc/free was removed #72. **This changes the interface**, since users are now always forced to call `mallocMC::malloc()` and `mallocMC::free()`. This is intended to improve readability and allows using the CUDA allocator inside mallocMC.
+ - the policy *Scatter* now places the onpagetables data structure at the end of a page. This can greatly improve performance when using large pages and `resetfreedpages=true` #80
+
+**Bug fixes**
+ - in the policy *Scatter*, `fullsegments` and `additional_chunks` could grow too large in certain configurations #79
+
+**Misc:**
+ - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.0.1crp...2.1.0crp
+
+
+2.0.1crp
+-------------
+**Date:** 2015-01-13
+
+This release fixes several bugs that occurred after the release of 2.0.0crp.
+We closed all issues documented in
+[Milestone *Bugfixes*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=4&state=closed)
+
+### Changes to mallocMC 2.0.0crp
+
+**Bug fixes**
+ - page table metadata was not correctly initialized with 0 #70
+ - freeing pages would not work under certain circumstances #66
+ - the bitmask in a page table entry could be wrong due to a race condition #62
+ - not all regions were initialized correctly #60
+ - getAvailableSlots could sometimes miss blocks #59
+ - the counter for elements in a page could get too high due to a race condition #61
+ - Out of Memory (OOM) Policy sometimes did not recognize allocation failures correctly #67
+
+**Misc:**
+ - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.0.0crp...2.0.1crp
+
+
+2.0.0crp
+-------------
+**Date:** 2014-06-02
+
+This release introduces mallocMC, which contains the previous algorithm and
+much code from ScatterAlloc 1.0.2crp. The project was renamed due to massive
+restructuring and because the code uses ScatterAlloc as a reference
+algorithm, but can be extended to include other allocators in the future.
+We closed all issues documented in
+[Milestone *Get Lib ready for PIConGPU*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=2&state=closed)
+
+### Changes to ScatterAlloc 1.0.2crp
+
+**Features**
+ - completely split into policies #17
+ - configuration through structs instead of macro #17
+ - function `getAvailableSlots()` #5
+ - selectable data alignment #14
+ - function `finalizeHeap()` #11
+
+**Bug fixes:**
+ - build warning for cmake #33
+
+**Misc:**
+ - verification code and examples #35
+ - install routines #4
+ - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/1.0.2crp...2.0.0crp
+
+
+1.0.2crp
+-------------
+**Date:** 2014-01-07
+
+This is our first bug fix release.
+We closed all issues documented in
+[Milestone *Bug fixes*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=1&state=closed)
+
+### Changes to 1.0.1
+
+**Features:**
+ - added travis-ci.org support for compile tests #7
+
+**Bug fixes:**
+ - broken cmake/compile #1
+ - g++ warnings #10
+ - only N-1 access blocks used instead of N #2
+ - 32bit bug: allocate more than 4GB #12
+
+**Misc:**
+ See the full changes at
+ https://github.com/ComputationalRadiationPhysics/scatteralloc/compare/1.0.1...1.0.2crp
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000000..ab140ce8b5
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,121 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+# ---- Project ----
+
+project(
+  mallocMC
+  VERSION 3.0.0
+  LANGUAGES CXX
+)
+
+# ---- Include guards ----
+
+if(PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
+  message(
+    FATAL_ERROR
+      "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there."
+ ) +endif() + +# ---- Options ---- + +option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF) +option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF) +if (mallocMC_BUILD_TESTING OR mallocMC_BUILD_EXAMPLES) + enable_testing() +endif() +if (mallocMC_BUILD_TESTING) + set(alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE ON CACHE BOOL "" FORCE) +endif() + +# ---- Add dependencies via CPM ---- +# see https://github.com/TheLartians/CPM.cmake for more info + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/CPM_0.40.2.cmake) +CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/cmake/package-lock.cmake) + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/add_controlled.cmake) + +# PackageProject.cmake will be used to make our target installable +add_controlled("PackageProject.cmake" REQUIRED) +add_controlled("alpaka" REQUIRED) + + +# ---- Create library ---- + +# Note: for header-only libraries change all PUBLIC flags to INTERFACE and create an interface +add_library(${PROJECT_NAME} INTERFACE) +set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20) + +if(alpaka_ACC_GPU_CUDA_ENABLE) + add_controlled("Gallatin") + + if (TARGET gallatin::gallatin) + set(mallocMC_HAS_Gallatin_AVAILABLE YES) + else() + set(mallocMC_HAS_Gallatin_AVAILABLE NO) + endif() + + # Gallatin needs some fairly recent compute capability from CUDA. + # CMake defaults to taking the oldest supported by the device + # (https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) + # which can be too old. This leads to compilation errors along the lines of + # + # error: no instance of overloaded function "atomicCAS" matches the argument list + # argument types are: (unsigned short *, unsigned short, unsigned short) + # + # because this overload was only added later (apparently?). + + if ("${CMAKE_CUDA_ARCHITECTURES}" LESS 70) + message( + WARNING + "CUDA architecture detected is too old: ${CMAKE_CUDA_ARCHITECTURES}. " + "If the architecture set is too old, this can lead to compilation errors with Gallatin. " + "If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70." + ) + set(mallocMC_HAS_Gallatin_AVAILABLE NO) + endif() + + if (mallocMC_HAS_Gallatin_AVAILABLE) + target_link_libraries(${PROJECT_NAME} INTERFACE gallatin) + target_compile_definitions(${PROJECT_NAME} INTERFACE mallocMC_HAS_Gallatin_AVAILABLE) + endif() +endif() + +# being a cross-platform target, we enforce standards conformance on MSVC +target_compile_options(${PROJECT_NAME} INTERFACE "$<$:/permissive->") + +target_include_directories( + ${PROJECT_NAME} INTERFACE $ + $ +) +target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka) + + +if(mallocMC_BUILD_TESTING) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake) + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/test ${CMAKE_BINARY_DIR}/test) +endif() + +if(mallocMC_BUILD_EXAMPLES) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake) + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples ${CMAKE_BINARY_DIR}/examples) +endif() + +# ---- Create an installable target ---- +# this allows users to install and find the library via `find_package()`. 
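+#
+# A downstream project can later consume the installed package roughly like this
+# (only a sketch; the exported target name follows from the NAMESPACE argument below):
+#
+#   find_package(mallocMC 3.0.0 CONFIG REQUIRED)
+#   target_link_libraries(yourTarget PRIVATE mallocMC::mallocMC)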
+
+# the location where the project's version header will be placed should match the project's regular
+# header paths
+string(TOLOWER ${PROJECT_NAME}/version.hpp VERSION_HEADER_LOCATION)
+
+packageProject(
+  NAME ${PROJECT_NAME}
+  VERSION ${PROJECT_VERSION}
+  NAMESPACE ${PROJECT_NAME}
+  BINARY_DIR ${PROJECT_BINARY_DIR}
+  INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include
+  INCLUDE_DESTINATION include/${PROJECT_NAME}-${PROJECT_VERSION}
+  VERSION_HEADER "${VERSION_HEADER_LOCATION}"
+  COMPATIBILITY SameMajorVersion
+)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000..64e12b31af
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+# Contributing
+
+## Formatting
+
+Please format your code before opening pull requests using clang-format and the .clang-format file placed in the repository root.
+
+### Visual Studio and CLion
+Support for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
+The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey.
+
+### Bash
+First install clang-format. Instructions for this can be found on the web.
+To format your changes since branching off `dev`, you can run this command in bash:
+```
+git clang-format dev
+```
+To format all code in your working copy, you can run this command in bash:
+```
+find -iname *.cpp -o -iname *.hpp | xargs clang-format -i
+```
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 0000000000..4e06d82097
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,71 @@
+Install
+-------
+### Dependencies
+ - C++20 compiler (clang, gcc, hipcc, icc, nvcc)
+   - *Debian/Ubuntu:* `sudo apt-get install gcc build-essential`
+   - *Arch Linux:* `sudo pacman -S base-devel`
+ - `alpaka` 1.2.0
+   - fetched automatically via CPM during CMake configuration
+ - `boost` >= 1.65.1
+   - dependency of alpaka
+   - *Debian/Ubuntu:* `sudo apt-get install libboost-dev libboost-program-options-dev`
+   - *Arch Linux:* `sudo pacman -S boost`
+   - or download from [http://www.boost.org/](http://sourceforge.net/projects/boost/files/boost/1.55.0/boost_1_55_0.tar.gz/download)
+ - `CMake` >= 3.15
+   - *Debian/Ubuntu:* `sudo apt-get install cmake file cmake-curses-gui`
+   - *Arch Linux:* `sudo pacman -S cmake`
+ - `git` >= 1.7.9.5
+   - *Debian/Ubuntu:* `sudo apt-get install git`
+   - *Arch Linux:* `sudo pacman -S git`
+
+
+### Examples
+This is an example of how to compile `mallocMC` and test the example code snippets:
+
+1. **Setup directories:**
+   - `mkdir -p build`
+2. **Download the source code:**
+   - `git clone https://github.com/alpaka-group/mallocMC.git`
+3. **Build**
+   - `cd build`
+   - `cmake ../mallocMC -DCMAKE_INSTALL_PREFIX=$HOME/libs`
+   - `make examples`
+   - `make install` (optional)
+4. **Run the examples**
+   - `./mallocMC_Example01`
+   - `./mallocMC_Example02`
+   - `./VerifyHeap`
+     - additional options: see `./VerifyHeap --help`
+
+
+Linking to your Project
+-----------------------
+
+To use mallocMC in your project, you must include the header `mallocMC/mallocMC.hpp` and
+add the correct include path.
+
+Because we are linking to Boost and CUDA, the following **external dependencies** must be linked:
+- `-lboost`
+
+If you are using CMake you can download our `FindmallocMC.cmake` module with
+```bash
+wget https://raw.githubusercontent.com/ComputationalRadiationPhysics/cmake-modules/dev/FindmallocMC.cmake
+# read the documentation
+cmake -DCMAKE_MODULE_PATH=. --help-module FindmallocMC | less
+```
+
+and use the following lines in your `CMakeLists.txt`:
+```cmake
+# this example will require at least CMake 3.15
+CMAKE_MINIMUM_REQUIRED(VERSION 3.15)
+
+# add path to FindmallocMC.cmake, e.g., in the cmake/ directory
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/)
+
+# find mallocMC installation
+find_package(mallocMC 2.6.0 REQUIRED)
+
+alpaka_add_executable(yourBinary ${SOURCES})
+target_include_directories(yourBinary PUBLIC ${mallocMC_INCLUDE_DIRS})
+target_link_libraries(yourBinary PUBLIC alpaka::alpaka)
+```
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000..7c7870ae48
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,40 @@
+/*
+  mallocMC: Memory Allocation for Many Core Architectures
+
+  based on the work of ScatterAlloc:
+  Massively Parallel Dynamic Memory Allocation for the GPU
+
+  http://www.icg.tugraz.at/project/mvp
+  https://www.hzdr.de/crp
+
+  Copyright (C) 2012 Institute for Computer Graphics and Vision,
+                     Graz University of Technology
+  Copyright (C) 2014-2024 Institute of Radiation Physics,
+                          Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
+              Bernhard Kainz - kainz ( at ) icg.tugraz.at
+              Michael Kenzel - kenzel ( at ) icg.tugraz.at
+              Rene Widera - r.widera ( at ) hzdr.de
+              Axel Huebl - a.huebl ( at ) hzdr.de
+              Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..b99fa52e2d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,89 @@
+mallocMC
+=============
+
+mallocMC: *Memory Allocator for Many Core Architectures*
+
+This project provides a framework for **fast memory managers** on **many core
+accelerators**. It is based on [alpaka](https://github.com/alpaka-group/alpaka)
+to run on many different accelerators and comes with multiple allocation
+algorithms out-of-the-box. Custom ones can be added easily due to the
+policy-based design.
+
+Usage
+-------
+
+Follow the step-by-step instructions in [Usage.md](Usage.md) to replace your
+`new`/`malloc` calls with a *blazingly fast* mallocMC heap! :rocket:
+
+Install
+-------
+
+mallocMC is header-only, but requires a few other C++ libraries to be
+available. Our installation notes can be found in [INSTALL.md](INSTALL.md).
+
+Contributing
+------------
+
+Rules for contributions are found in [CONTRIBUTING.md](./CONTRIBUTING.md).
+
+On the Algorithms
+-----------------------------
+
+This library was originally inspired by the *ScatterAlloc* algorithm,
+[forked](https://en.wikipedia.org/wiki/Fork_%28software_development%29)
+from the **ScatterAlloc** project, developed by the
+[Managed Volume Processing](http://www.icg.tugraz.at/project/mvp)
+group at [Institute for Computer Graphics and Vision](http://www.icg.tugraz.at),
+TU Graz (kudos!). The currently shipped algorithms use similar ideas but
+differ significantly from the original one.
+
+From the original project page (which, to the best of our knowledge, no
+longer exists):
+
+```quote
+ScatterAlloc is a dynamic memory allocator for the GPU. It is
+designed concerning the requirements of massively parallel
+execution.
+
+ScatterAlloc greatly reduces collisions and congestion by
+scattering memory requests based on hashing. It can deal with
+thousands of GPU-threads concurrently allocating memory and its
+execution time is almost independent of the thread count.
+
+ScatterAlloc is open source and easy to use in your CUDA projects.
+```
+
+Our Homepage:
+
+Versions and Releases
+---------------------
+
+Official releases can be found in the
+[GitHub releases](https://github.com/alpaka-group/mallocMC/releases).
+We try to stick to [semantic versioning](https://semver.org/) but we'll bump
+the major version number for major features.
+Development happens on the `dev` branch.
+Changes there have passed the CI and a code review but we make no guarantees
+about API or feature stability in this branch.
+
+Literature
+----------
+
+Just an incomplete link collection for now:
+
+- [Paper](https://doi.org/10.1109/InPar.2012.6339604) by
+  Markus Steinberger, Michael Kenzel, Bernhard Kainz and Dieter Schmalstieg
+
+- 2012, May 5th: [Presentation](http://innovativeparallel.org/Presentations/inPar_kainz.pdf)
+  at *Innovative Parallel Computing 2012* by *Bernhard Kainz*
+
+- Junior Thesis [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.34461.svg)](http://dx.doi.org/10.5281/zenodo.34461) by
+  Carlchristian Eckert (2014)
+
+License
+-------
+
+We distribute the modified software under the same license as the
+original software from TU Graz (by using the
+[MIT License](https://en.wikipedia.org/wiki/MIT_License)).
+Please refer to the [LICENSE](LICENSE) file.
diff --git a/Usage.md b/Usage.md
new file mode 100644
index 0000000000..45963ee032
--- /dev/null
+++ b/Usage.md
@@ -0,0 +1,162 @@
+Usage
+=====
+
+Step 1: include
+---------------
+
+There is one header file that will include *all* necessary files:
+
+```c++
+#include <mallocMC/mallocMC.hpp>
+```
+
+Step 2a: choose policies
+-----------------------
+
+Each instance of a policy-based allocator is composed through 5 **policies**.
+Each policy is expressed as a **policy class**.
+
+Currently, there are the following policy classes available:
+
+|Policy | Policy Classes (implementations) | description |
+|------- |----------------------------------| ----------- |
+|**CreationPolicy** | Scatter`<conf1, conf2>` | A scattered allocation that trades off fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters|
+| | FlatterScatter`<conf1, conf2>` | Another scattered allocation algorithm similar in spirit to `Scatter` but with a flatter hierarchy and stronger concurrency invariants. `conf1` and `conf2` act as before.
+| | OldMalloc | Device-side malloc/new and free/delete syscalls as implemented on the given device.
+|**DistributionPolicy** | XMallocSIMD`<conf>` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
+| | Noop | no workload distribution at all |
+|**OOMPolicy** | ReturnNull | pointers will be *nullptr* if the request could not be fulfilled |
+| | ~~BadAllocException~~ | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions |
+|**ReservePoolPolicy** | AlpakaBuf | Allocate a fixed-size buffer in an `alpaka`-provided container. |
+| | CudaSetLimits | call to `CudaSetLimits` to increase the available heap (e.g. when using *OldMalloc*) |
+|**AlignmentPolicy** | Shrink`<conf>` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
+| | Noop | no alignment at all |
+
+The user has to choose one of each policy that will form a useful allocator
+(see [here](Usage.md#2c-combine-policies)).
+
+Step 2b: configure policies
+---------------------------
+
+Some of those policies are templates that can be configured through a
+configuration struct. The default struct can be accessed through
+```PolicyNamespace::PolicyClass<>::Properties```, which allows inheriting
+from it to modify some of its parameters before passing it
+to the policy class:
+
+```c++
+// configure the AlignmentPolicy "Shrink"
+struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties {
+    static constexpr auto dataAlignment = 16;
+};
+```
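+
+In the same spirit, the heap layout of the CreationPolicy *Scatter* could be
+adjusted. The following is only a sketch: it assumes that `Scatter` follows the
+general `PolicyClass<>::Properties` pattern from above and that its configuration
+exposes a member named `pagesize` (a name the test mocks use); consult the
+shipped configuration structs for the authoritative member list:
+
+```c++
+// configure the CreationPolicy "Scatter" (member name `pagesize` assumed)
+struct MyHeapConfig : mallocMC::CreationPolicies::Scatter<>::Properties {
+    static constexpr auto pagesize = 2U * 1024U * 1024U;
+};
+```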
+
+Step 2c: combine policies
+-------------------------
+
+After configuring the chosen policies, they can be used as template
+parameters to create the desired allocator type:
+
+```c++
+using namespace mallocMC;
+
+using Allocator1 = mallocMC::Allocator<
+    CreationPolicies::OldMalloc,
+    DistributionPolicies::Noop,
+    OOMPolicies::ReturnNull,
+    ReservePoolPolicies::CudaSetLimits,
+    AlignmentPolicies::Noop
+>;
+```
+
+`Allocator1` will resemble the behaviour of classical device-side allocation known
+from NVIDIA CUDA since compute capability sm_20. For a more modern allocator, one
+could create the following alias instead:
+
+```c++
+using namespace mallocMC;
+
+using ScatterAllocator = mallocMC::Allocator<
+    CreationPolicies::Scatter<>,
+    DistributionPolicies::XMallocSIMD<>,
+    OOMPolicies::ReturnNull,
+    ReservePoolPolicies::AlpakaBuf,
+    AlignmentPolicies::Shrink<ShrinkConfig>
+>;
+```
+
+Notice how the policy classes `Scatter` and `XMallocSIMD` are instantiated without
+template arguments to use the default configuration. `Shrink`, however, uses the
+configuration struct defined above.
+
+Step 3: instantiate allocator
+-----------------------------
+
+To use the defined allocator type, create an instance with the desired heap size:
+
+```c++
+ScatterAllocator sa( 512U * 1024U * 1024U ); // heap size of 512MiB
+```
+
+The allocator object offers the following methods:
+
+| Name | description |
+|---------------------- |-------------------------|
+| getAllocatorHandle() | Acquire a handle from the allocator that can be used in kernels to allocate memory on device. |
+| getAvailableSlots(size_t) | Determines the number of allocatable slots of a certain size. This only works if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits::providesAvailableSlots`) |
+
+One should note that, on a running system with multiple threads manipulating
+memory, the information provided by `getAvailableSlots` is stale the moment it
+is acquired, so relying on it to be accurate is not recommended. It is supposed
+to be used in initialisation/finalisation phases without dynamic memory
+allocations or in tests.
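+
+For illustration, such an initialisation-phase query could look like this (a
+minimal sketch, reusing the `ScatterAllocator` alias from above; the chunk size
+of 1024 bytes is just an example value):
+
+```c++
+ScatterAllocator sa( 512U * 1024U * 1024U ); // heap size of 512MiB
+
+// Freshly constructed and no kernels are running yet, so the staleness
+// caveat from above does not apply and the answer is exact.
+auto const freeSlots = sa.getAvailableSlots( 1024U );
+```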
+
+Step 4: use dynamic memory allocation in a kernel
+-------------------------------------------------
+
+A handle to the allocator object must be passed to each kernel. The handle type is defined as an internal type of the allocator. Inside the kernel, this handle can be used to request memory.
+
+The handle offers the following methods:
+
+| Name | description |
+|---------------------- |-------------------------|
+| malloc(size_t) | Allocates memory on the accelerator |
+| free(void*) | Frees memory on the accelerator |
+| getAvailableSlots(size_t) | Determines the number of allocatable slots of a certain size. This only works if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits::providesAvailableSlots`).|
+
+The comments on `getAvailableSlots` from above apply here as well.
+A simplistic example would look like this:
+
+```c++
+#include <mallocMC/mallocMC.hpp>
+
+namespace MC = mallocMC;
+
+using ScatterAllocator = MC::Allocator<
+    MC::CreationPolicies::Scatter<>,
+    MC::DistributionPolicies::XMallocSIMD<>,
+    MC::OOMPolicies::ReturnNull,
+    MC::ReservePoolPolicies::AlpakaBuf,
+    MC::AlignmentPolicies::Shrink<>
+>;
+
+__global__ void exampleKernel(ScatterAllocator::AllocatorHandle sah)
+{
+    // some code ...
+
+    int* a = (int*) sah.malloc(sizeof(int) * 42);
+
+    // some more code, using *a
+
+    sah.free(a);
+}
+
+int main()
+{
+    ScatterAllocator sa( 1U * 512U * 1024U * 1024U ); // heap size of 512MiB
+    exampleKernel<<< 32, 32 >>>(sa.getAllocatorHandle());
+
+    return 0;
+}
+```
+
+For more usage examples, have a look at the [examples](examples).
diff --git a/cmake/CPM_0.40.2.cmake b/cmake/CPM_0.40.2.cmake
new file mode 100644
index 0000000000..51a07a57f5
--- /dev/null
+++ b/cmake/CPM_0.40.2.cmake
@@ -0,0 +1,1280 @@
+# CPM.cmake - CMake's missing package manager
+# ===========================================
+# See https://github.com/cpm-cmake/CPM.cmake for usage and update instructions.
+#
+# MIT License
+# -----------
+#[[
+  Copyright (c) 2019-2023 Lars Melchior and contributors
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in all
+  copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+]] + +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +# Initialize logging prefix +if(NOT CPM_INDENT) + set(CPM_INDENT + "CPM:" + CACHE INTERNAL "" + ) +endif() + +if(NOT COMMAND cpm_message) + function(cpm_message) + message(${ARGV}) + endfunction() +endif() + +set(CURRENT_CPM_VERSION 0.40.2) + +get_filename_component(CPM_CURRENT_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}" REALPATH) +if(CPM_DIRECTORY) + if(NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY) + if(CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION) + message( + AUTHOR_WARNING + "${CPM_INDENT} \ +A dependency is using a more recent CPM version (${CURRENT_CPM_VERSION}) than the current project (${CPM_VERSION}). \ +It is recommended to upgrade CPM to the most recent version. \ +See https://github.com/cpm-cmake/CPM.cmake for more information." + ) + endif() + if(${CMAKE_VERSION} VERSION_LESS "3.17.0") + include(FetchContent) + endif() + return() + endif() + + get_property( + CPM_INITIALIZED GLOBAL "" + PROPERTY CPM_INITIALIZED + SET + ) + if(CPM_INITIALIZED) + return() + endif() +endif() + +if(CURRENT_CPM_VERSION MATCHES "development-version") + message( + WARNING "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \ +Please update to a recent release if possible. \ +See https://github.com/cpm-cmake/CPM.cmake for details." + ) +endif() + +set_property(GLOBAL PROPERTY CPM_INITIALIZED true) + +macro(cpm_set_policies) + # the policy allows us to change options without caching + cmake_policy(SET CMP0077 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + + # the policy allows us to change set(CACHE) without caching + if(POLICY CMP0126) + cmake_policy(SET CMP0126 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0126 NEW) + endif() + + # The policy uses the download time for timestamp, instead of the timestamp in the archive. This + # allows for proper rebuilds when a projects url changes + if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0135 NEW) + endif() + + # treat relative git repository paths as being relative to the parent project's remote + if(POLICY CMP0150) + cmake_policy(SET CMP0150 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0150 NEW) + endif() +endmacro() +cpm_set_policies() + +option(CPM_USE_LOCAL_PACKAGES "Always try to use `find_package` to get dependencies" + $ENV{CPM_USE_LOCAL_PACKAGES} +) +option(CPM_LOCAL_PACKAGES_ONLY "Only use `find_package` to get dependencies" + $ENV{CPM_LOCAL_PACKAGES_ONLY} +) +option(CPM_DOWNLOAD_ALL "Always download dependencies from source" $ENV{CPM_DOWNLOAD_ALL}) +option(CPM_DONT_UPDATE_MODULE_PATH "Don't update the module path to allow using find_package" + $ENV{CPM_DONT_UPDATE_MODULE_PATH} +) +option(CPM_DONT_CREATE_PACKAGE_LOCK "Don't create a package lock file in the binary path" + $ENV{CPM_DONT_CREATE_PACKAGE_LOCK} +) +option(CPM_INCLUDE_ALL_IN_PACKAGE_LOCK + "Add all packages added through CPM.cmake to the package lock" + $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK} +) +option(CPM_USE_NAMED_CACHE_DIRECTORIES + "Use additional directory of package name in cache on the most nested level." 
+ $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES} +) + +set(CPM_VERSION + ${CURRENT_CPM_VERSION} + CACHE INTERNAL "" +) +set(CPM_DIRECTORY + ${CPM_CURRENT_DIRECTORY} + CACHE INTERNAL "" +) +set(CPM_FILE + ${CMAKE_CURRENT_LIST_FILE} + CACHE INTERNAL "" +) +set(CPM_PACKAGES + "" + CACHE INTERNAL "" +) +set(CPM_DRY_RUN + OFF + CACHE INTERNAL "Don't download or configure dependencies (for testing)" +) + +if(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_SOURCE_CACHE_DEFAULT $ENV{CPM_SOURCE_CACHE}) +else() + set(CPM_SOURCE_CACHE_DEFAULT OFF) +endif() + +set(CPM_SOURCE_CACHE + ${CPM_SOURCE_CACHE_DEFAULT} + CACHE PATH "Directory to download CPM dependencies" +) + +if(NOT CPM_DONT_UPDATE_MODULE_PATH) + set(CPM_MODULE_PATH + "${CMAKE_BINARY_DIR}/CPM_modules" + CACHE INTERNAL "" + ) + # remove old modules + file(REMOVE_RECURSE ${CPM_MODULE_PATH}) + file(MAKE_DIRECTORY ${CPM_MODULE_PATH}) + # locally added CPM modules should override global packages + set(CMAKE_MODULE_PATH "${CPM_MODULE_PATH};${CMAKE_MODULE_PATH}") +endif() + +if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + set(CPM_PACKAGE_LOCK_FILE + "${CMAKE_BINARY_DIR}/cpm-package-lock.cmake" + CACHE INTERNAL "" + ) + file(WRITE ${CPM_PACKAGE_LOCK_FILE} + "# CPM Package Lock\n# This file should be committed to version control\n\n" + ) +endif() + +include(FetchContent) + +# Try to infer package name from git repository uri (path or url) +function(cpm_package_name_from_git_uri URI RESULT) + if("${URI}" MATCHES "([^/:]+)/?.git/?$") + set(${RESULT} + ${CMAKE_MATCH_1} + PARENT_SCOPE + ) + else() + unset(${RESULT} PARENT_SCOPE) + endif() +endfunction() + +# Try to infer package name and version from a url +function(cpm_package_name_and_ver_from_url url outName outVer) + if(url MATCHES "[/\\?]([a-zA-Z0-9_\\.-]+)\\.(tar|tar\\.gz|tar\\.bz2|zip|ZIP)(\\?|/|$)") + # We matched an archive + set(filename "${CMAKE_MATCH_1}") + + if(filename MATCHES "([a-zA-Z0-9_\\.-]+)[_-]v?(([0-9]+\\.)*[0-9]+[a-zA-Z0-9]*)") + # We matched - (ie foo-1.2.3) + set(${outName} + "${CMAKE_MATCH_1}" + PARENT_SCOPE + ) + set(${outVer} + "${CMAKE_MATCH_2}" + PARENT_SCOPE + ) + elseif(filename MATCHES "(([0-9]+\\.)+[0-9]+[a-zA-Z0-9]*)") + # We couldn't find a name, but we found a version + # + # In many cases (which we don't handle here) the url would look something like + # `irrelevant/ACTUAL_PACKAGE_NAME/irrelevant/1.2.3.zip`. In such a case we can't possibly + # distinguish the package name from the irrelevant bits. Moreover if we try to match the + # package name from the filename, we'd get bogus at best. + unset(${outName} PARENT_SCOPE) + set(${outVer} + "${CMAKE_MATCH_1}" + PARENT_SCOPE + ) + else() + # Boldly assume that the file name is the package name. + # + # Yes, something like `irrelevant/ACTUAL_NAME/irrelevant/download.zip` will ruin our day, but + # such cases should be quite rare. No popular service does this... we think. 
+ set(${outName} + "${filename}" + PARENT_SCOPE + ) + unset(${outVer} PARENT_SCOPE) + endif() + else() + # No ideas yet what to do with non-archives + unset(${outName} PARENT_SCOPE) + unset(${outVer} PARENT_SCOPE) + endif() +endfunction() + +function(cpm_find_package NAME VERSION) + string(REPLACE " " ";" EXTRA_ARGS "${ARGN}") + find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET) + if(${CPM_ARGS_NAME}_FOUND) + if(DEFINED ${CPM_ARGS_NAME}_VERSION) + set(VERSION ${${CPM_ARGS_NAME}_VERSION}) + endif() + cpm_message(STATUS "${CPM_INDENT} Using local package ${CPM_ARGS_NAME}@${VERSION}") + CPMRegisterPackage(${CPM_ARGS_NAME} "${VERSION}") + set(CPM_PACKAGE_FOUND + YES + PARENT_SCOPE + ) + else() + set(CPM_PACKAGE_FOUND + NO + PARENT_SCOPE + ) + endif() +endfunction() + +# Create a custom FindXXX.cmake module for a CPM package This prevents `find_package(NAME)` from +# finding the system library +function(cpm_create_module_file Name) + if(NOT CPM_DONT_UPDATE_MODULE_PATH) + # erase any previous modules + file(WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake + "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)" + ) + endif() +endfunction() + +# Find a package locally or fallback to CPMAddPackage +function(CPMFindPackage) + set(oneValueArgs NAME VERSION GIT_TAG FIND_PACKAGE_ARGUMENTS) + + cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "" ${ARGN}) + + if(NOT DEFINED CPM_ARGS_VERSION) + if(DEFINED CPM_ARGS_GIT_TAG) + cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) + endif() + endif() + + set(downloadPackage ${CPM_DOWNLOAD_ALL}) + if(DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME}) + set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + elseif(DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + endif() + if(downloadPackage) + CPMAddPackage(${ARGN}) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) + + if(NOT CPM_PACKAGE_FOUND) + CPMAddPackage(${ARGN}) + cpm_export_variables(${CPM_ARGS_NAME}) + endif() + +endfunction() + +# checks if a package has been added before +function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION) + if("${CPM_ARGS_NAME}" IN_LIST CPM_PACKAGES) + CPMGetPackageVersion(${CPM_ARGS_NAME} CPM_PACKAGE_VERSION) + if("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}") + message( + WARNING + "${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})." + ) + endif() + cpm_get_fetch_properties(${CPM_ARGS_NAME}) + set(${CPM_ARGS_NAME}_ADDED NO) + set(CPM_PACKAGE_ALREADY_ADDED + YES + PARENT_SCOPE + ) + cpm_export_variables(${CPM_ARGS_NAME}) + else() + set(CPM_PACKAGE_ALREADY_ADDED + NO + PARENT_SCOPE + ) + endif() +endfunction() + +# Parse the argument of CPMAddPackage in case a single one was provided and convert it to a list of +# arguments which can then be parsed idiomatically. 
For example gh:foo/bar@1.2.3 will be converted +# to: GITHUB_REPOSITORY;foo/bar;VERSION;1.2.3 +function(cpm_parse_add_package_single_arg arg outArgs) + # Look for a scheme + if("${arg}" MATCHES "^([a-zA-Z]+):(.+)$") + string(TOLOWER "${CMAKE_MATCH_1}" scheme) + set(uri "${CMAKE_MATCH_2}") + + # Check for CPM-specific schemes + if(scheme STREQUAL "gh") + set(out "GITHUB_REPOSITORY;${uri}") + set(packageType "git") + elseif(scheme STREQUAL "gl") + set(out "GITLAB_REPOSITORY;${uri}") + set(packageType "git") + elseif(scheme STREQUAL "bb") + set(out "BITBUCKET_REPOSITORY;${uri}") + set(packageType "git") + # A CPM-specific scheme was not found. Looks like this is a generic URL so try to determine + # type + elseif(arg MATCHES ".git/?(@|#|$)") + set(out "GIT_REPOSITORY;${arg}") + set(packageType "git") + else() + # Fall back to a URL + set(out "URL;${arg}") + set(packageType "archive") + + # We could also check for SVN since FetchContent supports it, but SVN is so rare these days. + # We just won't bother with the additional complexity it will induce in this function. SVN is + # done by multi-arg + endif() + else() + if(arg MATCHES ".git/?(@|#|$)") + set(out "GIT_REPOSITORY;${arg}") + set(packageType "git") + else() + # Give up + message(FATAL_ERROR "${CPM_INDENT} Can't determine package type of '${arg}'") + endif() + endif() + + # For all packages we interpret @... as version. Only replace the last occurrence. Thus URIs + # containing '@' can be used + string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}") + + # Parse the rest according to package type + if(packageType STREQUAL "git") + # For git repos we interpret #... as a tag or branch or commit hash + string(REGEX REPLACE "#([^#]+)$" ";GIT_TAG;\\1" out "${out}") + elseif(packageType STREQUAL "archive") + # For archives we interpret #... as a URL hash. + string(REGEX REPLACE "#([^#]+)$" ";URL_HASH;\\1" out "${out}") + # We don't try to parse the version if it's not provided explicitly. cpm_get_version_from_url + # should do this at a later point + else() + # We should never get here. This is an assertion and hitting it means there's a problem with the + # code above. A packageType was set, but not handled by this if-else. 
+ message(FATAL_ERROR "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'") + endif() + + set(${outArgs} + ${out} + PARENT_SCOPE + ) +endfunction() + +# Check that the working directory for a git repo is clean +function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean) + + find_package(Git REQUIRED) + + if(NOT GIT_EXECUTABLE) + # No git executable, assume directory is clean + set(${isClean} + TRUE + PARENT_SCOPE + ) + return() + endif() + + # check for uncommitted changes + execute_process( + COMMAND ${GIT_EXECUTABLE} status --porcelain + RESULT_VARIABLE resultGitStatus + OUTPUT_VARIABLE repoStatus + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET + WORKING_DIRECTORY ${repoPath} + ) + if(resultGitStatus) + # not supposed to happen, assume clean anyway + message(WARNING "${CPM_INDENT} Calling git status on folder ${repoPath} failed") + set(${isClean} + TRUE + PARENT_SCOPE + ) + return() + endif() + + if(NOT "${repoStatus}" STREQUAL "") + set(${isClean} + FALSE + PARENT_SCOPE + ) + return() + endif() + + # check for committed changes + execute_process( + COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag} + RESULT_VARIABLE resultGitDiff + OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_QUIET + WORKING_DIRECTORY ${repoPath} + ) + + if(${resultGitDiff} EQUAL 0) + set(${isClean} + TRUE + PARENT_SCOPE + ) + else() + set(${isClean} + FALSE + PARENT_SCOPE + ) + endif() + +endfunction() + +# Add PATCH_COMMAND to CPM_ARGS_UNPARSED_ARGUMENTS. This method consumes a list of files in ARGN +# then generates a `PATCH_COMMAND` appropriate for `ExternalProject_Add()`. This command is appended +# to the parent scope's `CPM_ARGS_UNPARSED_ARGUMENTS`. +function(cpm_add_patches) + # Return if no patch files are supplied. + if(NOT ARGN) + return() + endif() + + # Find the patch program. + find_program(PATCH_EXECUTABLE patch) + if(WIN32 AND NOT PATCH_EXECUTABLE) + # The Windows git executable is distributed with patch.exe. Find the path to the executable, if + # it exists, then search `../usr/bin` and `../../usr/bin` for patch.exe. + find_package(Git QUIET) + if(GIT_EXECUTABLE) + get_filename_component(extra_search_path ${GIT_EXECUTABLE} DIRECTORY) + get_filename_component(extra_search_path_1up ${extra_search_path} DIRECTORY) + get_filename_component(extra_search_path_2up ${extra_search_path_1up} DIRECTORY) + find_program( + PATCH_EXECUTABLE patch HINTS "${extra_search_path_1up}/usr/bin" + "${extra_search_path_2up}/usr/bin" + ) + endif() + endif() + if(NOT PATCH_EXECUTABLE) + message(FATAL_ERROR "Couldn't find `patch` executable to use with PATCHES keyword.") + endif() + + # Create a temporary + set(temp_list ${CPM_ARGS_UNPARSED_ARGUMENTS}) + + # Ensure each file exists (or error out) and add it to the list. + set(first_item True) + foreach(PATCH_FILE ${ARGN}) + # Make sure the patch file exists, if we can't find it, try again in the current directory. + if(NOT EXISTS "${PATCH_FILE}") + if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}") + message(FATAL_ERROR "Couldn't find patch file: '${PATCH_FILE}'") + endif() + set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}") + endif() + + # Convert to absolute path for use with patch file command. + get_filename_component(PATCH_FILE "${PATCH_FILE}" ABSOLUTE) + + # The first patch entry must be preceded by "PATCH_COMMAND" while the following items are + # preceded by "&&". 
+ if(first_item) + set(first_item False) + list(APPEND temp_list "PATCH_COMMAND") + else() + list(APPEND temp_list "&&") + endif() + # Add the patch command to the list + list(APPEND temp_list "${PATCH_EXECUTABLE}" "-p1" "<" "${PATCH_FILE}") + endforeach() + + # Move temp out into parent scope. + set(CPM_ARGS_UNPARSED_ARGUMENTS + ${temp_list} + PARENT_SCOPE + ) + +endfunction() + +# method to overwrite internal FetchContent properties, to allow using CPM.cmake to overload +# FetchContent calls. As these are internal cmake properties, this method should be used carefully +# and may need modification in future CMake versions. Source: +# https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake #L1152 +function(cpm_override_fetchcontent contentName) + cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "") + if(NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "") + message(FATAL_ERROR "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}") + endif() + + string(TOLOWER ${contentName} contentNameLower) + set(prefix "_FetchContent_${contentNameLower}") + + set(propertyName "${prefix}_sourceDir") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} "${arg_SOURCE_DIR}") + + set(propertyName "${prefix}_binaryDir") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} "${arg_BINARY_DIR}") + + set(propertyName "${prefix}_populated") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} TRUE) +endfunction() + +# Download and add a package from source +function(CPMAddPackage) + cpm_set_policies() + + list(LENGTH ARGN argnLength) + if(argnLength EQUAL 1) + cpm_parse_add_package_single_arg("${ARGN}" ARGN) + + # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM + set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;") + endif() + + set(oneValueArgs + NAME + FORCE + VERSION + GIT_TAG + DOWNLOAD_ONLY + GITHUB_REPOSITORY + GITLAB_REPOSITORY + BITBUCKET_REPOSITORY + GIT_REPOSITORY + SOURCE_DIR + FIND_PACKAGE_ARGUMENTS + NO_CACHE + SYSTEM + GIT_SHALLOW + EXCLUDE_FROM_ALL + SOURCE_SUBDIR + CUSTOM_CACHE_KEY + ) + + set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND PATCHES) + + cmake_parse_arguments(CPM_ARGS "REQUIRED" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") + + # Set default values for arguments + + if(NOT DEFINED CPM_ARGS_VERSION) + if(DEFINED CPM_ARGS_GIT_TAG) + cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) + endif() + endif() + + if(CPM_ARGS_DOWNLOAD_ONLY) + set(DOWNLOAD_ONLY ${CPM_ARGS_DOWNLOAD_ONLY}) + else() + set(DOWNLOAD_ONLY NO) + endif() + + if(DEFINED CPM_ARGS_GITHUB_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://github.com/${CPM_ARGS_GITHUB_REPOSITORY}.git") + elseif(DEFINED CPM_ARGS_GITLAB_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://gitlab.com/${CPM_ARGS_GITLAB_REPOSITORY}.git") + elseif(DEFINED CPM_ARGS_BITBUCKET_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY 
"https://bitbucket.org/${CPM_ARGS_BITBUCKET_REPOSITORY}.git") + endif() + + if(DEFINED CPM_ARGS_GIT_REPOSITORY) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_REPOSITORY ${CPM_ARGS_GIT_REPOSITORY}) + if(NOT DEFINED CPM_ARGS_GIT_TAG) + set(CPM_ARGS_GIT_TAG v${CPM_ARGS_VERSION}) + endif() + + # If a name wasn't provided, try to infer it from the git repo + if(NOT DEFINED CPM_ARGS_NAME) + cpm_package_name_from_git_uri(${CPM_ARGS_GIT_REPOSITORY} CPM_ARGS_NAME) + endif() + endif() + + set(CPM_SKIP_FETCH FALSE) + + if(DEFINED CPM_ARGS_GIT_TAG) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_TAG ${CPM_ARGS_GIT_TAG}) + # If GIT_SHALLOW is explicitly specified, honor the value. + if(DEFINED CPM_ARGS_GIT_SHALLOW) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW ${CPM_ARGS_GIT_SHALLOW}) + endif() + endif() + + if(DEFINED CPM_ARGS_URL) + # If a name or version aren't provided, try to infer them from the URL + list(GET CPM_ARGS_URL 0 firstUrl) + cpm_package_name_and_ver_from_url(${firstUrl} nameFromUrl verFromUrl) + # If we fail to obtain name and version from the first URL, we could try other URLs if any. + # However multiple URLs are expected to be quite rare, so for now we won't bother. + + # If the caller provided their own name and version, they trump the inferred ones. + if(NOT DEFINED CPM_ARGS_NAME) + set(CPM_ARGS_NAME ${nameFromUrl}) + endif() + if(NOT DEFINED CPM_ARGS_VERSION) + set(CPM_ARGS_VERSION ${verFromUrl}) + endif() + + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS URL "${CPM_ARGS_URL}") + endif() + + # Check for required arguments + + if(NOT DEFINED CPM_ARGS_NAME) + message( + FATAL_ERROR + "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'" + ) + endif() + + # Check if package has been added before + cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") + if(CPM_PACKAGE_ALREADY_ADDED) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + # Check for manual overrides + if(NOT CPM_ARGS_FORCE AND NOT "${CPM_${CPM_ARGS_NAME}_SOURCE}" STREQUAL "") + set(PACKAGE_SOURCE ${CPM_${CPM_ARGS_NAME}_SOURCE}) + set(CPM_${CPM_ARGS_NAME}_SOURCE "") + CPMAddPackage( + NAME "${CPM_ARGS_NAME}" + SOURCE_DIR "${PACKAGE_SOURCE}" + EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}" + SYSTEM "${CPM_ARGS_SYSTEM}" + PATCHES "${CPM_ARGS_PATCHES}" + OPTIONS "${CPM_ARGS_OPTIONS}" + SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}" + DOWNLOAD_ONLY "${DOWNLOAD_ONLY}" + FORCE True + ) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + + # Check for available declaration + if(NOT CPM_ARGS_FORCE AND NOT "${CPM_DECLARATION_${CPM_ARGS_NAME}}" STREQUAL "") + set(declaration ${CPM_DECLARATION_${CPM_ARGS_NAME}}) + set(CPM_DECLARATION_${CPM_ARGS_NAME} "") + if (CPM_ARGS_REQUIRED) + CPMAddPackage(${declaration} REQUIRED ${CPM_ARGS_REQUIRED}) + else() + CPMAddPackage(${declaration}) + endif() + cpm_export_variables(${CPM_ARGS_NAME}) + # checking again to ensure version and option compatibility + cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") + return() + endif() + + if(NOT CPM_ARGS_FORCE) + if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY) + cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) + + if(CPM_PACKAGE_FOUND) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + if(CPM_LOCAL_PACKAGES_ONLY) + if (NOT CPM_ARGS_REQUIRED) + message( + "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})" + ) + 
return() + endif() + message( + SEND_ERROR + "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})" + ) + endif() + endif() + endif() + + CPMRegisterPackage("${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}") + + if(DEFINED CPM_ARGS_GIT_TAG) + set(PACKAGE_INFO "${CPM_ARGS_GIT_TAG}") + elseif(DEFINED CPM_ARGS_SOURCE_DIR) + set(PACKAGE_INFO "${CPM_ARGS_SOURCE_DIR}") + else() + set(PACKAGE_INFO "${CPM_ARGS_VERSION}") + endif() + + if(DEFINED FETCHCONTENT_BASE_DIR) + # respect user's FETCHCONTENT_BASE_DIR if set + set(CPM_FETCHCONTENT_BASE_DIR ${FETCHCONTENT_BASE_DIR}) + else() + set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps) + endif() + + cpm_add_patches(${CPM_ARGS_PATCHES}) + + if(DEFINED CPM_ARGS_DOWNLOAD_COMMAND) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS DOWNLOAD_COMMAND ${CPM_ARGS_DOWNLOAD_COMMAND}) + elseif(DEFINED CPM_ARGS_SOURCE_DIR) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR}) + if(NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR}) + # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work + # for relative paths. + get_filename_component( + source_directory ${CPM_ARGS_SOURCE_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR} + ) + else() + set(source_directory ${CPM_ARGS_SOURCE_DIR}) + endif() + if(NOT EXISTS ${source_directory}) + string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) + # remove timestamps so CMake will re-download the dependency + file(REMOVE_RECURSE "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild") + endif() + elseif(CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE) + string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) + set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS}) + list(SORT origin_parameters) + if(CPM_ARGS_CUSTOM_CACHE_KEY) + # Application set a custom unique directory name + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${CPM_ARGS_CUSTOM_CACHE_KEY}) + elseif(CPM_USE_NAMED_CACHE_DIRECTORIES) + string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG") + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME}) + else() + string(SHA1 origin_hash "${origin_parameters}") + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}) + endif() + # Expand `download_directory` relative path. This is important because EXISTS doesn't work for + # relative paths. 
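+    # For illustration, a hypothetical package "Foo" fetched from git would thus be cached under
+    #   ${CPM_SOURCE_CACHE}/foo/<SHA1 of its sorted arguments>
+    # so any change to its declaration results in a fresh download directory.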
+ get_filename_component(download_directory ${download_directory} ABSOLUTE) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory}) + + if(CPM_SOURCE_CACHE) + file(LOCK ${download_directory}/../cmake.lock) + endif() + + if(EXISTS ${download_directory}) + if(CPM_SOURCE_CACHE) + file(LOCK ${download_directory}/../cmake.lock RELEASE) + endif() + + cpm_store_fetch_properties( + ${CPM_ARGS_NAME} "${download_directory}" + "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build" + ) + cpm_get_fetch_properties("${CPM_ARGS_NAME}") + + if(DEFINED CPM_ARGS_GIT_TAG AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS)) + # warn if cache has been changed since checkout + cpm_check_git_working_dir_is_clean(${download_directory} ${CPM_ARGS_GIT_TAG} IS_CLEAN) + if(NOT ${IS_CLEAN}) + message( + WARNING "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty" + ) + endif() + endif() + + cpm_add_subdirectory( + "${CPM_ARGS_NAME}" + "${DOWNLOAD_ONLY}" + "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + "${${CPM_ARGS_NAME}_BINARY_DIR}" + "${CPM_ARGS_EXCLUDE_FROM_ALL}" + "${CPM_ARGS_SYSTEM}" + "${CPM_ARGS_OPTIONS}" + ) + set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}") + + # As the source dir is already cached/populated, we override the call to FetchContent. + set(CPM_SKIP_FETCH TRUE) + cpm_override_fetchcontent( + "${lower_case_name}" SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}" + ) + + else() + # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but + # it should guarantee no commit hash get mis-detected. + if(NOT DEFINED CPM_ARGS_GIT_SHALLOW) + cpm_is_git_tag_commit_hash("${CPM_ARGS_GIT_TAG}" IS_HASH) + if(NOT ${IS_HASH}) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW TRUE) + endif() + endif() + + # remove timestamps so CMake will re-download the dependency + file(REMOVE_RECURSE ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild) + set(PACKAGE_INFO "${PACKAGE_INFO} to ${download_directory}") + endif() + endif() + + cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")") + + if(CPM_PACKAGE_LOCK_ENABLED) + if((CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR) OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK) + cpm_add_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") + elseif(CPM_ARGS_SOURCE_DIR) + cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "local directory") + else() + cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") + endif() + endif() + + cpm_message( + STATUS "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})" + ) + + if(NOT CPM_SKIP_FETCH) + # CMake 3.28 added EXCLUDE, SYSTEM (3.25), and SOURCE_SUBDIR (3.18) to FetchContent_Declare. + # Calling FetchContent_MakeAvailable will then internally forward these options to + # add_subdirectory. Up until these changes, we had to call FetchContent_Populate and + # add_subdirectory separately, which is no longer necessary and has been deprecated as of 3.30. 
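+    # In other words, on CMake >= 3.28 a (hypothetical) package Foo declared with SYSTEM and
+    # EXCLUDE_FROM_ALL is handled entirely by FetchContent_MakeAvailable(Foo); on older versions the
+    # same options are applied by the cpm_add_subdirectory call further down.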
+ set(fetchContentDeclareExtraArgs "") + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0") + if(${CPM_ARGS_EXCLUDE_FROM_ALL}) + list(APPEND fetchContentDeclareExtraArgs EXCLUDE_FROM_ALL) + endif() + if(${CPM_ARGS_SYSTEM}) + list(APPEND fetchContentDeclareExtraArgs SYSTEM) + endif() + if(DEFINED CPM_ARGS_SOURCE_SUBDIR) + list(APPEND fetchContentDeclareExtraArgs SOURCE_SUBDIR ${CPM_ARGS_SOURCE_SUBDIR}) + endif() + # For CMake version <3.28 OPTIONS are parsed in cpm_add_subdirectory + if(CPM_ARGS_OPTIONS AND NOT DOWNLOAD_ONLY) + foreach(OPTION ${CPM_ARGS_OPTIONS}) + cpm_parse_option("${OPTION}") + set(${OPTION_KEY} "${OPTION_VALUE}") + endforeach() + endif() + endif() + cpm_declare_fetch( + "${CPM_ARGS_NAME}" ${fetchContentDeclareExtraArgs} "${CPM_ARGS_UNPARSED_ARGUMENTS}" + ) + + cpm_fetch_package("${CPM_ARGS_NAME}" ${DOWNLOAD_ONLY} populated ${CPM_ARGS_UNPARSED_ARGUMENTS}) + if(CPM_SOURCE_CACHE AND download_directory) + file(LOCK ${download_directory}/../cmake.lock RELEASE) + endif() + if(${populated} AND ${CMAKE_VERSION} VERSION_LESS "3.28.0") + cpm_add_subdirectory( + "${CPM_ARGS_NAME}" + "${DOWNLOAD_ONLY}" + "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + "${${CPM_ARGS_NAME}_BINARY_DIR}" + "${CPM_ARGS_EXCLUDE_FROM_ALL}" + "${CPM_ARGS_SYSTEM}" + "${CPM_ARGS_OPTIONS}" + ) + endif() + cpm_get_fetch_properties("${CPM_ARGS_NAME}") + endif() + + set(${CPM_ARGS_NAME}_ADDED YES) + cpm_export_variables("${CPM_ARGS_NAME}") +endfunction() + +# Fetch a previously declared package +macro(CPMGetPackage Name) + if(DEFINED "CPM_DECLARATION_${Name}") + CPMAddPackage(NAME ${Name}) + else() + message(SEND_ERROR "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available") + endif() +endmacro() + +# export variables available to the caller to the parent scope expects ${CPM_ARGS_NAME} to be set +macro(cpm_export_variables name) + set(${name}_SOURCE_DIR + "${${name}_SOURCE_DIR}" + PARENT_SCOPE + ) + set(${name}_BINARY_DIR + "${${name}_BINARY_DIR}" + PARENT_SCOPE + ) + set(${name}_ADDED + "${${name}_ADDED}" + PARENT_SCOPE + ) + set(CPM_LAST_PACKAGE_NAME + "${name}" + PARENT_SCOPE + ) +endmacro() + +# declares a package, so that any call to CPMAddPackage for the package name will use these +# arguments instead. Previous declarations will not be overridden. 
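+# Example (hypothetical package):
+#   CPMDeclarePackage(Foo VERSION 1.2.3 GITHUB_REPOSITORY user/Foo)
+#   # ... later, possibly in another file:
+#   CPMGetPackage(Foo) # fetches Foo with the declared arguments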
+macro(CPMDeclarePackage Name) + if(NOT DEFINED "CPM_DECLARATION_${Name}") + set("CPM_DECLARATION_${Name}" "${ARGN}") + endif() +endmacro() + +function(cpm_add_to_package_lock Name) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + cpm_prettify_package_arguments(PRETTY_ARGN false ${ARGN}) + file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name}\nCPMDeclarePackage(${Name}\n${PRETTY_ARGN})\n") + endif() +endfunction() + +function(cpm_add_comment_to_package_lock Name) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + cpm_prettify_package_arguments(PRETTY_ARGN true ${ARGN}) + file(APPEND ${CPM_PACKAGE_LOCK_FILE} + "# ${Name} (unversioned)\n# CPMDeclarePackage(${Name}\n${PRETTY_ARGN}#)\n" + ) + endif() +endfunction() + +# includes the package lock file if it exists and creates a target `cpm-update-package-lock` to +# update it +macro(CPMUsePackageLock file) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + get_filename_component(CPM_ABSOLUTE_PACKAGE_LOCK_PATH ${file} ABSOLUTE) + if(EXISTS ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) + include(${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) + endif() + if(NOT TARGET cpm-update-package-lock) + add_custom_target( + cpm-update-package-lock COMMAND ${CMAKE_COMMAND} -E copy ${CPM_PACKAGE_LOCK_FILE} + ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH} + ) + endif() + set(CPM_PACKAGE_LOCK_ENABLED true) + endif() +endmacro() + +# registers a package that has been added to CPM +function(CPMRegisterPackage PACKAGE VERSION) + list(APPEND CPM_PACKAGES ${PACKAGE}) + set(CPM_PACKAGES + ${CPM_PACKAGES} + CACHE INTERNAL "" + ) + set("CPM_PACKAGE_${PACKAGE}_VERSION" + ${VERSION} + CACHE INTERNAL "" + ) +endfunction() + +# retrieve the current version of the package to ${OUTPUT} +function(CPMGetPackageVersion PACKAGE OUTPUT) + set(${OUTPUT} + "${CPM_PACKAGE_${PACKAGE}_VERSION}" + PARENT_SCOPE + ) +endfunction() + +# declares a package in FetchContent_Declare +function(cpm_declare_fetch PACKAGE) + if(${CPM_DRY_RUN}) + cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)") + return() + endif() + + FetchContent_Declare(${PACKAGE} ${ARGN}) +endfunction() + +# returns properties for a package previously defined by cpm_declare_fetch +function(cpm_get_fetch_properties PACKAGE) + if(${CPM_DRY_RUN}) + return() + endif() + + set(${PACKAGE}_SOURCE_DIR + "${CPM_PACKAGE_${PACKAGE}_SOURCE_DIR}" + PARENT_SCOPE + ) + set(${PACKAGE}_BINARY_DIR + "${CPM_PACKAGE_${PACKAGE}_BINARY_DIR}" + PARENT_SCOPE + ) +endfunction() + +function(cpm_store_fetch_properties PACKAGE source_dir binary_dir) + if(${CPM_DRY_RUN}) + return() + endif() + + set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR + "${source_dir}" + CACHE INTERNAL "" + ) + set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR + "${binary_dir}" + CACHE INTERNAL "" + ) +endfunction() + +# adds a package as a subdirectory if viable, according to provided options +function( + cpm_add_subdirectory + PACKAGE + DOWNLOAD_ONLY + SOURCE_DIR + BINARY_DIR + EXCLUDE + SYSTEM + OPTIONS +) + + if(NOT DOWNLOAD_ONLY AND EXISTS ${SOURCE_DIR}/CMakeLists.txt) + set(addSubdirectoryExtraArgs "") + if(EXCLUDE) + list(APPEND addSubdirectoryExtraArgs EXCLUDE_FROM_ALL) + endif() + if("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25") + # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html #prop_dir:SYSTEM + list(APPEND addSubdirectoryExtraArgs SYSTEM) + endif() + if(OPTIONS) + foreach(OPTION ${OPTIONS}) + cpm_parse_option("${OPTION}") + set(${OPTION_KEY} "${OPTION_VALUE}") + endforeach() + endif() + set(CPM_OLD_INDENT "${CPM_INDENT}") + set(CPM_INDENT "${CPM_INDENT} ${PACKAGE}:") + add_subdirectory(${SOURCE_DIR} 
${BINARY_DIR} ${addSubdirectoryExtraArgs}) + set(CPM_INDENT "${CPM_OLD_INDENT}") + endif() +endfunction() + +# downloads a previously declared package via FetchContent and exports the variables +# `${PACKAGE}_SOURCE_DIR` and `${PACKAGE}_BINARY_DIR` to the parent scope +function(cpm_fetch_package PACKAGE DOWNLOAD_ONLY populated) + set(${populated} + FALSE + PARENT_SCOPE + ) + if(${CPM_DRY_RUN}) + cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)") + return() + endif() + + FetchContent_GetProperties(${PACKAGE}) + + string(TOLOWER "${PACKAGE}" lower_case_name) + + if(NOT ${lower_case_name}_POPULATED) + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0") + if(DOWNLOAD_ONLY) + # MakeAvailable will call add_subdirectory internally which is not what we want when + # DOWNLOAD_ONLY is set. Populate will only download the dependency without adding it to the + # build + FetchContent_Populate( + ${PACKAGE} + SOURCE_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-src" + BINARY_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build" + SUBBUILD_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild" + ${ARGN} + ) + else() + FetchContent_MakeAvailable(${PACKAGE}) + endif() + else() + FetchContent_Populate(${PACKAGE}) + endif() + set(${populated} + TRUE + PARENT_SCOPE + ) + endif() + + cpm_store_fetch_properties( + ${CPM_ARGS_NAME} ${${lower_case_name}_SOURCE_DIR} ${${lower_case_name}_BINARY_DIR} + ) + + set(${PACKAGE}_SOURCE_DIR + ${${lower_case_name}_SOURCE_DIR} + PARENT_SCOPE + ) + set(${PACKAGE}_BINARY_DIR + ${${lower_case_name}_BINARY_DIR} + PARENT_SCOPE + ) +endfunction() + +# splits a package option +function(cpm_parse_option OPTION) + string(REGEX MATCH "^[^ ]+" OPTION_KEY "${OPTION}") + string(LENGTH "${OPTION}" OPTION_LENGTH) + string(LENGTH "${OPTION_KEY}" OPTION_KEY_LENGTH) + if(OPTION_KEY_LENGTH STREQUAL OPTION_LENGTH) + # no value for key provided, assume user wants to set option to "ON" + set(OPTION_VALUE "ON") + else() + math(EXPR OPTION_KEY_LENGTH "${OPTION_KEY_LENGTH}+1") + string(SUBSTRING "${OPTION}" "${OPTION_KEY_LENGTH}" "-1" OPTION_VALUE) + endif() + set(OPTION_KEY + "${OPTION_KEY}" + PARENT_SCOPE + ) + set(OPTION_VALUE + "${OPTION_VALUE}" + PARENT_SCOPE + ) +endfunction() + +# guesses the package version from a git tag +function(cpm_get_version_from_git_tag GIT_TAG RESULT) + string(LENGTH ${GIT_TAG} length) + if(length EQUAL 40) + # GIT_TAG is probably a git hash + set(${RESULT} + 0 + PARENT_SCOPE + ) + else() + string(REGEX MATCH "v?([0123456789.]*).*" _ ${GIT_TAG}) + set(${RESULT} + ${CMAKE_MATCH_1} + PARENT_SCOPE + ) + endif() +endfunction() + +# guesses if the git tag is a commit hash or an actual tag or a branch name. +function(cpm_is_git_tag_commit_hash GIT_TAG RESULT) + string(LENGTH "${GIT_TAG}" length) + # full hash has 40 characters, and short hash has at least 7 characters. 
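+  # e.g. "master" (too short) and "v10.2.1" (not hexadecimal) yield 0, while "a1b2c3d" yields 1.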
+  if(length LESS 7 OR length GREATER 40)
+    set(${RESULT}
+        0
+        PARENT_SCOPE
+    )
+  else()
+    if(${GIT_TAG} MATCHES "^[a-fA-F0-9]+$")
+      set(${RESULT}
+          1
+          PARENT_SCOPE
+      )
+    else()
+      set(${RESULT}
+          0
+          PARENT_SCOPE
+      )
+    endif()
+  endif()
+endfunction()
+
+function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT)
+  set(oneValueArgs
+      NAME
+      FORCE
+      VERSION
+      GIT_TAG
+      DOWNLOAD_ONLY
+      GITHUB_REPOSITORY
+      GITLAB_REPOSITORY
+      BITBUCKET_REPOSITORY
+      GIT_REPOSITORY
+      SOURCE_DIR
+      FIND_PACKAGE_ARGUMENTS
+      NO_CACHE
+      SYSTEM
+      GIT_SHALLOW
+      EXCLUDE_FROM_ALL
+      SOURCE_SUBDIR
+  )
+  set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND)
+  cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  foreach(oneArgName ${oneValueArgs})
+    if(DEFINED CPM_ARGS_${oneArgName})
+      if(${IS_IN_COMMENT})
+        string(APPEND PRETTY_OUT_VAR "#")
+      endif()
+      if(${oneArgName} STREQUAL "SOURCE_DIR")
+        string(REPLACE ${CMAKE_SOURCE_DIR} "\${CMAKE_SOURCE_DIR}" CPM_ARGS_${oneArgName}
+                       ${CPM_ARGS_${oneArgName}}
+        )
+      endif()
+      string(APPEND PRETTY_OUT_VAR " ${oneArgName} ${CPM_ARGS_${oneArgName}}\n")
+    endif()
+  endforeach()
+  foreach(multiArgName ${multiValueArgs})
+    if(DEFINED CPM_ARGS_${multiArgName})
+      if(${IS_IN_COMMENT})
+        string(APPEND PRETTY_OUT_VAR "#")
+      endif()
+      string(APPEND PRETTY_OUT_VAR " ${multiArgName}\n")
+      foreach(singleOption ${CPM_ARGS_${multiArgName}})
+        if(${IS_IN_COMMENT})
+          string(APPEND PRETTY_OUT_VAR "#")
+        endif()
+        string(APPEND PRETTY_OUT_VAR " \"${singleOption}\"\n")
+      endforeach()
+    endif()
+  endforeach()
+
+  if(NOT "${CPM_ARGS_UNPARSED_ARGUMENTS}" STREQUAL "")
+    if(${IS_IN_COMMENT})
+      string(APPEND PRETTY_OUT_VAR "#")
+    endif()
+    string(APPEND PRETTY_OUT_VAR " ")
+    foreach(CPM_ARGS_UNPARSED_ARGUMENT ${CPM_ARGS_UNPARSED_ARGUMENTS})
+      string(APPEND PRETTY_OUT_VAR " ${CPM_ARGS_UNPARSED_ARGUMENT}")
+    endforeach()
+    string(APPEND PRETTY_OUT_VAR "\n")
+  endif()
+
+  set(${OUT_VAR}
+      ${PRETTY_OUT_VAR}
+      PARENT_SCOPE
+  )
+
+endfunction()
diff --git a/cmake/add_controlled.cmake b/cmake/add_controlled.cmake
new file mode 100644
index 0000000000..ffbfde5ba0
--- /dev/null
+++ b/cmake/add_controlled.cmake
@@ -0,0 +1,75 @@
+cmake_minimum_required(VERSION 3.20)
+include(${CMAKE_CURRENT_LIST_DIR}/../cmake/CPM_0.40.2.cmake)
+
+# Adds a controlled dependency to the project.
+# Arguments:
+#   NAME -- The name of the dependency (first positional argument).
+# Optional Arguments:
+#   REQUIRED -- A boolean switch (takes no value) indicating that the dependency is required. Default is OFF.
+#   PREFIX -- The prefix for the variable name. Default is the project name.
+# Example:
+#   in CMakeLists.txt: add_controlled(my_dependency REQUIRED PREFIX my_project)
+#   during build: cmake -Dmy_project_USE_my_dependency=ON_ALWAYS_FETCH
+function(add_controlled NAME)
+  # Parse arguments
+  cmake_parse_arguments(ADD_CONTROLLED
+    "REQUIRED" # Boolean options
+    "PREFIX" # Single-value options
+    "" # Multi-value options
+    ${ARGN}
+  )
+
+  # Set default values if not provided
+  if(NOT ADD_CONTROLLED_PREFIX)
+    set(ADD_CONTROLLED_PREFIX ${PROJECT_NAME})
+  endif()
+
+  if(TARGET ${NAME} OR TARGET ${NAME}::${NAME})
+    message("There already exists a target for dependency ${NAME}. Not adding ${NAME} for ${ADD_CONTROLLED_PREFIX} again.")
+    return()
+  endif()
+
+  set(ALL_OPTIONS "ON;ON_ALLOW_FETCH;ON_ALWAYS_FETCH;AUTO;OFF")
+  if(ADD_CONTROLLED_REQUIRED)
+    # This is a required dependency, so we're only free to choose how, not if, we want to use it.
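+    # OFF and AUTO are therefore rejected below; e.g. -DmyProject_USE_myDep=OFF (hypothetical
+    # names) becomes a hard configuration error for a required dependency.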
+ set(AVAILABLE_OPTIONS "ON;ON_ALLOW_FETCH;ON_ALWAYS_FETCH") + set(${ADD_CONTROLLED_PREFIX}_USE_${NAME} "ON_ALLOW_FETCH" CACHE STRING "") + else() + set(AVAILABLE_OPTIONS ${ALL_OPTIONS}) + set(${ADD_CONTROLLED_PREFIX}_USE_${NAME} "AUTO" CACHE STRING "") + endif() + + if(NOT ${ADD_CONTROLLED_PREFIX}_USE_${NAME} IN_LIST ALL_OPTIONS) + if(EXISTS ${${ADD_CONTROLLED_PREFIX}_USE_${NAME}}) + set(CPM_${NAME}_SOURCE ${${ADD_CONTROLLED_PREFIX}_USE_${NAME}}) + else() + message(FATAL_ERROR "You must choose one of ${AVAILABLE_OPTIONS} for ${ADD_CONTROLLED_PREFIX}_USE_${NAME} or a valid path. You've given ${${ADD_CONTROLLED_PREFIX}_USE_${NAME}}.") + endif() + elseif(NOT ${ADD_CONTROLLED_PREFIX}_USE_${NAME} IN_LIST AVAILABLE_OPTIONS) + message(FATAL_ERROR "You must choose one of ${AVAILABLE_OPTIONS} for ${ADD_CONTROLLED_PREFIX}_USE_${NAME} or a valid path. You've given ${${ADD_CONTROLLED_PREFIX}_USE_${NAME}}.") + endif() + + if (${ADD_CONTROLLED_PREFIX}_USE_${NAME} STREQUAL "OFF") + return() + endif() + + # Our default for ON and AUTO: + set(CPM_USE_LOCAL_PACKAGES ON) + set(CPM_LOCAL_PACKAGES_ONLY ON) + + if ("${${ADD_CONTROLLED_PREFIX}_USE_${NAME}}" MATCHES "ON_ALLOW_FETCH") + set(CPM_USE_LOCAL_PACKAGES ON) + set(CPM_LOCAL_PACKAGES_ONLY OFF) + elseif ("${${ADD_CONTROLLED_PREFIX}_USE_${NAME}}" MATCHES "ON_ALWAYS_FETCH") + set(CPM_USE_LOCAL_PACKAGES OFF) + set(CPM_LOCAL_PACKAGES_ONLY OFF) + endif() + + # all the details about version, url, etc. are given in cmake/package-lock.cmake + if ("${${ADD_CONTROLLED_PREFIX}_USE_${NAME}}" MATCHES "^ON") + CPMAddPackage(NAME ${NAME} REQUIRED) + else() + CPMAddPackage(NAME ${NAME}) + endif() +endfunction() diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake new file mode 100644 index 0000000000..cb68bb05b2 --- /dev/null +++ b/cmake/package-lock.cmake @@ -0,0 +1,47 @@ +# CPM Package Lock +# This file should be committed to version control + +# PackageProject.cmake +CPMDeclarePackage(PackageProject.cmake + VERSION 1.8.0 + GITHUB_REPOSITORY TheLartians/PackageProject.cmake + SYSTEM YES + EXCLUDE_FROM_ALL YES +) +# alpaka +CPMDeclarePackage(alpaka + NAME alpaka + # This is a development version slightly after 1.2.0 because we needed a patch + GIT_TAG 95c0bf2397255a89467bb5c151a96367ad1d1f93 + GITHUB_REPOSITORY alpaka-group/alpaka + OPTIONS + "alpaka_CXX_STANDARD 20;alpaka_INSTALL ON" + # It is recommended to let CPM cache dependencies in order to reduce redundant downloads. + # However, we might in the foreseeable future turn to unstable references like the `dev` branch here. + # Setting the following option tells CPM to not use the cache. + # This is particularly important for CI! + # NO_CACHE TRUE +) +# cmake-scripts +CPMDeclarePackage(cmake-scripts + GIT_TAG 24.04 + GITHUB_REPOSITORY StableCoder/cmake-scripts + SYSTEM YES + EXCLUDE_FROM_ALL YES +) +# Catch2 +CPMDeclarePackage(Catch2 + VERSION 3.7.0 + GITHUB_REPOSITORY catchorg/Catch2 + SYSTEM YES + EXCLUDE_FROM_ALL YES +) +# Gallatin +CPMDeclarePackage(Gallatin + # There's no release available yet. 
+ GIT_TAG ac0cb8e380ffcb74156bafb8805fb60412817c5f + # Use our own fork for some patches + GITHUB_REPOSITORY chillenzer/Gallatin + SYSTEM YES + EXCLUDE_FROM_ALL YES +) diff --git a/cmake/tools.cmake b/cmake/tools.cmake new file mode 100644 index 0000000000..c74b641a4e --- /dev/null +++ b/cmake/tools.cmake @@ -0,0 +1,74 @@ +# this file contains a list of tools that can be activated and downloaded on-demand each tool is +# enabled during configuration by passing an additional `-DUSE_=` argument to CMake + +# only activate tools for top level project +if(NOT PROJECT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) + return() +endif() + +include(${CMAKE_CURRENT_LIST_DIR}/CPM_0.40.2.cmake) + +# enables sanitizers support using the the `USE_SANITIZER` flag available values are: Address, +# Memory, MemoryWithOrigins, Undefined, Thread, Leak, 'Address;Undefined' +if(mallocMC_USE_SANITIZER OR mallocMC_USE_STATIC_ANALYZER) + CPMAddPackage("gh:StableCoder/cmake-scripts#24.04") + + if(mallocMC_USE_SANITIZER) + include(${cmake-scripts_SOURCE_DIR}/sanitizers.cmake) + endif() + + if(mallocMC_USE_STATIC_ANALYZER) + if("clang-tidy" IN_LIST mallocMC_USE_STATIC_ANALYZER) + set(CLANG_TIDY + ON + CACHE INTERNAL "" + ) + else() + set(CLANG_TIDY + OFF + CACHE INTERNAL "" + ) + endif() + if("iwyu" IN_LIST mallocMC_USE_STATIC_ANALYZER) + set(IWYU + ON + CACHE INTERNAL "" + ) + else() + set(IWYU + OFF + CACHE INTERNAL "" + ) + endif() + if("cppcheck" IN_LIST mallocMC_USE_STATIC_ANALYZER) + set(CPPCHECK + ON + CACHE INTERNAL "" + ) + else() + set(CPPCHECK + OFF + CACHE INTERNAL "" + ) + endif() + + include(${cmake-scripts_SOURCE_DIR}/tools.cmake) + + if(${CLANG_TIDY}) + clang_tidy(${CLANG_TIDY_ARGS}) + endif() + + if(${IWYU}) + include_what_you_use(${IWYU_ARGS}) + endif() + + if(${CPPCHECK}) + cppcheck(${CPPCHECK_ARGS}) + endif() + endif() +endif() + +# enables CCACHE support through the USE_CCACHE flag possible values are: YES, NO or equivalent +if(mallocMC_USE_CCACHE) + CPMAddPackage("gh:TheLartians/Ccache.cmake@1.2.4") +endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000000..560c85586b --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.14...3.22) + +project(Examples LANGUAGES CXX) + +add_subdirectory( + ${CMAKE_CURRENT_LIST_DIR}/vectorAdd + ${CMAKE_BINARY_DIR}/examples/vectorAdd +) + +add_subdirectory( + ${CMAKE_CURRENT_LIST_DIR}/getAvailableSlots + ${CMAKE_BINARY_DIR}/examples/getAvailableSlots +) + +check_language(CUDA) +if (CMAKE_CUDA_COMPILER AND alpaka_ACC_GPU_CUDA_ENABLE) + add_subdirectory( + ${CMAKE_CURRENT_LIST_DIR}/native-cuda + ${CMAKE_BINARY_DIR}/examples/native-cuda + ) + + add_custom_target( + mallocMCExamples + DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda + COMMENT "Shortcut for building all examples." + ) +else() + add_custom_target( + mallocMCExamples + DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots + COMMENT "Shortcut for building all examples." 
+  )
+endif()
diff --git a/examples/getAvailableSlots/CMakeLists.txt b/examples/getAvailableSlots/CMakeLists.txt
new file mode 100644
index 0000000000..aa2a74c0a5
--- /dev/null
+++ b/examples/getAvailableSlots/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+project(mallocMCExampleGetAvailableSlots LANGUAGES CXX)
+
+# --- Import tools ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)
+
+# ---- Dependencies ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
+CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/add_controlled.cmake)
+
+add_controlled("alpaka" REQUIRED PREFIX mallocMC)
+
+if(NOT TARGET mallocMC)
+  CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+endif()
+
+# ---- Create standalone executable ----
+
+alpaka_add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cpp)
+
+set_target_properties(${PROJECT_NAME}
+  PROPERTIES
+    CXX_STANDARD 20
+    OUTPUT_NAME ${PROJECT_NAME}
+    CXX_STANDARD_REQUIRED ON
+    CXX_EXTENSIONS OFF
+)
+
+target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/getAvailableSlots/source/main.cpp b/examples/getAvailableSlots/source/main.cpp
new file mode 100644
index 0000000000..cc5e2531a2
--- /dev/null
+++ b/examples/getAvailableSlots/source/main.cpp
@@ -0,0 +1,154 @@
+/*
+    mallocMC: Memory Allocator for Many Core Architectures.
+    https://www.hzdr.de/crp
+
+    Copyright 2014 - 2024 Institute of Radiation Physics,
+                          Helmholtz-Zentrum Dresden - Rossendorf
+
+    Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+               Julian Lenz - j.lenz ( at ) hzdr.de
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+    THE SOFTWARE.
+*/
+
+#include "mallocMC/creationPolicies/OldMalloc.hpp"
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <mallocMC/mallocMC.hpp>
+
+#include <cstdint>
+#include <iostream>
+
+using mallocMC::CreationPolicies::FlatterScatter;
+using mallocMC::CreationPolicies::OldMalloc;
+using mallocMC::CreationPolicies::Scatter;
+
+using Dim = alpaka::DimInt<1>;
+using Idx = std::size_t;
+
+// Define the device accelerator
+using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+
+constexpr uint32_t const blocksize = 2U * 1024U * 1024U;
+constexpr uint32_t const pagesize = 4U * 1024U;
+constexpr uint32_t const wasteFactor = 1U;
+
+// This happens to also work for the original Scatter algorithm, so we only define one.
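+// Heap layout implied by the constants above: the 2 GiB heap (heapsize) is split into 2 MiB access
+// blocks (accessblocksize = blocksize), each of which is managed in 4 KiB pages (pagesize).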
+struct FlatterScatterHeapConfig : FlatterScatter<>::Properties::HeapConfig
+{
+    static constexpr auto accessblocksize = blocksize;
+    static constexpr auto pagesize = ::pagesize;
+    static constexpr auto heapsize = 2U * 1024U * 1024U * 1024U;
+    // Only used by original Scatter (but it doesn't hurt FlatterScatter to keep):
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = wasteFactor;
+};
+
+struct AlignmentConfig
+{
+    static constexpr auto dataAlignment = 16;
+};
+
+ALPAKA_STATIC_ACC_MEM_GLOBAL int* arA = nullptr;
+
+template<typename T_Allocator>
+struct ExampleKernel
+{
+    ALPAKA_FN_ACC void operator()(Acc const& acc, T_Allocator::AllocatorHandle allocHandle) const
+    {
+        auto const id = static_cast<uint32_t>(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]);
+        if(id == 0)
+        {
+            arA = static_cast<int*>(allocHandle.malloc(acc, sizeof(int) * 32U));
+        }
+        // wait until the malloc performed by thread zero is visible to all other threads
+        alpaka::syncBlockThreads(acc);
+        auto const slots = allocHandle.getAvailableSlots(acc, 1);
+        if(arA != nullptr)
+        {
+            arA[id] = id;
+            printf("id: %u array: %d slots %u\n", id, arA[id], slots);
+        }
+        else
+            printf("error: device-side allocation failed");
+
+        // wait until all threads have read from `arA`
+        alpaka::syncBlockThreads(acc);
+        if(id == 0)
+        {
+            allocHandle.free(acc, arA);
+        }
+    }
+};
+
+template<
+    typename T_CreationPolicy,
+    typename T_ReservePoolPolicy,
+    typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>
+auto example03() -> int
+{
+    using Allocator = mallocMC::Allocator<
+        alpaka::AccToTag<Acc>,
+        T_CreationPolicy,
+        mallocMC::DistributionPolicies::Noop,
+        mallocMC::OOMPolicies::ReturnNull,
+        T_ReservePoolPolicy,
+        T_AlignmentPolicy>;
+
+    auto const platform = alpaka::Platform<Acc>{};
+    auto const dev = alpaka::getDevByIdx(platform, 0);
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+    auto const devProps = alpaka::getAccDevProps<Acc>(dev);
+    unsigned const block = std::min(static_cast<size_t>(32U), static_cast<size_t>(devProps.m_blockThreadCountMax));
+
+    Allocator scatterAlloc(dev, queue, 2U * 1024U * 1024U * 1024U); // 2GB for device-side malloc
+
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{block}, Idx{1}};
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(workDiv, ExampleKernel<Allocator>{}, scatterAlloc.getAllocatorHandle()));
+
+    std::cout << "Slots from Host: " << scatterAlloc.getAvailableSlots(dev, queue, 1) << '\n';
+
+    return 0;
+}
+
+auto main(int /*argc*/, char* /*argv*/[]) -> int
+{
+    example03<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+    example03<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    ifdef mallocMC_HAS_Gallatin_AVAILABLE
+    example03<
+        mallocMC::CreationPolicies::GallatinCuda<>,
+        mallocMC::ReservePoolPolicies::Noop,
+        mallocMC::AlignmentPolicies::Noop>();
+    // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
+ example03(); +# else + // This should normally be: + example03(); +# endif +#else + example03(); +#endif + return 0; +} diff --git a/examples/native-cuda/CMakeLists.txt b/examples/native-cuda/CMakeLists.txt new file mode 100644 index 0000000000..f7acefe669 --- /dev/null +++ b/examples/native-cuda/CMakeLists.txt @@ -0,0 +1,31 @@ +cmake_minimum_required(VERSION 3.14...3.22) + +project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA) + +# --- Import tools ---- + +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake) + +# ---- Dependencies ---- + +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake) +CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake) + +if(NOT TARGET mallocMC) + CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..) +endif() + +# ---- Create standalone executable ---- + +add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu) + +set_target_properties(${PROJECT_NAME} + PROPERTIES + CXX_STANDARD 20 + OUTPUT_NAME ${PROJECT_NAME} + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + ) + +target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES}) +add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME}) diff --git a/examples/native-cuda/source/main.cu b/examples/native-cuda/source/main.cu new file mode 100644 index 0000000000..00c429a95c --- /dev/null +++ b/examples/native-cuda/source/main.cu @@ -0,0 +1,104 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + https://www.hzdr.de/crp + + Copyright 2025 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include + +#include +#include +#include +#include + +/** + * @brief Computes the sum of squares of the first `n` natural numbers. + * + * This function calculates the sum of squares of the first `n` natural numbers using the formula: + * \[ + * \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6} + * \] + * It's used to check the computed value in the kernel. + * + * @param n The number of natural numbers to consider. + * @return The sum of squares of the first `n` natural numbers. + */ +__device__ auto sumOfSquares(auto const n) +{ + return (n * (n + 1) * (2 * n + 1)) / 6; +} + +/** + * @brief Computes the dot product of two vectors for each thread. + * + * This kernel computes the dot product of two vectors, `a` and `b`, for each thread. 
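+ * Both vectors of thread `tid` hold the consecutive values tid, tid+1, ..., tid+numValues-1, so the
+ * expected dot product has the closed form
+ * \[
+ *     \sum_{i=tid}^{tid+n-1} i^2 = \text{sumOfSquares}(tid + n - 1) - \text{sumOfSquares}(tid - 1)
+ * \]
+ * with n = numValues, which is exactly the value the kernel compares its result against.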
+ * Each thread allocates memory for its own vectors, initializes them with consecutive values,
+ * computes the dot product, and checks whether the result matches the expected value.
+ * If the result does not match, the thread prints an error message and halts execution.
+ *
+ * @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
+ * @param numValues The number of elements in each vector.
+ *
+ * @note This kernel is, of course, not very realistic as a workload, but it fulfills its purpose of showcasing a
+ * native CUDA application.
+ */
+__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
+{
+    uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Not very realistic, all threads are doing this on their own:
+    auto a = std::span(
+        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
+        numValues);
+    auto b = std::span(
+        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
+        numValues);
+
+    std::iota(std::begin(a), std::end(a), tid);
+    std::iota(std::begin(b), std::end(b), tid);
+
+    // accumulate in 64 bit (the init value determines the accumulation type of transform_reduce)
+    uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), uint64_t{0});
+
+    auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
+    if(result != expected)
+    {
+        printf("Thread %lu: Result %lu != Expected %lu. \n", tid, result, expected);
+        __trap();
+    }
+
+    memoryManager.free(a.data());
+    memoryManager.free(b.data());
+}
+
+int main()
+{
+    size_t const heapSize = 1024U * 1024U * 1024U;
+    uint64_t const numValues = 32U;
+    mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
+    auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};
+
+    std::cout << "Running native CUDA kernel." << std::endl;
+    oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);
+    // Wait for the kernel to finish before the heap infrastructure goes out of scope.
+    cudaDeviceSynchronize();
+}
diff --git a/examples/vectorAdd/CMakeLists.txt b/examples/vectorAdd/CMakeLists.txt
new file mode 100644
index 0000000000..11421048c0
--- /dev/null
+++ b/examples/vectorAdd/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+project(mallocMCExampleVectorAdd LANGUAGES CXX)
+
+# --- Import tools ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)
+
+# ---- Dependencies ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
+CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/add_controlled.cmake)
+
+add_controlled("alpaka" REQUIRED PREFIX mallocMC)
+
+if(NOT TARGET mallocMC)
+  CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+endif()
+
+# ---- Create standalone executable ----
+
+alpaka_add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cpp)
+
+set_target_properties(${PROJECT_NAME}
+  PROPERTIES
+    CXX_STANDARD 20
+    OUTPUT_NAME ${PROJECT_NAME}
+    CXX_STANDARD_REQUIRED ON
+    CXX_EXTENSIONS OFF
+)
+
+target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
+
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/vectorAdd/source/main.cpp b/examples/vectorAdd/source/main.cpp
new file mode 100644
index 0000000000..461cef9703
--- /dev/null
+++ b/examples/vectorAdd/source/main.cpp
@@ -0,0 +1,249 @@
+/*
+    mallocMC: Memory Allocator for Many Core Architectures.
+ https://www.hzdr.de/crp + + Copyright 2014 - 2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include + +#include + +#include +#include +#include +#include +#include + +using mallocMC::CreationPolicies::FlatterScatter; +using mallocMC::CreationPolicies::OldMalloc; +using mallocMC::CreationPolicies::Scatter; + +using Dim = alpaka::DimInt<1>; +using Idx = std::size_t; + +// Define the device accelerator +using Acc = alpaka::ExampleDefaultAcc; + +constexpr uint32_t const blocksize = 2U * 1024U * 1024U; +constexpr uint32_t const pagesize = 4U * 1024U; +constexpr uint32_t const wasteFactor = 1U; + +// This happens to also work for the original Scatter algorithm, so we only define one. 
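+// Roughly speaking, regionsize and wastefactor are tuning knobs of the original Scatter algorithm:
+// pages are grouped into regions to narrow down the search for free chunks, and a page whose chunk
+// size exceeds the requested size by up to the waste factor may still serve a request.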
+struct FlatterScatterHeapConfig : FlatterScatter<>::Properties::HeapConfig
+{
+    static constexpr auto accessblocksize = blocksize;
+    static constexpr auto pagesize = ::pagesize;
+    static constexpr auto heapsize = 2U * 1024U * 1024U * 1024U;
+    // Only used by original Scatter (but it doesn't hurt FlatterScatter to keep):
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = wasteFactor;
+};
+
+struct XMallocConfig
+{
+    static constexpr auto pagesize = FlatterScatterHeapConfig::pagesize;
+};
+
+struct ShrinkConfig
+{
+    static constexpr auto dataAlignment = 16;
+};
+
+ALPAKA_STATIC_ACC_MEM_GLOBAL int** arA;
+ALPAKA_STATIC_ACC_MEM_GLOBAL int** arB;
+ALPAKA_STATIC_ACC_MEM_GLOBAL int** arC;
+
+template<
+    typename T_CreationPolicy,
+    typename T_ReservePoolPolicy,
+    typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>
+auto example01() -> int
+{
+    using Allocator = mallocMC::Allocator<
+        alpaka::AccToTag<Acc>,
+        T_CreationPolicy,
+        mallocMC::DistributionPolicies::Noop,
+        mallocMC::OOMPolicies::ReturnNull,
+        T_ReservePoolPolicy,
+        T_AlignmentPolicy>;
+
+    constexpr auto length = 100;
+
+    auto const platform = alpaka::Platform<Acc>{};
+    auto const dev = alpaka::getDevByIdx(platform, 0);
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+
+    auto const devProps = alpaka::getAccDevProps<Acc>(dev);
+    unsigned const block = std::min(static_cast<size_t>(32U), static_cast<size_t>(devProps.m_blockThreadCountMax));
+
+    // round up
+    auto grid = (length + block - 1U) / block;
+    assert(length <= block * grid); // necessary for the algorithm used
+
+    // init the heap
+    std::cerr << "initHeap...";
+    auto const heapSize = 2U * 1024U * 1024U * 1024U;
+    Allocator scatterAlloc(dev, queue, heapSize); // 2GB for device-side malloc
+    std::cerr << "done\n";
+    std::cout << Allocator::info("\n") << '\n';
+
+    // create arrays of arrays on the device
+    {
+        auto createArrayPointers
+            = [] ALPAKA_FN_ACC(Acc const& acc, int x, int y, Allocator::AllocatorHandle allocHandle)
+        {
+            arA = static_cast<int**>(allocHandle.malloc(acc, sizeof(int*) * x * y));
+            arB = static_cast<int**>(allocHandle.malloc(acc, sizeof(int*) * x * y));
+            arC = static_cast<int**>(allocHandle.malloc(acc, sizeof(int*) * x * y));
+        };
+        auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{1}, Idx{1}};
+        alpaka::enqueue(
+            queue,
+            alpaka::createTaskKernel<Acc>(
+                workDiv,
+                createArrayPointers,
+                grid,
+                block,
+                scatterAlloc.getAllocatorHandle()));
+    }
+
+    // allocate all three arrays per thread and fill two of them with ascending values
+    {
+        auto fillArrays = [] ALPAKA_FN_ACC(Acc const& acc, int localLength, Allocator::AllocatorHandle allocHandle)
+        {
+            auto const id = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+
+            arA[id] = static_cast<int*>(allocHandle.malloc(acc, localLength * sizeof(int)));
+            arB[id] = static_cast<int*>(allocHandle.malloc(acc, localLength * sizeof(int)));
+            arC[id] = static_cast<int*>(allocHandle.malloc(acc, localLength * sizeof(int)));
+
+            for(int i = 0; i < localLength; ++i)
+            {
+                arA[id][i] = static_cast<int>(id * localLength + i);
+                arB[id][i] = static_cast<int>(id * localLength + i);
+            }
+        };
+        auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{grid}, Idx{block}, Idx{1}};
+        alpaka::enqueue(
+            queue,
+            alpaka::createTaskKernel<Acc>(workDiv, fillArrays, length, scatterAlloc.getAllocatorHandle()));
+    }
+
+    // add the 2 arrays (vector addition within each thread)
+    // and do a thread-wise reduce to sums
+    {
+        auto sumsBufferAcc = alpaka::allocBuf<int, Idx>(dev, Idx{block * grid});
+
+        auto addArrays = [] ALPAKA_FN_ACC(Acc const& acc, int localLength, int* sums)
+        {
+            auto const id = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+
+            sums[id] = 0;
+            for(int i = 0; i < localLength; ++i)
+            {
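+                // element-wise add; also accumulate a per-thread sum for the host-side check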
arC[id][i] = arA[id][i] + arB[id][i]; + sums[id] += arC[id][i]; + } + }; + auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workDiv, addArrays, length, alpaka::getPtrNative(sumsBufferAcc))); + + auto const platformCPU = alpaka::Platform{}; + auto const hostDev = alpaka::getDevByIdx(platformCPU, 0); + + auto sumsBufferHost = alpaka::allocBuf(hostDev, Idx{block * grid}); + alpaka::memcpy(queue, sumsBufferHost, sumsBufferAcc, Idx{block * grid}); + alpaka::wait(queue); + + auto const* sumsPtr = alpaka::getPtrNative(sumsBufferHost); + auto const sum = std::accumulate(sumsPtr, sumsPtr + block * grid, size_t{0}); + std::cout << "The sum of the arrays on GPU is " << sum << '\n'; + } + + auto const n = static_cast(block * grid * length); + auto const gaussian = n * (n - 1); + std::cout << "The gaussian sum as comparison: " << gaussian << '\n'; + + /*constexpr*/ if(mallocMC::Traits::providesAvailableSlots) + { + std::cout << "there are "; + std::cout << scatterAlloc.getAvailableSlots(dev, queue, 1024U * 1024U); + std::cout << " Slots of size 1MB available\n"; + } + + { + auto freeArrays = [] ALPAKA_FN_ACC(Acc const& acc, Allocator::AllocatorHandle allocHandle) + { + auto const id = alpaka::getIdx(acc)[0]; + allocHandle.free(acc, arA[id]); + allocHandle.free(acc, arB[id]); + allocHandle.free(acc, arC[id]); + }; + auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, freeArrays, scatterAlloc.getAllocatorHandle())); + } + + { + auto freeArrayPointers = [] ALPAKA_FN_ACC(Acc const& acc, Allocator::AllocatorHandle allocHandle) + { + allocHandle.free(acc, arA); + allocHandle.free(acc, arB); + allocHandle.free(acc, arC); + }; + auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workDiv, freeArrayPointers, scatterAlloc.getAllocatorHandle())); + } + + return 0; +} + +auto main(int /*argc*/, char* /*argv*/[]) -> int +{ + example01, mallocMC::ReservePoolPolicies::AlpakaBuf>(); + example01, mallocMC::ReservePoolPolicies::AlpakaBuf>(); + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +# ifdef mallocMC_HAS_Gallatin_AVAILABLE + example01< + mallocMC::CreationPolicies::GallatinCuda<>, + mallocMC::ReservePoolPolicies::Noop, + mallocMC::AlignmentPolicies::Noop>(); + // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time. + example01(); +# else + // This should normally be: + example01(); +# endif +#else + example01(); +#endif + return 0; +} diff --git a/include/mallocMC/alignmentPolicies/Noop.hpp b/include/mallocMC/alignmentPolicies/Noop.hpp new file mode 100644 index 0000000000..ee176187c2 --- /dev/null +++ b/include/mallocMC/alignmentPolicies/Noop.hpp @@ -0,0 +1,69 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. 
+ + Copyright 2014 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "Noop.hpp" + +#include + +#include +#include +#include + +namespace mallocMC +{ + namespace AlignmentPolicies + { + /** + * @brief a policy that does nothing + * + * This AlignmentPolicy will not perform any distribution, but only + * return its input (identity function) + */ + class Noop + { + public: + static auto alignPool(void* memory, size_t memsize) -> std::tuple + { + return std::make_tuple(memory, memsize); + } + + ALPAKA_FN_HOST_ACC + static auto applyPadding(uint32_t bytes) -> uint32_t + { + return bytes; + } + + static auto classname() -> std::string + { + return "Noop"; + } + }; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/include/mallocMC/alignmentPolicies/Shrink.hpp b/include/mallocMC/alignmentPolicies/Shrink.hpp new file mode 100644 index 0000000000..0eb495e975 --- /dev/null +++ b/include/mallocMC/alignmentPolicies/Shrink.hpp @@ -0,0 +1,151 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + http://www.icg.tugraz.at/project/mvp + + Copyright (C) 2012 Institute for Computer Graphics and Vision, + Graz University of Technology + Copyright (C) 2014 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at + Carlchristian Eckert - c.eckert ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "Shrink.hpp" + +#include + +#include +#include +#include +#include + +namespace mallocMC +{ + namespace AlignmentPolicies + { + namespace Shrink2NS + { + template + struct __PointerEquivalent + { + using type = unsigned int; + }; + + template<> + struct __PointerEquivalent<8> + { + using type = unsigned long long; + }; + } // namespace Shrink2NS + + namespace ShrinkConfig + { + struct DefaultShrinkConfig + { + static constexpr auto dataAlignment = 16; + }; + } // namespace ShrinkConfig + + /** + * @brief Provides proper alignment of pool and pads memory requests + * + * This AlignmentPolicy is based on ideas from ScatterAlloc + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). + * It performs alignment operations on big memory pools and requests to + * allocate memory. Memory pools are truncated at the beginning until + * the pointer to the memory fits the alignment. Requests to allocate + * memory are padded until their size is a multiple of the alignment. + * + * @tparam T_Config (optional) The alignment to use + */ + template + class Shrink + { + public: + using Properties = T_Config; + + private: + using PointerEquivalent = Shrink2NS::__PointerEquivalent::type; + +/** Allow for a hierarchical validation of parameters: + * + * shipped default-parameters (in the inherited struct) have lowest precedence. + * They will be overridden by a given configuration struct. However, even the + * given configuration struct can be overridden by compile-time command line + * parameters (e.g. -D MALLOCMC_AP_SHRINK_DATAALIGNMENT 128) + * + * default-struct < template-struct < command-line parameter + */ +#ifndef MALLOCMC_AP_SHRINK_DATAALIGNMENT +# define MALLOCMC_AP_SHRINK_DATAALIGNMENT (Properties::dataAlignment) +#endif + static constexpr size_t dataAlignment = MALLOCMC_AP_SHRINK_DATAALIGNMENT; + + // dataAlignment must be a power of 2! 
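+            // e.g. dataAlignment = 16 = 0b10000 gives 16 & 15 == 0, whereas 12 (not a power of 2)
+            // gives 12 & 11 == 8 != 0.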
+ static_assert( + dataAlignment != 0 && (dataAlignment & (dataAlignment - 1)) == 0, + "dataAlignment must also be a power of 2"); + + public: + static auto alignPool(void* memory, size_t memsize) -> std::tuple + { + PointerEquivalent alignmentstatus = ((PointerEquivalent) memory) & (dataAlignment - 1); + if(alignmentstatus != 0) + { + std::cout << "Heap Warning: memory to use not " << dataAlignment << " byte aligned...\n" + << "Before:\n" + << "dataAlignment: " << dataAlignment << '\n' + << "Alignmentstatus: " << alignmentstatus << '\n' + << "size_t memsize " << memsize << " byte" << '\n' + << "void *memory " << memory << '\n'; + + memory = (void*) (((PointerEquivalent) memory) + dataAlignment - alignmentstatus); + memsize -= dataAlignment + (size_t) alignmentstatus; + + std::cout << "Was shrunk automatically to: " << '\n' + << "size_t memsize " << memsize << " byte" << '\n' + << "void *memory " << memory << '\n'; + } + + return std::make_tuple(memory, memsize); + } + + ALPAKA_FN_HOST_ACC + static auto applyPadding(uint32_t bytes) -> uint32_t + { + constexpr uint32_t bitsToClear = dataAlignment - 1; + return (bytes + bitsToClear) & ~bitsToClear; + } + + ALPAKA_FN_HOST + static auto classname() -> std::string + { + std::stringstream ss; + ss << "Shrink[" << dataAlignment << "]"; + return ss.str(); + } + }; + + } // namespace AlignmentPolicies +} // namespace mallocMC diff --git a/include/mallocMC/allocator.hpp b/include/mallocMC/allocator.hpp new file mode 100644 index 0000000000..447b381d73 --- /dev/null +++ b/include/mallocMC/allocator.hpp @@ -0,0 +1,242 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + https://www.hzdr.de/crp + + Copyright 2014 - 2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#pragma once + +#include "device_allocator.hpp" +#include "mallocMC_allocator_handle.hpp" +#include "mallocMC_constraints.hpp" +#include "mallocMC_traits.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace mallocMC +{ + namespace detail + { + template + struct GetAvailableSlotsIfAvailHost + { + template + ALPAKA_FN_HOST static auto getAvailableSlots(AlpakaDevice&, AlpakaQueue&, size_t, T_Allocator&) -> unsigned + { + return 0; + } + }; + + template + struct GetAvailableSlotsIfAvailHost + { + template + ALPAKA_FN_HOST static auto getAvailableSlots( + AlpakaDevice& dev, + AlpakaQueue& queue, + size_t slotSize, + T_Allocator& alloc) -> unsigned + { + return T_Allocator::CreationPolicy::template getAvailableSlotsHost( + dev, + queue, + slotSize, + alloc.getAllocatorHandle().devAllocator); + } + }; + } // namespace detail + + struct HeapInfo + { + void* p; + size_t size; + }; + + /** + * @brief "HostClass" that combines all policies to a useful allocator + * + * This class implements the necessary glue-logic to form an actual + * allocator from the provided policies. It implements the public interface + * and executes some constraint checking based on an instance of the class + * PolicyConstraints. + * + * @tparam T_CreationPolicy The desired type of a CreationPolicy + * @tparam T_DistributionPolicy The desired type of a DistributionPolicy + * @tparam T_OOMPolicy The desired type of a OOMPolicy + * @tparam T_ReservePoolPolicy The desired type of a ReservePoolPolicy + * @tparam T_AlignmentPolicy The desired type of a AlignmentPolicy + */ + template< + typename T_AccTag, + typename T_CreationPolicy, + typename T_DistributionPolicy, + typename T_OOMPolicy, + typename T_ReservePoolPolicy, + typename T_AlignmentPolicy> + class Allocator + : public PolicyConstraints< + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy> + { + using uint32 = std::uint32_t; + + public: + using Dim = alpaka::DimInt<1>; + using Idx = std::uint32_t; + using AlpakaAcc = alpaka::TagToAcc; + using DistributionPolicy = T_DistributionPolicy; + using OOMPolicy = T_OOMPolicy; + using ReservePoolPolicy = T_ReservePoolPolicy; + using AlignmentPolicy = T_AlignmentPolicy; + using CreationPolicy = T_CreationPolicy::template AlignmentAwarePolicy; + using HeapInfoVector = std::vector; + using DevAllocator = DeviceAllocator; + using AllocatorHandle = AllocatorHandleImpl; + + private: + ReservePoolPolicy reservePolicy; + using DevAllocatorStorageBufferType + = alpaka::Buf, DevAllocator, alpaka::DimInt<1>, int>; + std::unique_ptr + devAllocatorBuffer; // FIXME(bgruber): replace by std::optional<> + HeapInfo heapInfos; + + /** allocate heap memory + * + * @param size number of bytes + */ + template + ALPAKA_FN_HOST void alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t size) + { + void* pool = reservePolicy.setMemPool(dev, size); + std::tie(pool, size) = AlignmentPolicy::alignPool(pool, size); + + devAllocatorBuffer + = std::make_unique(alpaka::allocBuf(dev, 1)); + CreationPolicy::template initHeap( + dev, + queue, + alpaka::getPtrNative(*devAllocatorBuffer), + pool, + size); + + heapInfos.p = pool; + heapInfos.size = size; + } + + /** free all data structures + * + * Free all allocated memory. 
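+         * All pointers previously handed out by this allocator become invalid.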
+ * After this call the instance is an in invalid state + */ + ALPAKA_FN_HOST void free() + { + devAllocatorBuffer = {}; + reservePolicy.resetMemPool(); + heapInfos.size = 0; + heapInfos.p = nullptr; + } + + /* forbid to copy the allocator */ + ALPAKA_FN_HOST + Allocator(Allocator const&) = delete; + + public: + template + ALPAKA_FN_HOST Allocator(AlpakaDevice& dev, AlpakaQueue& queue, size_t size = 8U * 1024U * 1024U) + { + alloc(dev, queue, size); + } + + ALPAKA_FN_HOST + ~Allocator() + { + free(); + } + + /** destroy current heap data and resize the heap + * + * @param size number of bytes + */ + template + ALPAKA_FN_HOST void destructiveResize(AlpakaDevice& dev, AlpakaQueue& queue, size_t size) + { + free(); + alloc(dev, queue, size); + } + + ALPAKA_FN_HOST + auto getAllocatorHandle() const -> AllocatorHandle + { + return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)}; + } + + ALPAKA_FN_HOST + operator AllocatorHandle() + { + return getAllocatorHandle(); + } + + ALPAKA_FN_HOST static auto info(std::string linebreak = " ") -> std::string + { + std::stringstream ss; + ss << "CreationPolicy: " << CreationPolicy::classname() << " " << linebreak; + ss << "DistributionPolicy: " << DistributionPolicy::classname() << "" << linebreak; + ss << "OOMPolicy: " << OOMPolicy::classname() << " " << linebreak; + ss << "ReservePoolPolicy: " << ReservePoolPolicy::classname() << " " << linebreak; + ss << "AlignmentPolicy: " << AlignmentPolicy::classname() << " " << linebreak; + return ss.str(); + } + + // polymorphism over the availability of getAvailableSlots for calling + // from the host + template + ALPAKA_FN_HOST auto getAvailableSlots(AlpakaDevice& dev, AlpakaQueue& queue, size_t slotSize) -> unsigned + { + slotSize = AlignmentPolicy::applyPadding(slotSize); + return detail::GetAvailableSlotsIfAvailHost::providesAvailableSlots>:: + template getAvailableSlots(dev, queue, slotSize, *this); + } + + ALPAKA_FN_HOST + auto getHeapLocations() -> HeapInfoVector + { + HeapInfoVector v; + v.push_back(heapInfos); + return v; + } + }; + +} // namespace mallocMC diff --git a/include/mallocMC/creationPolicies/FlatterScatter.hpp b/include/mallocMC/creationPolicies/FlatterScatter.hpp new file mode 100644 index 0000000000..5931e9a0b4 --- /dev/null +++ b/include/mallocMC/creationPolicies/FlatterScatter.hpp @@ -0,0 +1,495 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + /** + * @class Heap + * @brief Main interface to our heap memory. + * + * This class stores the heap pointer and the heap size and provides the high-level functionality to interact with + * the memory within kernels. It is wrapped in a thin layer of creation policy to be instantiated as base class of + * the `DeviceAllocator` for the user. + * + * @tparam T_HeapConfig Struct containing information about the heap. + * @tparam T_HashConfig Struct providing a hash function for scattering and the blockStride property. + * @tparam T_AlignmentPolicy The alignment policy used in the current configuration. + */ + template + struct Heap + { + using MyAccessBlock = AccessBlock; + + static_assert( + T_HeapConfig::accessblocksize + < std::numeric_limits>::max(), + "Your access block size must be smaller than the maximal value of its signed type because we are using " + "differences in the code occasionally."); + + static_assert( + T_HeapConfig::pagesize < std::numeric_limits>::max(), + "Your page size must be smaller than the maximal value of its signed type because we are using " + "differences in the code occasionally."); + + static_assert( + T_HeapConfig::accessblocksize == sizeof(MyAccessBlock), + "The real access block must have the same size as configured in order to make alignment more easily " + "predictable."); + + size_t heapSize{}; + MyAccessBlock* accessBlocks{}; + uint32_t volatile block = 0U; + + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto init(auto const& acc, void* accessBlocksPointer, auto heapSize) + -> void + { + auto threadsInGrid = alpaka::getWorkDiv(acc); + auto numThreads = threadsInGrid.prod(); + auto const [idx] = alpaka::mapIdx<1U>(alpaka::getIdx(acc), threadsInGrid); + auto* accessBlocks = static_cast(accessBlocksPointer); + + for(uint32_t i = idx; i < numBlocks(heapSize) * MyAccessBlock::numPages(); i += numThreads) + { + auto blockIdx = i / MyAccessBlock::numPages(); + auto pageIdx = i % MyAccessBlock::numPages(); + + accessBlocks[blockIdx].init(acc, pageIdx); + } + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init(auto const& acc) -> void + { + init(acc, accessBlocks, heapSize); + } + + /** + * @brief Number of access blocks assuming the given heapSize. + * + * @return Number of access blocks in the heap. + */ + ALPAKA_FN_INLINE ALPAKA_FN_HOST_ACC static constexpr auto numBlocks(auto heapSize) -> uint32_t + { + return heapSize / T_HeapConfig::accessblocksize; + } + + /** + * @brief Number of access blocks in the heap. This is a runtime quantity because it depends on the given heap + * size. + * + * @return Number of access blocks in the heap. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numBlocks() const -> uint32_t + { + return numBlocks(heapSize); + } + + /** + * @brief The dummy value to indicate the case of no free blocks found. + * + * @return An invalid block index for identifying such case. 
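+         * (It simply equals numBlocks(), i.e. one past the last valid block index.)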
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto noFreeBlockFound() const -> uint32_t + { + return numBlocks(); + } + + /** + * @brief Compute a starting index to search the access blocks for a valid piece of memory. + * + * @param blockValue Current starting index to compute the next one from. + * @param hashValue A hash value to provide some entropy for scattering the requests. + * @return An index to start search the access blocks from. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto startBlockIndex( + auto const& /*acc*/, + uint32_t const blockValue, + uint32_t const hashValue) + { + return ((hashValue % T_HashConfig::blockStride) + (blockValue * T_HashConfig::blockStride)) % numBlocks(); + } + + /** + * @brief Create a pointer to memory of (at least) `bytes` number of bytes.. + * + * @param bytes Size of the allocation in number of bytes. + * @return Pointer to the memory, nullptr if no usable memory was found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32_t const bytes) -> void* + { + auto blockValue = block; + auto hashValue = T_HashConfig::template hash(acc, bytes); + auto startIdx = startBlockIndex(acc, blockValue, hashValue); + return wrappingLoop( + acc, + startIdx, + numBlocks(), + static_cast(nullptr), + [this, bytes, startIdx, &hashValue, blockValue](auto const& localAcc, auto const index) mutable + { + auto ptr = accessBlocks[index].create(localAcc, bytes, hashValue); + if(!ptr && index == startIdx) + { + // This is not thread-safe but we're fine with that. It's just a fuzzy thing to occasionally + // increment and it's totally okay if its value is not quite deterministic. + if(blockValue == block) + { + block = blockValue + 1; + } + } + return ptr; + }); + } + + /** + * @brief Counterpart free'ing operation to `create`. Destroys the memory at the pointer location. + * + * @param pointer A valid pointer created by `create()`.` + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(AlpakaAcc const& acc, void* pointer) -> void + { + // indexOf requires the access block size instead of blockSize in case the reinterpreted AccessBlock + // object is smaller than blockSize. + auto blockIndex = indexOf(pointer, accessBlocks, sizeof(MyAccessBlock)); + accessBlocks[blockIndex].destroy(acc, pointer); + } + + /** + * @brief Queries all access blocks how many chunks of the given chunksize they could allocate. This is + * single-threaded and NOT THREAD-SAFE but acquiring such distributed information while other threads operate + * on the heap is of limited value anyways. + * + * @param chunkSize Target would-be-created chunk size in number of bytes. + * @return The number of allocations that would still be possible with this chunk size. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlotsDeviceFunction(auto const& acc, uint32_t const chunkSize) + -> size_t + { + // TODO(lenz): Not thread-safe. + return std::transform_reduce( + accessBlocks, + accessBlocks + numBlocks(), + 0U, + std::plus{}, + [&acc, chunkSize](auto& accessBlock) { return accessBlock.getAvailableSlots(acc, chunkSize); }); + } + + /** + * @brief Forwards to `getAvailableSlotsDeviceFunction` for interface compatibility reasons. See there for + * details. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(auto const& acc, uint32_t const chunkSize) + -> size_t + { + return getAvailableSlotsDeviceFunction(acc, chunkSize); + } + + protected: + // This class is supposed to be instantiated as a parent for the `DeviceAllocator`. 
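+        // Direct construction is prevented on purpose: the Heap lives in device
+        // memory and is set up in place via init()/InitKernel.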
+ Heap() = default; + }; + + constexpr uint32_t defaultBlockSize = 128U * 1024U * 1024U; + constexpr uint32_t defaultPageSize = 128U * 1024U; + + /** + * @class DefaultHeapConfig + * @brief An example configuration for the heap. + * + * A heap configuration is supposed to provide the physical dimensions of the objects in the heap (i.e. access + * block and page) as well as a function that describes how much space you are willing to waste by allowing to + * allocate larger chunks that necessary. + * + * @tparam T_blockSize The size of one access block in bytes. + * @tparam T_pageSize The size of one page in bytes. + * @return + */ + template< + uint32_t T_blockSize = defaultBlockSize, + uint32_t T_pageSize = defaultPageSize, + uint32_t T_wasteFactor = 2U> + struct DefaultHeapConfig + { + static constexpr uint32_t const accessblocksize = T_blockSize; + static constexpr uint32_t const pagesize = T_pageSize; + static constexpr uint32_t const wastefactor = T_wasteFactor; + static constexpr bool const resetfreedpages = true; + + /** + * @brief Determine whether we want to allow an allocation of numBytes on a page with chunk size `chunkSize`. + * + * This function is given the currently requested allocation size numBytes and the set chunk size of a page. It + * answers the question whether we should consider this page for allocating this memory. It must necessarily + * return false if chunkSize < numBytes in order to avoid memory corruption. It may return true in cases where + * chunkSize > numBytes to trade off a bit of wasted memory for a performance boost while searching available + * memory. + * + * @param chunkSize Currently set chunk size of a page in number of bytes. + * @param numBytes Allocation size in number of bytes. + * @return true if the algorithm shall consider this page for allocation and false otherwise. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( + auto const& /*acc*/, + uint32_t const chunkSize, + uint32_t const numBytes) + { + return (chunkSize >= numBytes && chunkSize <= wastefactor * numBytes); + } + }; + + /** + * @class DefaultFlatterScatterHashConfig + * @brief An example configuration for the hash scattering. + * + * A scatter configuration is supposed to provide two pieces of information: A static function called `hash` and + * the compile-time constant `blockStride`. These are used by the creation policy to scatter the requests for + * memory within the heap. + * + */ + struct DefaultFlatterScatterHashConfig + { + public: + static constexpr uint32_t blockStride = 4; + + /** + * @brief Hash function to provide entropy for scattering memory requests. + * + * @param numBytes Number of bytes requested. + * @return A hash value. + */ + // TAcc is to be deduced, so we put it last. + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto hash(TAcc const& acc, uint32_t const numBytes) -> uint32_t + { + uint32_t const relative_offset = warpSize * numBytes / T_pageSize; + return ( + numBytes * hashingK + hashingDistMP * smid(acc) + + (hashingDistWP + hashingDistWPRel * relative_offset) * warpid(acc)); + } + + private: + static constexpr uint32_t hashingK = 38183; + static constexpr uint32_t hashingDistMP = 17497; + static constexpr uint32_t hashingDistWP = 1; + static constexpr uint32_t hashingDistWPRel = 1; + }; + + /** + * @class InitKernel + * @brief Kernel to initialise the heap memory. + * + * Used by the creation policy during initialisation. 
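+     * It is meant to be launched with roughly one thread per page; the
+     * grid-stride loop in Heap::init also copes with smaller launch sizes.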
+ */ + struct InitKernel + { + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator()( + auto const& acc, + Heap* m_heap, + void* m_heapmem, + size_t const m_memsize) const + { + auto const idx = alpaka::mapIdx<1U>( + alpaka::getIdx(acc), + alpaka::getWorkDiv(acc)); + if(idx == 0) + { + m_heap->accessBlocks + = static_cast::MyAccessBlock*>(m_heapmem); + m_heap->heapSize = m_memsize; + } + // We can't rely on thread 0 to finish the above before we start, so we use the static version: + Heap::init(acc, m_heapmem, m_memsize); + } + }; + +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc + +namespace mallocMC::CreationPolicies +{ + /** + * @class FlatterScatter + * @brief A creation policy scattering memory requests in a flat hierarchy. + * + * This creation policy is a variation on the original ScatterAlloc algorithm and the one previously implemented in + * mallocMC. It provides a multi-level hierarchy of Heap, AccessBlock and DataPage that is traversed using the + * metadata held by each level to find a suitable memory location to satisfy the request. + * + * It uses a externally provided hash function to compute a single hash value for each request given its requested + * number of bytes and the accelerator. This is internally used to scatter the requests over the available memory + * and thereby improve the success rate for multi-threaded requests because different threads will start searching + * in different locations. + * + * Implemented as a thin wrapper around `Heap` that mainly provides interface compatibility with the calling code. + */ + template + struct FlatterScatterImpl + { + template + using AlignmentAwarePolicy = FlatterScatterAlloc::Heap; + + static auto classname() -> std::string + { + return "FlatterScatter"; + } + + static constexpr auto const providesAvailableSlots = true; + + /** + * @brief Check if a pointer returned from `create()` signals out-of-memory. + * + * @param pointer Pointer returned by `create()`. + * @return The boolean answer to this question. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto isOOM(void* pointer, uint32_t const /*unused size*/) -> bool + { + return pointer == nullptr; + } + + /** + * @brief initialise a raw piece of memory for use by the `Heap`. + * + * @param dev The alpaka device. + * @param queue The alpaka queue. + * @param heap The pointer to the `Heap` object located on the device. + * @param pool The pointer to the provided memory pool to be used by the `Heap` object. + * @param memsize The size of the pool memory in bytes. + */ + template + static void initHeap([[maybe_unused]] auto& dev, auto& queue, auto* heap, void* pool, size_t memsize) + { + using MyHeap = FlatterScatterAlloc::Heap; + auto numBlocks = MyHeap::numBlocks(memsize); + if(numBlocks == 0U) + { + // This is not just an optimisation. The call to `getValidWorkDiv` below really dislikes the 0 extent + // that we'd give it, so better stop here to not run into division by zero. + return; + } + auto numPagesPerBlock = MyHeap::MyAccessBlock::numPages(); + + alpaka::KernelCfg const kernelCfg + = {numBlocks * numPagesPerBlock, 1U, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + auto workDiv + = alpaka::getValidWorkDiv(kernelCfg, dev, FlatterScatterAlloc::InitKernel{}, heap, pool, memsize); + alpaka::exec(queue, workDiv, FlatterScatterAlloc::InitKernel{}, heap, pool, memsize); + alpaka::wait(queue); + } + + /** + * @brief Count the number of possible allocation for the given slotSize directly from the host. 
+ * + * This method implements the infrastructure to call `getAvailableSlotsDeviceFunction` on the `Heap` class. See + * there for details, particularly concerning the thread-safety of this. + * + * @param dev The alpaka device. + * @param queue The alpaka queue. + * @param slotSize The would-be-created memory size in number of bytes. + * @param heap Pointer to the `Heap` object that's supposed to handle the request. + * @return The number of allocations that would be successful with this slotSize. + */ + template + static auto getAvailableSlotsHost( + AlpakaDevice& dev, + AlpakaQueue& queue, + uint32_t const slotSize, + T_DeviceAllocator* heap) -> unsigned + { + using Dim = typename alpaka::trait::DimType::type; + using Idx = typename alpaka::trait::IdxType::type; + using VecType = alpaka::Vec; + + auto d_slots = alpaka::allocBuf(dev, uint32_t{1}); + alpaka::memset(queue, d_slots, 0, uint32_t{1}); + auto d_slotsPtr = alpaka::getPtrNative(d_slots); + + auto getAvailableSlotsKernel = [heap, slotSize, d_slotsPtr] ALPAKA_FN_ACC(AlpakaAcc const& acc) -> void + { *d_slotsPtr = heap->getAvailableSlotsDeviceFunction(acc, slotSize); }; + + alpaka::wait(queue); + alpaka::exec( + queue, + alpaka::WorkDivMembers{VecType::ones(), VecType::ones(), VecType::ones()}, + getAvailableSlotsKernel); + alpaka::wait(queue); + + auto const platform = alpaka::Platform{}; + auto const hostDev = alpaka::getDevByIdx(platform, 0); + + auto h_slots = alpaka::allocBuf(hostDev, Idx{1}); + alpaka::memcpy(queue, h_slots, d_slots); + alpaka::wait(queue); + + return *alpaka::getPtrNative(h_slots); + } + }; + + template< + typename T_HeapConfig = FlatterScatterAlloc::DefaultHeapConfig<>, + typename T_HashConfig = FlatterScatterAlloc::DefaultFlatterScatterHashConfig, + typename T_AlignmentPolicy = void> + struct FlatterScatter + { + template + using AlignmentAwarePolicy = FlatterScatterImpl; + + struct Properties + { + using HeapConfig = T_HeapConfig; + using HashConfig = T_HashConfig; + }; + }; + + +} // namespace mallocMC::CreationPolicies diff --git a/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp b/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp new file mode 100644 index 0000000000..cf6ad651a1 --- /dev/null +++ b/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp @@ -0,0 +1,858 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" +#include "mallocMC/mallocMC_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + + /** + * @class PageTable + * @brief Storage for AccessBlock's metadata + */ + template + struct PageTable + { + uint32_t chunkSizes[T_numPages]{}; + uint32_t fillingLevels[T_numPages]{}; + + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanup() -> void + { + std::fill(std::begin(chunkSizes), std::end(chunkSizes), 0U); + std::fill(std::begin(fillingLevels), std::end(fillingLevels), 0U); + } + }; + + /** + * @class Padding + * @brief Empty memory to pad the AccessBlock to the correct size + */ + template + struct Padding + { + char padding[T_size]{}; + }; + + /** + * @brief The C++ standard disallows zero-size arrays, so we specialise for this case. + */ + template<> + struct Padding<0U> + { + }; + + /** + * @class AccessBlock + * @brief Coarsest memory division unit containing fixed-size pages of raw memory and metadata about their chunk + * size and filling level + * + * @tparam T_HeapConfig A struct with compile-time information about the setup + * @tparam T_AlignmentPolicy The alignment policy in use for optimisation purposes + */ + template + class AccessBlock + { + protected: + static constexpr uint32_t const blockSize = T_HeapConfig::accessblocksize; + static constexpr uint32_t const pageSize = T_HeapConfig::pagesize; + static constexpr uint32_t const wasteFactor = T_HeapConfig::wastefactor; + static constexpr bool const resetfreedpages = T_HeapConfig::resetfreedpages; + + using MyPageInterpretation = PageInterpretation; + + // This class is supposed to be reinterpeted on a piece of raw memory and not instantiated directly. We set it + // protected, so we can still test stuff in the future easily. + AccessBlock(auto const& acc) + { + init(acc); + } + + public: + /** + * @brief Single-threaded initialisation loop. Used only for testing. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init(auto const& acc) -> void + { + for(uint32_t i = 0; i < numPages(); i++) + { + init(acc, i); + } + } + + /** + * @brief Initialise the page given by its index. 0th also initialises the pageTable. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init(auto const& /*acc*/, auto const pageIdx) -> void + { + if(pageIdx == 0U) + { + pageTable.cleanup(); + } + constexpr uint32_t dummyChunkSize = 1U; + if(pageIdx < numPages()) + { + interpret(pageIdx, dummyChunkSize).cleanupFull(); + } + } + + /** + * @brief Compute the number of pages in the access block taking into account the space needed for metadata. + * + * @return The number of pages in the access block. 
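+         * (Editorial illustration with the default configuration: blockSize ==
+         * 128 MiB and pageSize == 128 KiB, so each page costs
+         * pageSize + sizeof(PageTable<1>) == 131072 + 8 bytes, giving
+         * numPages() == 134217728 / 131080 == 1023.)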
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto numPages() -> uint32_t + { + constexpr auto numberOfPages = blockSize / (pageSize + sizeof(PageTable<1>)); + // check that the page table entries does not have a padding + static_assert(sizeof(PageTable) == numberOfPages * sizeof(PageTable<1>)); + return numberOfPages; + } + + /** + * @brief Answers the question: How many successful allocations with the given size are still possible? + * CAUTION: Not thread-safe! + * + * This method looks up the metadata for all its pages and computes the number of available slots with the + * given chunk size. By doing so, the information this method is queried for is inherently not thread-safe + * because if other threads are (de-)allocating memory during this look up, the information about each + * individual page will be stale as soon as it is retrieved. However, beyond this inherent non-thread-safety we + * made no effort so far to leverage parallelism or make it use atomics, i.e., move into the direction of + * consistency in the multi-threaded case. It is supposed to run in a single thread without any interference. + * + * @param chunkSize The number of bytes the would-be allocations request + * @return The number of available slots with this chunk size. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlots(auto const& acc, uint32_t const chunkSize) const + -> uint32_t + { + if(chunkSize < multiPageThreshold()) + { + return getAvailableChunks(acc, chunkSize); + } + return getAvailableMultiPages(acc, chunkSize); + } + + /** + * @brief Compute the index of the page a pointer points to. + * + * @param pointer Memory location inside of the data part of this access block. + * @return The index of the page this pointer points to. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto pageIndex(void* pointer) const -> int32_t + { + return indexOf(pointer, pages, pageSize); + } + + /** + * @brief Verifies that a pointer points to a valid piece of memory. CAUTION: Not thread-safe! + * + * This method checks if a pointer is valid, meaning that it points to a chunk of memory that is marked as + * allocated. The information it provides is inherently not thread-safe because if other threads are operating + * on the memory, the retrieved information is stale the moment it was looked up. It is, however, consistent in + * that it uses atomics to retrieve this information, so if the pointer is valid and does not get destroyed + * between looking up the answer and using it (for example in the scenario where I'm the only one knowing about + * this pointer), the answer is valid. + * + * @param pointer Pointer to validate + * @return true if the pointer is valid else false + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, void* const pointer) -> bool + { + if(pointer == nullptr) + { + return false; + } + auto const index = pageIndex(pointer); + auto chunkSize = atomicLoad(acc, pageTable.chunkSizes[index]); + if(chunkSize >= pageSize) + { + return true; + } + return chunkSize == 0U or atomicLoad(acc, pageTable.fillingLevels[index]) == 0U + ? false + : interpret(index, chunkSize).isValid(acc, pointer); + } + + /** + * @brief Allocate a piece of memory of the given size. + * + * This method attempts to allocate a piece of memory of (at least) numBytes bytes. The actual size might be + * larger (depending on the user-provided compile-time configuration of the AccessBlock) but is not + * communicated, so it is not allowed to access the pointer outside the requested range. 
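+         * (For instance, with the default wastefactor of 2, a 17-byte request
+         * may be served from a page whose chunk size is 32 bytes, yet only the
+         * requested 17 bytes may be accessed.)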
It returns a nullptr + * if there is no memory available. The hashValue is used to scatter memory accesses. A cheap operation will be + * performed to transform it into a page index to start the search at. It is also handed to the lower levels to + * be used similarly. Having it default to 0 makes it easier for testing. The effect of this method is reverted + * by the destroy method. + * + * @param numBytes Required size of memory in bytes + * @param hashValue Optional number to scatter memory access. + * @return A pointer to an allocated piece of memory or nullptr if no memory is available + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create( + TAcc const& acc, + uint32_t const numBytes, + uint32_t const hashValue = 0U) -> void* + { + void* pointer{nullptr}; + if(numBytes >= multiPageThreshold()) + { + pointer = createOverMultiplePages(acc, numBytes, hashValue); + } + else + { + pointer = createChunk(acc, numBytes, hashValue); + } + return pointer; + } + + /** + * @brief Free up the memory a valid pointer points to. + * + * This method attempts to destroy the memory of a valid pointer created by the create method. It reverses the + * effect of the create method and makes the allocated memory available for re-allocation. After calling this + * method on a pointer it is invalid and may no longer be used for memory access. Invalid pointers are ignored + * and a failure of this method is not communicated in production. In debug mode various exceptions can be + * thrown for different forms of invalid pointers. + * + * @param pointer A pointer created by the create method. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* const pointer) -> void + { + // CAUTION: This memfence is of utmost importance! As we are allowing a re-use of the chunk we're about to + // free, we need to make sure that any memory operation from the previous thread is executed before we can + // safely consider it free. If this is missing, an extended (non-atomic) write operation might not yet have + // finished when we unset the bit. In such a case, another thread might start using the memory while we're + // still writing to it, thus corrupting the new thread's data. It might even lead to us overwriting the + // bitmask itself, if the chunk size (and thereby the extent of the bitmask) changes before we finish. + // (The latter scenario might be excluded by other mem_fences in the code.) If a read is pending, the old + // thread might read data from the new thread leading to inconsistent information in the first thread. + alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); + + auto const index = pageIndex(pointer); + if(index >= static_cast(numPages()) || index < 0) + { +#if (!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + throw std::runtime_error{ + "Attempted to destroy an invalid pointer! Pointer does not point to any page."}; +#endif // NDEBUG + return; + } + auto const chunkSize = atomicLoad(acc, pageTable.chunkSizes[index]); + if(chunkSize >= multiPageThreshold()) + { + destroyOverMultiplePages(acc, index, chunkSize); + } + else + { + destroyChunk(acc, pointer, index, chunkSize); + } + } + + private: + DataPage pages[numPages()]{}; + PageTable pageTable{}; + Padding padding{}; + + /** + * @brief The number of bytes at which allocation switch to "multi-page mode", i.e., allocate full pages. + * + * It is obvious that this number can be at most page size subtracted by the size of one bit mask. 
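+         * (The implementation below settles on half of that upper bound, namely
+         * ceilingDivision(pageSize - sizeof(BitMaskStorageType<>), 2U).)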
There is, + * however, no strict lower bound because we theoretically disregard the lower levels completely by this + * switch. If we reasonably assume that our lower hierarchy levels add value (i.e. performance) to our + * implementation, a reasonable lower bound would be the size at which only a single allocation fits onto a + * page. This method could be used for fine-tuning performance in that sense. + * + * @return The number of bytes at which to switch to multi-page mode. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto multiPageThreshold() -> uint32_t + { + return ceilingDivision(pageSize - sizeof(BitMaskStorageType<>), 2U); + } + + /** + * @brief Convenience method that creates a PageInterpretation from a page identified by its page index and a + * chunk size. + * + * @param pageIndex Identifies the page in the array of raw pages. + * @param chunkSize Chunk size for which to interpret the page. + * @return A page interpretation of the requested page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto interpret(uint32_t const pageIndex, uint32_t const chunkSize) + { + return MyPageInterpretation(pages[pageIndex], chunkSize); + } + + /** + * @brief Branch of getAvailableSlots for chunk sizes below the multi-page threshold. See there for details. + * + * @param chunkSize Would-be allocation size to test for. + * @return Number of allocations that would succeed with this size. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableChunks(auto const& acc, uint32_t const chunkSize) const + -> uint32_t + { + // TODO(lenz): This is not thread-safe! + return std::transform_reduce( + std::cbegin(pageTable.chunkSizes), + std::cend(pageTable.chunkSizes), + std::cbegin(pageTable.fillingLevels), + 0U, + std::plus{}, + [this, &acc, chunkSize](auto const localChunkSize, auto const fillingLevel) + { + auto const numChunks + = MyPageInterpretation::numChunks(localChunkSize == 0 ? chunkSize : localChunkSize); + return ((this->isInAllowedRange(acc, localChunkSize, chunkSize) or localChunkSize == 0U) + and fillingLevel < numChunks) + ? numChunks - fillingLevel + : 0U; + }); + } + + /** + * @brief Branch of getAvailableSlots for chunk sizes above the multi-page threshold. See there for details. + * + * @param chunkSize Would-be allocation size to test for. + * @return Number of allocations that would succeed with this size. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableMultiPages(auto const& /*acc*/, uint32_t const chunkSize) const + -> uint32_t + { + // TODO(lenz): This is not thread-safe! + auto numPagesNeeded = ceilingDivision(chunkSize, pageSize); + if(numPagesNeeded > numPages()) + { + return 0U; + } + uint32_t sum = 0U; + for(uint32_t i = 0; i < numPages() - numPagesNeeded + 1;) + { + if(std::all_of( + pageTable.chunkSizes + i, + pageTable.chunkSizes + i + numPagesNeeded, + [](auto const& val) { return val == 0U; })) + { + sum += 1; + i += numPagesNeeded; + } + else + { + ++i; + } + } + return sum; + } + + /** + * @brief Creation algorithm in multi-page mode. + * + * In this mode, we have decided to ignore all the lower level hierarchy. The algorithm simplifies accordingly + * and a few optimisations can be done. It can however be quite cumbersome to find a sufficient number of + * contiguous pages, so this will likely be most performant for small sizes. + * + * @param numBytes Required allocation size in number of bytes. + * @param hashValue A hash value used to scatter memory access. + * @return Pointer to a valid piece of memory or nullptr if no such memory was found. 
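+         * ("Small sizes" above means allocations spanning only a few pages: the
+         * fewer contiguous pages are required, the cheaper the search.)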
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto createOverMultiplePages( + auto const& acc, + uint32_t const numBytes, + uint32_t hashValue) -> void* + { + auto numPagesNeeded = ceilingDivision(numBytes, +pageSize); + if(numPagesNeeded > numPages()) + { + return static_cast(nullptr); + } + + // We take a little head start compared to the chunked case in order to not have them interfere with our + // laborious search for contiguous pages. + auto startIndex = startPageIndex(acc, hashValue) + numPagesNeeded; + return wrappingLoop( + acc, + startIndex, + numPages() - (numPagesNeeded - 1), + static_cast(nullptr), + [&](auto const& localAcc, auto const& firstIndex) + { + void* result{nullptr}; + auto numPagesAcquired = acquirePages(localAcc, firstIndex, numPagesNeeded); + if(numPagesAcquired == numPagesNeeded) + { + // At this point, we have acquired all the pages we need and nobody can mess with them anymore. + // We still have to set the chunk size correctly. + setChunkSizes(localAcc, firstIndex, numPagesNeeded, numBytes); + result = &pages[firstIndex]; + } + else + { + releasePages(localAcc, firstIndex, numPagesAcquired); + } + return result; + }); + } + + /** + * @brief Short-circuiting acquisition of multiple contiguous pages. + * + * The algorithm attempts to acquire the requested number of pages starting from firstIndex locking them by + * setting their filling level to page size. It returns when either all requested pages are acquired or an + * already occupied page was hit. In either case, it returns the number of successful acquisitions. This method + * does not clean up after itself, i.e., it does not release the pages in case of failure. + * + * @param firstIndex Start index of the array of contiguous pages. + * @param numPagesNeeded Number of pages to be acquired. + * @return Number of pages that were successfully acquired. This is smaller than numPagesNeeded if the method + * failed. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto acquirePages( + auto const& acc, + uint32_t const firstIndex, + uint32_t const numPagesNeeded) -> uint32_t + { + uint32_t index = 0U; + uint32_t oldFilling = 0U; + for(index = 0U; index < numPagesNeeded; ++index) + { + oldFilling = alpaka::atomicCas(acc, &pageTable.fillingLevels[firstIndex + index], 0U, +pageSize); + if(oldFilling != 0U) + { + break; + } + } + return index; + } + + /** + * @brief Counterpart to acquirePages for doing the clean-up in case of failure. + * + * This method starts from page firstIndex and releases the lock of numPagesAcquired contiguous pages. This is + * supposed to be called in the case of failure of acquirePages to release the already acquired pages. + * + * @param firstIndex Start index of the array of contiguous pages. + * @param numPagesAcquired Number of pages to be released. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto releasePages( + auto const& acc, + uint32_t const firstIndex, + uint32_t const numPagesAcquired) -> void + { + for(uint32_t index = 0U; index < numPagesAcquired; ++index) + { + alpaka::atomicSub(acc, &pageTable.fillingLevels[firstIndex + index], +pageSize); + } + } + + /** + * @brief Set the chunk sizes of a contiguous array of pages. + * + * This function assumes that all the pages are locked by the current thread and performs a hard set operation + * without checking the previous content. + * + * @param firstIndex Start index of the contiguous array of pages. + * @param numPagesNeeded The number of pages to set the chunk size on. + * @param numBytes Chunk size to be set in number of bytes. 
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto setChunkSizes( + auto const& acc, + uint32_t const firstIndex, + uint32_t const numPagesNeeded, + uint32_t const numBytes) -> void + { + for(uint32_t numPagesAcquired = 0U; numPagesAcquired < numPagesNeeded; ++numPagesAcquired) + { + // At this point in the code, we have already locked all the pages. So, we literally don't care what + // other threads thought this chunk size would be because we are the only ones legitimately messing + // with this page. This chunk size may be non-zero because we could have taken over a page before it + // was properly cleaned up. That is okay for us because we're handing out uninitialised memory anyways. + // But it is very important to record the correct chunk size here, so the destroy method later on knows + // how to handle this memory. + alpaka::atomicExch(acc, &pageTable.chunkSizes[firstIndex + numPagesAcquired], numBytes); + } + } + + /** + * @brief Special return value for an unsuccessful search of available pages. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto noFreePageFound() + { + return numPages(); + } + + /** + * @brief Compute an index where to start searching for a free page from a hash value. + * + * @param hashValue Hash value to introduce some entropy here. + * @return Start index for searching a free page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto startPageIndex(auto const& /*acc*/, uint32_t const hashValue) + { + return (hashValue >> 8U) % numPages(); + } + + /** + * @brief Helper that combines the necessary checks to ensure a page index is valid. + * + * @param index The page index to check. + * @return true if the page index is valid and false otherwise + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValidPageIdx(uint32_t const index) const -> bool + { + return index != noFreePageFound() && index < numPages(); + } + + /** + * @brief Main algorithm to create a chunk of memory on a page. + * + * This is the main algorithm for creating a chunk of memory. It searches for a free page and instructs it to + * create some memory. If successful, it returns this pointer. If not, it searches on. + * + * @param numBytes Number of bytes required. + * @param hashValue A hash value used to scatter the memory accesses. + * @return A pointer to a valid piece of memory or nullptr if no available memory could be found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto createChunk( + TAcc const& acc, + uint32_t const numBytes, + uint32_t const hashValue) -> void* + { + auto index = startPageIndex(acc, hashValue); + + // Under high pressure, this loop could potentially run for a long time because the information where and + // when we started our search is not maintained and/or used. This is a feature, not a bug: Given a + // consistent state, the loop will terminate once a free chunk is found or when all chunks are filled for + // long enough that `choosePage` could verify that each page is filled in a single run. + // + // The seemingly non-terminating behaviour that we wrap around multiple times can only occur (assuming a + // consistent, valid state of the data) when there is high demand for memory such that pages that appear + // free to `choosePage` are repeatedly found but then the free chunks are scooped away by other threads. + // + // In the latter case, it is considered desirable to wrap around multiple times until the thread was fast + // enough to acquire some memory. + void* pointer = nullptr; + do + { + // TODO(lenz): This can probably be index++. 
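+                // Advance by one page so that the page on which the previous
+                // attempt failed is not immediately retried.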
+ index = (index + 1) % numPages(); + uint32_t chunkSize = numBytes; + index = choosePage(acc, numBytes, index, chunkSize); + if(isValidPageIdx(index)) + { + pointer = MyPageInterpretation{pages[index], chunkSize}.create(acc, hashValue); + if(pointer == nullptr) + { + leavePage(acc, index); + } + } + } while(isValidPageIdx(index) and pointer == nullptr); + return pointer; + } + + /** + * @brief Main loop running over all pages checking for available ones. + * + * It is important to stress that the information about availability of the returned page is already stale when + * it is returned. Thus, it can well happen that an actual allocation attempt on this page still fails, e.g., + * because another thread was faster and scooped away that piece of memory. + * + * @param numBytes Required allocation size in number of bytes. + * @param startIndex Index of the page to start the search from. + * @param chunkSizeCache A memory location to store a local copy of the current chunk size. Used for + * optimisation by reducing the number of atomic lookups. + * @return A page index to a potntially available page or noFreePageFound() if none was found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto choosePage( + TAcc const& acc, + uint32_t const numBytes, + uint32_t const startIndex, + uint32_t& chunkSizeCache) -> uint32_t + { + return wrappingLoop( + acc, + startIndex, + numPages(), + noFreePageFound(), + [this, numBytes, &chunkSizeCache](auto const& localAcc, auto const index) + { + return this->thisPageIsSuitable(localAcc, index, numBytes, chunkSizeCache) ? index + : noFreePageFound(); + }); + } + + /** + * @brief Helper function combining checks to match the requested number of bytes with a found chunk size + * taking into account the waste factor. + * + * @param chunkSize Actually found chunk sizes of a page in number of bytes + * @param numBytes Requested allocation size in number of bytes. + * @return + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isInAllowedRange( + auto const& acc, + uint32_t const chunkSize, + uint32_t const numBytes) const + { + return T_HeapConfig::isInAllowedRange(acc, chunkSize, numBytes); + } + + /** + * @brief Checks if a page is usable for allocation of numBytes and enters it. + * + * This method looks up the metdata of the page identified by its index to check if we can hope for a + * successful allocation there. In doing so, it enters the page (i.e. increments its filling level) and, if + * necessary, already sets the correct chunk size. In a multi-threaded context the separate concerns of + * checking and setting cannot be split because the information used for the check would already be stale at + * the time of setting anything. If it returns true, the filling level and chunk sizes are thus suitable for + * proceeding further and the caller is responsible for cleaning up appropriately if a failure at a later stage + * occurs. If it returns false, it has already cleaned up everything itself and there is no further action + * required on the caller's side. + * + * @param index Index to identify the page among the raw data pages. + * @param numBytes Requested allocation size in number of bytes. + * @param chunkSizeCache A memory location to store a local copy of the current chunk size. Used for + * optimisation by reducing the number of atomic lookups. 
+ * @return true if the page is suitable and false otherwise + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto thisPageIsSuitable( + TAcc const& acc, + uint32_t const index, + uint32_t const numBytes, + uint32_t& chunkSizeCache) -> bool + { + bool suitable = false; + auto oldFilling = enterPage(acc, index); + + // At this point, we're only testing against our desired `numBytes`. Due to the `wastefactor` the actual + // `chunkSize` of the page might be larger and, thus, the actual `numChunks` might be smaller than what + // we're testing for here. But if this fails already, we save one atomic. + if(oldFilling < MyPageInterpretation::numChunks(numBytes)) + { + uint32_t oldChunkSize = alpaka::atomicCas(acc, &pageTable.chunkSizes[index], 0U, numBytes); + chunkSizeCache = oldChunkSize == 0U ? numBytes : oldChunkSize; + + // Now that we know the real chunk size of the page, we can check again if our previous assessment was + // correct. But first we need to make sure that we are actually in chunked mode. This will be redundant + // with the second check in most situations because we usually would choose a multi-page threshold that + // would not switch to multi-page mode while more than one chunk fits on the page but this is a design + // decision that could change in the future. + if(oldChunkSize < multiPageThreshold() + and oldFilling < MyPageInterpretation::numChunks(chunkSizeCache)) + { + suitable = isInAllowedRange(acc, chunkSizeCache, numBytes); + } + } + if(not suitable) + { + leavePage(acc, index); + } + return suitable; + } + + /** + * @brief Counterpart to createChunk freeing up a piece of memory in the chunked mode. See destroy for details. + * + * This is the most difficult part of the algorithm. We will successively remove our metadata from the various + * levels and must be extra careful which information we can still rely on. Most of this complexity is captured + * in leavePage. + * + * @param pointer Pointer to a valid piece of memory created by createChunk. + * @param pageIndex Index of the page the pointer points to. Supplying this is an optimisation because it was + * already computed on a higher level in the call stack. This information would already be contained in + * pointer. + * @param chunkSize Chunk size of the page we're operating on. This is potentially different from the size of + * memory the pointer points to due to the waste factor. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC void destroyChunk( + TAcc const& acc, + void* pointer, + uint32_t const pageIndex, + uint32_t const chunkSize) + { + auto page = interpret(pageIndex, chunkSize); + page.destroy(acc, pointer); + leavePage(acc, pageIndex); + } + + /** + * @brief Enter a page for any purpose. + * + * This method is very important. We maintain the invariant that any potentially writing access to a page + * starts by entering and ends by leaving a page. These are currently implemented as updating the filling level + * accordingly. You are not allowed to touch a page unless you have entered it (although multi-page mode uses a + * shortcut here). This implies that we always have to check the filling level before checking for the chunk + * size. + * + * @param pageIndex Identifies the page in the array of raw data pages. + * @return The old filling level for further checks. 
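+         * (A return value of 0 means we are the first thread on the page and may
+         * still have to set its chunk size, see thisPageIsSuitable().)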
+ */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto enterPage(TAcc const& acc, uint32_t const pageIndex) -> uint32_t + { + auto const oldFilling = alpaka::atomicAdd(acc, &pageTable.fillingLevels[pageIndex], 1U); + // We assume that this page has the correct chunk size. If not, the chunk size is either 0 (and oldFilling + // must be 0, too) or the next check will fail. + return oldFilling; + } + + /** + * @brief Leave a page after any potentially modifying operation on it. + * + * This method must be called whenever you have entered a page (using enterPage()). This is a very subtle and + * error-prone method because we are successively removing metadata and need to be extra careful which + * information and guards we can still trust. In the simplest case, there's not much to do but decrease the + * filling level but potentially we're the last thread on the page and need to clean up remaining metadata for + * the threads to come. In that case, we explicitly allow for threads to take over the page as-is to spare us + * the trouble of cleaning up. But doing so opens up many subtle ways of reordering memory accesses. Also, we + * cannot rely in much previous information (like chunk sizes looked up earlier) because other threads might + * have already updated them. Be warned! + * + * @param pageIndex Identifies the page in the array of raw data pages. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC void leavePage(TAcc const& acc, uint32_t const pageIndex) + { + // This outermost atomicSub is an optimisation: We can fast-track this if we are not responsible for the + // clean-up. Using 0U -> 1U in the atomicCAS and comparison further down would have the same effect (if the + // else branch contained the simple subtraction). It's a matter of which case shall have one operation + // less. + auto originalFilling = alpaka::atomicSub(acc, &pageTable.fillingLevels[pageIndex], 1U); + + if constexpr(resetfreedpages) + { + if(originalFilling == 1U) + { + // CAUTION: This section has caused a lot of headaches in the past. We're in a state where the + // filling level is 0 but we have not properly cleaned up the page and the metadata yet. This is on + // purpose because another thread might still take over this page and spare us the trouble of + // freeing everything up properly. But this other thread must take into account the possibility + // that it acquired a second-hand page. Look here if you run into another deadlock. It might well + // be related to this section. + + auto lock = pageSize; + auto latestFilling = alpaka::atomicCas(acc, &pageTable.fillingLevels[pageIndex], 0U, lock); + if(latestFilling == 0U) + { + auto chunkSize = alpaka::atomicExch(acc, &pageTable.chunkSizes[pageIndex], 0U); + + // If the chunkSize is found to be 0, another thread has already cleaned-up everything and + // we're done here. Otherwise, we're responsible and have to clean up. + // + // CAUTION: It is of utmost importance that we use the result of the atomic exchange here. This + // is to ensure that it has been evaluated and observed by other threads before we continue + // beyond this point (because we can only know the return value after we have evaulated it). + // (Although admittedly in this version, the existence of the mem_fence further below probably + // has a similar effect.) + // + // In a previous version, there were situations in which the change of the chunk size and the + // release of the lock further below were independent of each other. 
+                        // In this case, their execution could be observed in reverse order by other threads,
+                        // which led to the lock being observed as released before the chunk size was actually
+                        // reset. When the chunk size setting finally arrived, it could corrupt the metadata
+                        // another thread was already relying on, leading to bad memory bugs.
+                        if(chunkSize != 0)
+                        {
+                            // At this point it's guaranteed that the filling level is numChunks and thereby
+                            // locked. Furthermore, chunkSize cannot have changed because we maintain the
+                            // invariant that the filling level is always considered first, so no other thread
+                            // can have passed that barrier to reset it.
+                            MyPageInterpretation{pages[pageIndex], chunkSize}.cleanupUnused();
+                            alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
+                        }
+
+                        // At this point, there might already be another thread (with another chunkSize) on this
+                        // page but that's fine. It will see the lock and retreat.
+                        alpaka::atomicSub(acc, &pageTable.fillingLevels[pageIndex], lock);
+                    }
+                }
+            }
+        }
+
+        /**
+         * @brief Counterpart to createOverMultiplePages, freeing up memory in multi-page mode.
+         *
+         * This method is way simpler than its chunked version because in multi-page mode we hold a hard lock on
+         * the pages we acquired and are free to manipulate them at will. We just make sure that releasing this
+         * lock is the last operation we perform.
+         *
+         * @param pageIndex Identifies the first page in the array of raw data pages.
+         * @param chunkSize The chunk size set on that first page (i.e. the original allocation size).
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC void destroyOverMultiplePages(
+            auto const& acc,
+            uint32_t const pageIndex,
+            uint32_t const chunkSize)
+        {
+            auto numPagesNeeded = ceilingDivision(chunkSize, pageSize);
+            for(uint32_t i = 0; i < numPagesNeeded; ++i)
+            {
+                auto myIndex = pageIndex + i;
+                // Everything inside the following scope is done to reset the freed pages. As opposed to the
+                // chunked case, we decided to always perform a reset in multi-page mode regardless of the value
+                // of `resetfreedpages`. If you want to reinstate the old behaviour or add a second parameter
+                // specifically for multi-page mode, e.g., resetfreedpages_multipage, just put an `if constexpr`
+                // around here again.
+                {
+                    MyPageInterpretation{pages[myIndex], T_AlignmentPolicy::Properties::dataAlignment}
+                        .cleanupFull();
+                    alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
+                    alpaka::atomicCas(acc, &pageTable.chunkSizes[myIndex], chunkSize, 0U);
+                }
+                alpaka::atomicSub(acc, &pageTable.fillingLevels[myIndex], +pageSize);
+            }
+        }
+    };
+
+} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc
diff --git a/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp b/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp
new file mode 100644
index 0000000000..c7596c072d
--- /dev/null
+++ b/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp
@@ -0,0 +1,533 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+ + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp" +#include "mallocMC/mallocMC_utils.hpp" + +#include +#include + +#include + +#include +#include +#include +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + namespace detail + { + template + struct BitMaskStorageTypes + { + using type = void; + }; + + template<> + struct BitMaskStorageTypes<16U> + { + using type = uint16_t; + }; + + template<> + struct BitMaskStorageTypes<32U> + { + using type = uint32_t; + }; + + template<> + struct BitMaskStorageTypes<64U> + { + using type = uint64_t; + }; + } // namespace detail + + /** + * @brief Number of bits in a bit mask. Most likely you want a power of two here. + */ + constexpr uint32_t const BitMaskSize = 32U; + + /** + * @brief Type to store the bit masks in. It's implemented as a template in order to facilitate changing the type + * depending on BitMaskSize. Use it with its default template argument in order to make your code agnostic of the + * number configured in BitMaskSize. (Up to providing a template implementation, of course.) + */ + template + using BitMaskStorageType = detail::BitMaskStorageTypes::type; + + /** + * @brief Represents a completely filled bit mask, i.e., all bits are one. + */ + template + static constexpr BitMaskStorageType const allOnes = std::numeric_limits>::max(); + + /** + * @brief Return the bit mask's underlying type with a single bit set (=1) at position index and all others unset + * (=0). + * + * @param index Position of the single bit set. + * @return Bit mask's underlying type with one bit set. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto singleBit(uint32_t const index) -> BitMaskStorageType + { + return BitMaskStorageType{1U} << index; + } + + /** + * @brief Return the bit mask's underlying type with all bits up to index from the right are set (=1) and all + * higher bits are unset (=0). + * + * @param index Number of set bits. + * @return Bit mask's underlying type with index bits set. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto allOnesUpTo(uint32_t const index) -> BitMaskStorageType + { + return index == 0 ? 0 : (allOnes >> (size - index)); + } + + /** + * @class BitMaskImpl + * @brief Represents a bit mask basically wrapping the BitMaskStorageType<>. 
+ * + * This class basically provides a convenience interface to the (typically integer) type BitMaskStorageType<> for + * bit manipulations. It was originally modelled closely after std::bitset which is not necessarily available on + * device for all compilers, etc. + * + * Convention: We start counting from the right, i.e., if mask[0] == 1 and all others are 0, then mask = 0...01 + * + * CAUTION: This convention is nowhere checked and we might have an implicit assumption on the endianess here. We + * never investigated because all architectures we're interested in have the same endianess and it works on them. + * + */ + template + struct BitMaskImpl + { + BitMaskStorageType mask{}; + + /** + * @return An invalid bit index indicating the failure of a search in the bit mask. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto noFreeBitFound() -> uint32_t + { + return MyBitMaskSize; + } + + /** + * @brief Look up if the index-th bit is set. + * + * @param index Bit position to check. + * @return true if bit is set else false. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto const index) -> bool + { + return (atomicLoad(acc, mask) & singleBit(index)) != BitMaskStorageType{0U}; + } + + /** + * @brief Set all bits (to 1). + * + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto set(TAcc const& acc) -> BitMaskStorageType + { + return alpaka::atomicOr( + acc, + &mask, + static_cast>(+allOnes)); + } + + /** + * @brief Set the index-th bit (to 1). + * + * @param index Bit position to set. + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto set(TAcc const& acc, auto const index) + { + return alpaka::atomicOr(acc, &mask, singleBit(index)); + } + + /** + * @brief Unset the index-th bit (set it to 0). + * + * @param index Bit position to unset. + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto unset(TAcc const& acc, auto const index) + { + return alpaka::atomicAnd( + acc, + &mask, + static_cast>( + allOnes ^ singleBit(index))); + } + + /** + * @brief Flip all bits in the mask. + * + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto flip(TAcc const& acc) + { + return alpaka::atomicXor( + acc, + &mask, + static_cast>(+allOnes)); + } + + /** + * @brief Flip the index-th bits in the mask. + * + * @param index Bit position to flip. + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto flip(TAcc const& acc, auto const index) + { + return alpaka::atomicXor( + acc, + &mask, + static_cast>(singleBit(index))); + } + + /** + * @brief Compare with another mask represented by a BitMaskStorageType<>. CAUTION: This does not use atomics + * and is not thread-safe! + * + * This is not implemented thread-safe because to do so we'd need to add the accelerator as a function argument + * and that would not abide by the interface for operator==. It's intended use is to make (single-threaded) + * tests more readable, so that's not an issue. + * + * @param other Mask to compare with. + * @return true if all bits are identical else false. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator==(BitMaskStorageType const other) const -> bool + { + return (mask == other); + } + + /** + * @brief Spaceship operator comparing with other bit masks. CAUTION: This does not use atomics and is not + * thread-safe! See operator== for an explanation. + * + * @param other Bit mask to compare with. 
+ * @return Positive if this mask > other mask, 0 for equality, negative otherwise. + */ + // My version of clang cannot yet handle the spaceship operator apparently: + // clang-format off + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator<=> (BitMaskImpl const other) const + // clang-format on + { + return (mask - other.mask); + } + + /** + * @brief Check if no bit is set (=1). + * + * @return true if no bit is set else false. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto none() const -> bool + { + return mask == 0U; + } + + /** + * @brief Interface to the main algorithm of finding a free bit. + * + * This algorithm searches for an unset bit and returns its position as an index (which is supposed to be + * translated into a corresponding chunk by the PageInterpretation). Upon doing so, it also sets this bit + * because in a multi-threaded context we cannot separate the concerns of retrieving information and acting on + * the information. It takes a start index that acts as an initial guess but (in the current implementation) it + * does not implement a strict wrapping loop as the other stages do because this would waste valuable + * information obtained from the collective operation on all bits in the mask. + * + * Additionally, it copes with partial masks by ignoring all bit positions beyond numValidBits. + * + * @param numValidBits Bit positions beyond this number will be ignored. + * @param initialGuess Initial guess for the first look up. + * @return Bit position of a free bit or noFreeBitFound() in the case of none found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBit( + TAcc const& acc, + uint32_t const numValidBits = MyBitMaskSize, + uint32_t const initialGuess = 0) -> uint32_t + { + return firstFreeBitWithInitialGuess(acc, initialGuess % MyBitMaskSize, numValidBits); + } + + private: + /** + * @brief Implementation of the main search algorithm. See the public firstFreeBit method for general details. + * This version assumes a valid range of the input values. + * + * @param initialGuess Initial guess for the first look up must be in the range [0;MyBitMaskSize). + * @param endIndex Maximal position to consider. Bits further out will be ignored. + * @return Bit position of a free bit or noFreeBitFound() in the case of none found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBitWithInitialGuess( + TAcc const& acc, + uint32_t const initialGuess, + uint32_t const endIndex) -> uint32_t + { + auto result = noFreeBitFound(); + BitMaskStorageType oldMask = 0U; + + // This avoids a modulo that's not a power of two and is faster thereby. + auto const selectedStartBit = initialGuess >= endIndex ? 0U : initialGuess; + for(uint32_t i = selectedStartBit; i < endIndex and result == noFreeBitFound();) + { + oldMask = alpaka::atomicOr(acc, &mask, singleBit(i)); + if((oldMask & singleBit(i)) == 0U) + { + result = i; + } + + // In case of no free bit found, this will return -1. Storing it in a uint32_t will underflow and + // result in 0xffffffff but that's okay because it also ends the loop as intended. + i = alpaka::ffs(acc, static_cast>>(~oldMask)) - 1; + } + + return result; + } + }; + + using BitMask = BitMaskImpl; + + /** + * @class BitFieldFlat + * @brief Represents a (non-owning) bit field consisting of multiple bit masks. + * + * This class interprets a piece of memory as an array of bit masks and provides convenience functionality to act + * on them as a long array of bits. Most importantly, it provides an interface to find a free bit. 
+     * It is a non-owning view of the memory!
+     *
+     * Please note that methods usually (unless stated otherwise) refer to bits counting all bits from the start
+     * of the bit field. So, if BitMaskSize is 32 and index = 34 = 1 * 32 + 2, we're checking the third bit of the
+     * second mask (if masks were a matrix, this would be equivalent to: masks[1][2]).
+     *
+     */
+    template<uint32_t MyBitMaskSize>
+    struct BitFieldFlatImpl
+    {
+        std::span<BitMaskImpl<MyBitMaskSize>> data;
+
+        /**
+         * @brief Check if the index-th bit in the bit field is set (=1).
+         *
+         * @param index Bit position to check.
+         * @return true if bit is set else false.
+         */
+        template<typename TAcc>
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto get(TAcc const& acc, uint32_t index) const -> bool
+        {
+            return data[index / MyBitMaskSize](acc, index % MyBitMaskSize);
+        }
+
+        /**
+         * @brief Get the index-th mask, NOT bit (i.e., counting masks, not bits).
+         *
+         * @param index Position of the mask.
+         * @return Requested mask.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getMask(uint32_t const index) const -> BitMaskImpl<MyBitMaskSize>&
+        {
+            return data[index];
+        }
+
+        /**
+         * @brief Set the index-th bit (to 1).
+         *
+         * @param index Position of the bit.
+         */
+        template<typename TAcc>
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC void set(TAcc const& acc, uint32_t const index) const
+        {
+            data[index / MyBitMaskSize].set(acc, index % MyBitMaskSize);
+        }
+
+        /**
+         * @brief Counterpart to set, unsetting (to 0) the index-th bit.
+         *
+         * @param index Position of the bit.
+         */
+        template<typename TAcc>
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC void unset(TAcc const& acc, uint32_t const index) const
+        {
+            data[index / MyBitMaskSize].unset(acc, index % MyBitMaskSize);
+        }
+
+        /**
+         * @return Begin iterator to the start of the array of masks, iterating over masks NOT bits.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto begin() const
+        {
+            return std::begin(data);
+        }
+
+        /**
+         * @return End iterator past the end of the array of masks, iterating over masks NOT bits.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto end() const
+        {
+            return std::end(data);
+        }
+
+        /**
+         * @brief Count the number of masks.
+         *
+         * @return Number of masks.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numMasks() const
+        {
+            return data.size();
+        }
+
+        /**
+         * @brief Count the number of bits in the array of masks.
+         *
+         * This does not take into account whether bits are valid or not, so it is currently always a multiple of
+         * MyBitMaskSize.
+         *
+         * @return Number of bits.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numBits() const
+        {
+            return numMasks() * MyBitMaskSize;
+        }
+
+        /**
+         * @brief Main algorithm for finding and setting a free bit in the bit field.
+         *
+         * This iterates through the masks wrapping around from the given startIndex. The information of how many
+         * bits are valid is passed through to the lower levels, which automatically discard out-of-range results
+         * (accounting for partially filled masks). As always, we can't separate the concerns of retrieving
+         * information and acting on it in a multi-threaded context, so if a free bit is found it is immediately
+         * set.
+         *
+         * @param numValidBits Number of valid bits in the bit field (NOT masks, i.e. it's equal to numChunks() on
+         * the page). Should typically be a number from the range [MyBitMaskSize * (numMasks()-1) + 1,
+         * MyBitMaskSize * numMasks()) although other numbers shouldn't hurt.
+         * @param startIndex Bit mask (NOT bit) to start the search at.
+         * @return The index of the free bit found (and set) or noFreeBitFound() if none was found.
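+         *
+         * A minimal sketch of the intended call pattern (hypothetical values; `field` and `acc` are assumed to
+         * be a valid BitFieldFlat and accelerator):
+         *
+         * @code{.cpp}
+         * // 100 valid bits spread over ceilingDivision(100, 32) = 4 masks:
+         * auto const bitIndex = field.firstFreeBit(acc, 100U);
+         * if(bitIndex < field.noFreeBitFound())
+         * {
+         *     // bitIndex addresses a bit (and thereby a chunk) that this thread has atomically claimed
+         * }
+         * @endcode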
+ */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBit( + TAcc const& acc, + uint32_t numValidBits, + uint32_t const startIndex = 0U) -> uint32_t + { + return wrappingLoop( + acc, + startIndex % numMasks(), + numMasks(), + noFreeBitFound(), + [this, numValidBits](TAcc const& localAcc, auto const index) + { + auto tmp = this->firstFreeBitAt(localAcc, numValidBits, index); + return tmp; + }); + } + + /** + * @return Special invalid bit index to indicate that no free bit was found. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto noFreeBitFound() const -> uint32_t + { + return numBits(); + } + + private: + /** + * @return Position inside of a mask to start the search at. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto startBitIndex() + { + return laneid(); + } + + /** + * @brief Helper function checking if we're in the last mask. + * + * @param numValidBits Number of valid bits in the bit field. The mask containing this bit is the last mask. + * @param maskIndex Index of the mask under consideration (NOT bit). + * @return true if the mask is the last mask else false. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto isThisLastMask( + uint32_t const numValidBits, + uint32_t const maskIndex) + { + // >= in case index == numValidBits - MyBitMaskSize + return (maskIndex + 1) * MyBitMaskSize >= numValidBits; + } + + /** + * @brief Implementation of the main algorithm asking a mask of a free bit and checking if the answer is valid. + * + * @param numValidBits Number of valid bits in the bit field. + * @param maskIdx Index of the maks under consideration. + * @return Index of the free bit found IN THE BITFIELD (not only in the mask, so this value can be larger than + * MyBitMaskSize) or noFreeBitFound() if none was found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBitAt( + TAcc const& acc, + uint32_t const numValidBits, + uint32_t const maskIdx) -> uint32_t + { + auto numValidBitsInLastMask = (numValidBits ? ((numValidBits - 1U) % MyBitMaskSize + 1U) : 0U); + auto indexInMask = getMask(maskIdx).firstFreeBit( + acc, + isThisLastMask(numValidBits, maskIdx) ? numValidBitsInLastMask : MyBitMaskSize, + startBitIndex()); + if(indexInMask < BitMaskImpl::noFreeBitFound()) + { + uint32_t freeBitIndex = indexInMask + MyBitMaskSize * maskIdx; + if(freeBitIndex < numValidBits) + { + return freeBitIndex; + } + } + return noFreeBitFound(); + } + }; + + using BitFieldFlat = BitFieldFlatImpl; +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp b/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp new file mode 100644 index 0000000000..9f20c7d001 --- /dev/null +++ b/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp @@ -0,0 +1,42 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <cstddef>
+
+namespace mallocMC::CreationPolicies::FlatterScatterAlloc
+{
+    /**
+     * @class DataPage
+     * @brief Raw piece of memory of size T_pageSize
+     */
+    template<size_t T_pageSize>
+    struct DataPage
+    {
+        char data[T_pageSize]{};
+    };
+} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc
diff --git a/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp b/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp
new file mode 100644
index 0000000000..db80fc9589
--- /dev/null
+++ b/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp
@@ -0,0 +1,344 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Julian Johannes Lenz, Rene Widera
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp"
+#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp"
+#include "mallocMC/mallocMC_utils.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+
+namespace mallocMC::CreationPolicies::FlatterScatterAlloc
+{
+    /**
+     * @class PageInterpretation
+     * @brief Represent our interpretation of a raw data page.
+     *
+     * This class takes a reference to a raw data page and a chunk size and provides an interface to this raw
+     * memory to use it as a data page filled with chunks and corresponding bit masks indicating their filling. It
+     * furthermore provides static helper functions that implement formulae not tied to a particular piece of
+     * memory, like the number of chunks given a chunk size (and the implicit page size).
+     *
+     * @param data Raw data page reference.
+     * @param chunkSize Chunk size to interpret this memory with.
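+     *
+     * As a worked example of the numChunks formula below (hypothetical numbers, assuming T_pageSize = 4096,
+     * BitMaskSize = 32 and sizeof(BitMask) = 4): for chunkSize = 32, each group of 32 chunks plus its bit mask
+     * occupies 32 * 32 + 4 = 1028 bytes, so 4096 / 1028 = 3 full groups fit; the remaining
+     * 4096 - 3 * 1028 = 1012 bytes hold one more mask plus (1012 - 4) / 32 = 31 chunks, giving
+     * 3 * 32 + 31 = 127 chunks in total.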
+ */ + template + struct PageInterpretation + { + private: + DataPage& data; + uint32_t const chunkSize; + + public: + ALPAKA_FN_INLINE ALPAKA_FN_ACC PageInterpretation(DataPage& givenData, uint32_t givenChunkSize) + : data(givenData) + , chunkSize(givenChunkSize) + { + } + + /** + * @brief Compute the number of chunks of the given size that would fit onto a page. + * + * This is not quite a trivial calculation because we have to take into account the size of the bit field at + * the end which itself depends on the number of chunks. Due to the quantisation into fixed-size bit masks we + * are in the realm of integer divisions and remainders here. + * + * This is a static version of the algorithm because there's no reference to the data at all. Convenience + * version of that uses the chunk size of an instance is provided below. + * + * @param chunkSize The chunk size to use for the calculation. + * @return Number of chunks that would fit on a page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto numChunks(uint32_t const chunkSize) -> uint32_t + { + constexpr auto b = static_cast>(sizeof(BitMask)); + auto const numFull = T_pageSize / (BitMaskSize * chunkSize + b); + auto const leftOverSpace = T_pageSize - numFull * (BitMaskSize * chunkSize + b); + auto const numInRemainder = leftOverSpace > b ? (leftOverSpace - b) / chunkSize : 0U; + return numFull * BitMaskSize + numInRemainder; + } + + /** + * @brief Convenience method calling numChunks(chunkSize) with the currently set chunkSize. See there for + * details. + * + * @return Number of chunks that fit on this page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numChunks() const -> uint32_t + { + return numChunks(chunkSize); + } + + /** + * @brief Convert a chunk index into a pointer to that piece of memory. + * + * @param index Chunk index < numChunks(). + * @return Pointer to that chunk. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto chunkPointer(uint32_t index) const -> void* + { + return reinterpret_cast(&data.data[index * chunkSize]); + } + + /** + * @brief Lightweight mangling of the hash into a start point for searching in the bit field. + * + * It is important to stress that this returns an index of a bit mask, not an individual bit's index. So, if + * the BitMaskSize is 32 and I have 64 chunks on the page, there are two bit masks and the return value is + * either 0 or 1, i.e. the search would start at the 0th or 32nd bit. + * + * @param hashValue Number providing some entropy for scattering memory accesses. + * @return Index of a bit mask to start searching at. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto startBitMaskIndex(uint32_t const hashValue) const + { + return (hashValue >> 16); + } + + /** + * @brief Main allocation algorithm searching a free bit in the bit mask and returning the corresponding + * pointer to a chunk. + * + * @param hashValue Number providing some entropy for scattering memory accesses. + * @return Pointer to a valid piece of memory or nullptr if none was found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create(TAcc const& acc, uint32_t const hashValue = 0U) -> void* + { + auto field = bitField(); + auto const index = field.firstFreeBit(acc, numChunks(), startBitMaskIndex(hashValue)); + return (index < field.noFreeBitFound()) ? chunkPointer(index) : nullptr; + } + + /** + * @brief Counterpart to create, freeing an allocated pointer's memory. + * + * In production, this does not check the validity of the pointer and providing an invalid pointer is undefined + * behaviour. 
This includes valid pointers to outside the range of this page, obviously. + * + * @param pointer Pointer to a piece of memory created from the create method. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* pointer) -> void + { + if(chunkSize == 0) + { +#if (!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + throw std::runtime_error{ + "Attempted to destroy a pointer with chunkSize==0. Likely this page was recently " + "(and potentially pre-maturely) freed."}; +#endif // NDEBUG + return; + } + auto chunkIndex = chunkNumberOf(pointer); +#if (!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + if(not isValid(acc, chunkIndex)) + { + throw std::runtime_error{"Attempted to destroy an invalid pointer! Either the pointer does not point " + "to a valid chunk or it is not marked as allocated."}; + } +#endif // NDEBUG + + bitField().unset(acc, chunkIndex); + } + + /** + * @brief Convenience method to retrieve the configured minimal chunk size. + * + * @return Minimal possible chunk size of the page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto minimalChunkSize() -> uint32_t + { + return T_minimalChunkSize; + } + + /** + * @brief Clean up the full bit field region. + * + * This method is supposed to be used on raw memory and cleans up the maximal possible bit field region without + * assuming anything about its previous content. It is supposed to be used during initialisation of raw memory + * and after leaving a page in multi-page mode when arbitrary data is potentially found in that region. There + * is a further optimised version of clean-up for cases where this page was in use in chunked mode before. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanupFull() -> void + { + PageInterpretation(data, minimalChunkSize()).resetBitField(); + } + + /** + * @brief Clean up previously unused parts of the bit field region. + * + * This method is supposed to have the same effect as cleanupFull but only on pages that are already in use in + * chunked mode. Due to this additional assumption we can conclude that the part that currently acts as bit + * field is already nulled (because we're the last ones on the page about to clean up, so all bits are unset). + * This significantly reduces the size of the region that needs cleaning if a small chunk size was set + * previously. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanupUnused() -> void + { + auto worstCasePage = PageInterpretation(data, minimalChunkSize()); + memset( + static_cast(worstCasePage.bitFieldStart()), + 0U, + worstCasePage.bitFieldSize() - bitFieldSize()); + } + + /** + * @brief Reset the currently used bit field to 0. + * + * This was introduced to be called on pages interpreted with the minimal chunk size to fully clean up the bit + * field region. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto resetBitField() -> void + { + // This method is not thread-safe by itself. But it is supposed to be called after acquiring a "lock" in + // the form of setting the filling level, so that's fine. + + memset(static_cast(bitFieldStart()), 0U, bitFieldSize()); + } + + /** + * @brief Checks if a pointer points to an allocated chunk of memory on this page. + * + * This is not used in production and is not thread-safe in the sense that the information is stale as soon as + * it's returned. It is used in debug mode and can be used for (single-threaded) tests. + * + * @param pointer The pointer in question. 
+         * @return true if the pointer points to an allocated chunk of memory, false otherwise
+         */
+        template<typename TAcc>
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, void* const pointer) const -> bool
+        {
+            // This function is neither thread-safe nor particularly performant. It is supposed to be used in
+            // tests and debug mode.
+            return isValid(acc, chunkNumberOf(pointer));
+        }
+
+    private:
+        /**
+         * @brief Helper method for isValid(pointer) that acts on the level of the chunk's index, which translates
+         * to the bit field position more easily than the pointer.
+         *
+         * @param chunkIndex Index of a chunk to check.
+         * @return true if the chunk with this index is allocated, false otherwise
+         */
+        template<typename TAcc>
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, int32_t const chunkIndex) const -> bool
+        {
+            return chunkIndex >= 0 and chunkIndex < static_cast<int32_t>(numChunks())
+                and isAllocated(acc, chunkIndex);
+        }
+
+        template<typename TAcc>
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isAllocated(TAcc const& acc, uint32_t const chunkIndex) const -> bool
+        {
+            return bitField().get(acc, chunkIndex);
+        }
+
+    public:
+        /**
+         * @brief Return the bit field of this page.
+         *
+         * @return Bit field of this page.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitField() const -> BitFieldFlat
+        {
+            return BitFieldFlat{{bitFieldStart(), ceilingDivision(numChunks(), BitMaskSize)}};
+        }
+
+        /**
+         * @brief Return a pointer to the first bit mask.
+         *
+         * @return Pointer to the first bit mask.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitFieldStart() const -> BitMask*
+        {
+            return reinterpret_cast<BitMask*>(&data.data[T_pageSize - bitFieldSize()]);
+        }
+
+        /**
+         * @brief Convenience method to compute the bit field size of the current page. Forwards to its static
+         * version. See there for details.
+         *
+         * @return Size of this page's bit field in number of bytes.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitFieldSize() const -> uint32_t
+        {
+            return bitFieldSize(chunkSize);
+        }
+
+        /**
+         * @brief Compute the size of the bit field region in number of bytes for a page with the given chunk
+         * size.
+         *
+         * There is an instance method using the instance's chunk size for convenience.
+         *
+         * @param chunkSize Chunk size of the would-be page.
+         * @return Size of this page's bit field in number of bytes.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto bitFieldSize(uint32_t const chunkSize) -> uint32_t
+        {
+            return sizeof(BitMask) * ceilingDivision(numChunks(chunkSize), BitMaskSize);
+        }
+
+        /**
+         * @brief Compute the maximal possible size of the bit field in number of bytes.
+         *
+         * This is practically the bit field size of an instance with the minimalChunkSize().
+         *
+         * @return Maximal possible size of the bit field in number of bytes.
+         */
+        ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto maxBitFieldSize() -> uint32_t
+        {
+            return PageInterpretation::bitFieldSize(minimalChunkSize());
+        }
+
+        /**
+         * @brief Compute a chunk index given a pointer.
+         *
+         * Please note that this will return invalid indices for invalid input pointers. Be sure to guard against
+         * this if you don't want to risk messing up your memory.
+         *
+         * @param pointer A pointer interpreted to be pointing to a chunk of the current page.
+         * @return A valid index to a chunk on this page if the pointer was valid. A potentially negative number
+         * outside the valid range of chunk indices otherwise.
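+         *
+         * A minimal sketch of its relation to chunkPointer (hypothetical values; `page` is assumed to be a valid
+         * PageInterpretation with chunkSize = 64):
+         *
+         * @code{.cpp}
+         * void* pointer = page.chunkPointer(5U);          // start of the page + 5 * 64 bytes
+         * auto const index = page.chunkNumberOf(pointer); // recovers 5
+         * @endcode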
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto chunkNumberOf(void* pointer) const -> int32_t + { + return indexOf(pointer, &data, chunkSize); + } + + // these are supposed to be temporary objects, don't start messing around with them: + PageInterpretation(PageInterpretation const&) = delete; + PageInterpretation(PageInterpretation&&) = delete; + auto operator=(PageInterpretation const&) -> PageInterpretation& = delete; + auto operator=(PageInterpretation&&) -> PageInterpretation& = delete; + ~PageInterpretation() = default; + }; +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp b/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp new file mode 100644 index 0000000000..d040bc128b --- /dev/null +++ b/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp @@ -0,0 +1,73 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + + +#include + +#include + +/** + * @brief Abstraction of a short-circuiting loop that wraps around from an arbitrary starting point within the range. + * + * This implements a re-occuring pattern in the code: Due to the scattering approach taken, we're often in a position + * where we want to run a simple loop except for the fact that we start in an arbitrary position within the range and + * complete it by wrapping around to the start of the range continuing from there. Furthermore, these loops are all + * searches, so it's advantageous to implement short-circuiting by early exit in case of finding another value than the + * provided failureValue. + * + * @tparam T_size Type of size-like arguments. This function is used in various contexts where this can either be + * size_t or uint32_t. + * @tparam TFunctor Type of the function representing the loop body (typically a lambda function). + * @tparam TArgs Types of additional arguments provided to the function. + * @param startIndex Index to start the loop at. + * @param size Size of the range which equals the number of iterations to be performed in total. + * @param failureValue Return value of the function indicating a failure of the current iteration and triggering the + * next iteration. + * @param func Function of type TFunctor representing the loop body. 
+ * It is supposed to return a value of decltype(failureValue) and indicate failure by returning the latter. Any
+ * other value is interpreted as success, triggering early exit of the loop.
+ * @param args Additional arguments to be provided to the function on each iteration.
+ * @return The return value of func, which might be failureValue in case all iterations failed.
+ */
+template<typename TAcc, typename T_size, typename TFunctor, typename... TArgs>
+ALPAKA_FN_INLINE ALPAKA_FN_ACC auto wrappingLoop(
+    TAcc const& acc,
+    T_size const startIndex,
+    T_size const size,
+    auto failureValue,
+    TFunctor func,
+    TArgs... args)
+{
+    for(uint32_t i = 0; i < size; ++i)
+    {
+        auto result = func(acc, (i + startIndex) % size, args...);
+        if(result != failureValue)
+        {
+            return result;
+        }
+    }
+    return failureValue;
+}
diff --git a/include/mallocMC/creationPolicies/GallatinCuda.hpp b/include/mallocMC/creationPolicies/GallatinCuda.hpp
new file mode 100644
index 0000000000..0fabc9695f
--- /dev/null
+++ b/include/mallocMC/creationPolicies/GallatinCuda.hpp
@@ -0,0 +1,178 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2014-2024 Institute of Radiation Physics,
+                      Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#ifdef mallocMC_HAS_Gallatin_AVAILABLE
+#    include
+#else
+
+// Construct a fake, so we get a nice error message when we try to use it
+// and it's not in the way when we don't.
+namespace gallatin::allocators
+{
+    template<size_t bytes_per_segment, size_t smallest_slice, size_t largest_slice>
+    struct Gallatin
+    {
+        static auto generate_on_device(auto...)
+        {
+            return nullptr;
+        }
+
+        template<typename... T>
+        auto malloc(T... /*unused*/) -> void*
+        {
+            // This always triggers but it depends on the template parameter, so it's only instantiated if we
+            // actually use it.
+            static_assert(sizeof...(T) < 0, "Attempt to use malloc of unavailable gallatin prototype.");
+            return nullptr;
+        }
+
+        template<typename... T>
+        auto free(T... /*unused*/)
+        {
+            // This always triggers but it depends on the template parameter, so it's only instantiated if we
+            // actually use it.
+            static_assert(sizeof...(T) < 0, "Attempt to use free of unavailable gallatin prototype.");
+        }
+    };
+} // namespace gallatin::allocators
+
+#endif
+
+namespace mallocMC
+{
+    namespace CreationPolicies
+    {
+        /**
+         * @brief Prototype integration of Gallatin (https://dl.acm.org/doi/10.1145/3627535.3638499)
+         *
+         * This CreationPolicy integrates the CUDA code for the Gallatin prototype into mallocMC
+         * as a thin wrapper. It is intended for proof-of-principle tests and benchmarks only and
+         * obviously only works on CUDA devices.
+         *
+         * It also only works with the reservePoolPolicies::Noop because it does internally on its
+         * own what CudaSetLimits does.
+         *
+         * If we should ever see the need for it, we'd re-implement it in alpaka for a fully-fledged
+         * and well-maintained version of this.
+         * Experience has been mixed so far: While we could reproduce good performance in some cases,
+         * fragmentation was found to be unusably high (to the point of single-digit utilisation of
+         * available memory) in PIConGPU. That's why there's currently no plan to lift the prototype
+         * status in the near future.
+         */
+        template<
+            typename T_AlignmentPolicy,
+            size_t bytes_per_segment = 16ULL * 1024 * 1024,
+            size_t smallest_slice = 16,
+            size_t largest_slice = 4096>
+        class GallatinCudaImpl
+        {
+            using Gallatin = gallatin::allocators::Gallatin<bytes_per_segment, smallest_slice, largest_slice>;
+
+        public:
+            template<typename /*T_AlignmentPolicy*/>
+            using AlignmentAwarePolicy
+                = GallatinCudaImpl<T_AlignmentPolicy, bytes_per_segment, smallest_slice, largest_slice>;
+            Gallatin* heap{nullptr};
+
+            static constexpr auto providesAvailableSlots = false;
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto create(AlpakaAcc const& /*acc*/, uint32_t bytes) const -> void*
+            {
+                return heap->malloc(static_cast<size_t>(bytes));
+            }
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void destroy(AlpakaAcc const& /*acc*/, void* mem) const
+            {
+                heap->free(mem);
+            }
+
+            ALPAKA_FN_ACC auto isOOM(void* p, size_t s) const -> bool
+            {
+                return s != 0 && (p == nullptr);
+            }
+
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
+            static void initHeap(
+                AlpakaDevice& /*dev*/,
+                AlpakaQueue& queue,
+                T_DeviceAllocator* devAllocator,
+                void*,
+                size_t memsize)
+            {
+                static_assert(
+                    std::is_same_v<alpaka::AccToTag<AlpakaAcc>, alpaka::TagGpuCudaRt>,
+                    "The GallatinCuda creation policy is only available on CUDA architectures. Please choose a "
+                    "different one.");
+
+                // This is an extremely hot fix:
+                // PIConGPU initialises its allocator with 0 bytes to be able to distribute the pointer.
+                // Only afterwards can it find out its actual memory requirements and use destructiveResize to
+                // set the correct heap size. Gallatin runs into issues with this approach.
+                // Instead, we simply don't believe the request if it's 0.
+ if(memsize == 0) + return; + + auto devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0); + using Dim = typename alpaka::trait::DimType::type; + using Idx = typename alpaka::trait::IdxType::type; + using VecType = alpaka::Vec; + + auto tmp = Gallatin::generate_on_device(memsize, 42, true); + auto workDivSingleThread + = alpaka::WorkDivMembers{VecType::ones(), VecType::ones(), VecType::ones()}; + alpaka::exec( + queue, + workDivSingleThread, + [tmp, devAllocator] ALPAKA_FN_ACC(AlpakaAcc const&) { devAllocator->heap = tmp; }); + } + + static auto classname() -> std::string + { + return "GallatinCuda"; + } + }; + + template< + size_t bytes_per_segment = 16ULL * 1024 * 1024, + size_t smallest_slice = 16, + size_t largest_slice = 4096> + struct GallatinCuda + { + template + using AlignmentAwarePolicy + = GallatinCudaImpl; + }; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/include/mallocMC/creationPolicies/OldMalloc.hpp b/include/mallocMC/creationPolicies/OldMalloc.hpp new file mode 100644 index 0000000000..9e54e43f24 --- /dev/null +++ b/include/mallocMC/creationPolicies/OldMalloc.hpp @@ -0,0 +1,92 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2014-2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <cstdint>
+#include <string>
+
+namespace mallocMC
+{
+    namespace CreationPolicies
+    {
+        /**
+         * @brief classic malloc/free behaviour known from CUDA
+         *
+         * This CreationPolicy implements the classic device-side malloc and
+         * free system calls that are offered by CUDA-capable accelerators of
+         * compute capability 2.0 and higher
+         */
+        class OldMalloc
+        {
+            using uint32 = std::uint32_t;
+
+        public:
+            template<typename /*T_AlignmentPolicy*/>
+            using AlignmentAwarePolicy = OldMalloc;
+
+            static constexpr auto providesAvailableSlots = false;
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto create([[maybe_unused]] AlpakaAcc const& acc, uint32 bytes) const -> void*
+            {
+                return ::malloc(static_cast<size_t>(bytes));
+            }
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void destroy(AlpakaAcc const& /*acc*/, void* mem) const
+            {
+                ::free(mem);
+            }
+
+            ALPAKA_FN_ACC auto isOOM(void* p, size_t s) const -> bool
+            {
+                return s != 0 && (p == nullptr);
+            }
+
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
+            static void initHeap(
+                [[maybe_unused]] AlpakaDevice& dev,
+                [[maybe_unused]] AlpakaQueue& queue,
+                [[maybe_unused]] T_DeviceAllocator* heap,
+                [[maybe_unused]] void* pool,
+                [[maybe_unused]] size_t memsize)
+            {
+            }
+
+            static auto classname() -> std::string
+            {
+                return "OldMalloc";
+            }
+        };
+
+    } // namespace CreationPolicies
+} // namespace mallocMC
diff --git a/include/mallocMC/creationPolicies/Scatter.hpp b/include/mallocMC/creationPolicies/Scatter.hpp
new file mode 100644
index 0000000000..d3dfd243dd
--- /dev/null
+++ b/include/mallocMC/creationPolicies/Scatter.hpp
@@ -0,0 +1,1422 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  http://www.icg.tugraz.at/project/mvp
+
+  Copyright (C) 2012 Institute for Computer Graphics and Vision,
+                     Graz University of Technology
+  Copyright (C) 2014-2024 Institute of Radiation Physics,
+                          Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
+             Rene Widera - r.widera ( at ) hzdr.de
+             Axel Huebl - a.huebl ( at ) hzdr.de
+             Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "../mallocMC_utils.hpp"
+
+#include
+#include
+#include
+
+#include
+#include
+#include <cstdint> /* uint32_t */
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace mallocMC
+{
+    namespace CreationPolicies
+    {
+        namespace ScatterConf
+        {
+            struct DefaultScatterConfig
+            {
+                //! Size in bytes of a page.
+                static constexpr auto pagesize = 4096;
+                /** Size in bytes of an access block.
+ * + * Scatter alloc will keep allocations within an access block to reduce the translation lookaside + * buffer (tlb) pressure. accessblocksize can be used to optimize for the tlb of a device. + */ + static constexpr auto accessblocksize = 2u * 1024u * 1024u * 1024u; + //! Number of pages per region. + static constexpr auto regionsize = 16; + //! Factor used to calculate maximal allowed wast depending on the byte. + static constexpr auto wastefactor = 2; + /** Defines if a fully freed pages chunk size should be reset. + * + * true = Chunk size of a page will be reset if free. + * false = A page will keep the chunk size selected during the first page usage over + * the full application runtime. + */ + static constexpr auto resetfreedpages = false; + }; + + struct DefaultScatterHashingParams + { + static constexpr auto hashingK = 38183; + static constexpr auto hashingDistMP = 17497; + static constexpr auto hashingDistWP = 1; + static constexpr auto hashingDistWPRel = 1; + }; + } // namespace ScatterConf + + /** + * @brief fast memory allocation based on ScatterAlloc + * + * This CreationPolicy implements a fast memory allocator that trades + * speed for fragmentation of memory. This is based on the memory + * allocator "ScatterAlloc" + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604), + * and is extended to report free memory slots of a given size (both on + * host and accelerator). To work properly, this policy class requires a + * pre-allocated heap on the accelerator and works only with Nvidia CUDA + * capable accelerators that have at least compute capability 2.0. + * + * @tparam T_Config (optional) configure the heap layout. The + * default can be obtained through Scatter<>::HeapProperties + * @tparam T_Hashing (optional) configure the parameters for + * the hashing formula. The default can be obtained through + * Scatter<>::HashingProperties + */ + template< + class T_Config = ScatterConf::DefaultScatterConfig, + class T_Hashing = ScatterConf::DefaultScatterHashingParams, + class T_AlignmentPolicy = void> + class ScatterImpl + { + public: + // TODO(lenz): This is a bit of a round trip due to a change of interface. A larger refactoring should + // remove this again. + template + using AlignmentAwarePolicy = ScatterImpl; + + using HeapProperties = T_Config; + using HashingProperties = T_Hashing; + + struct Properties + : HeapProperties + , HashingProperties + { + }; + + static constexpr auto providesAvailableSlots = true; + + private: + using uint32 = std::uint32_t; + +/** Allow for a hierarchical validation of parameters: + * + * shipped default-parameters (in the inherited struct) have lowest precedence. + * They will be overridden by a given configuration struct. However, even the + * given configuration struct can be overridden by compile-time command line + * parameters (e.g. 
-D MALLOCMC_CP_SCATTER_PAGESIZE 1024) + * + * default-struct < template-struct < command-line parameter + */ +#ifndef MALLOCMC_CP_SCATTER_PAGESIZE +# define MALLOCMC_CP_SCATTER_PAGESIZE (HeapProperties::pagesize) +#endif + static constexpr uint32 pagesize = MALLOCMC_CP_SCATTER_PAGESIZE; + +#ifndef MALLOCMC_CP_SCATTER_ACCESSBLOCKSIZE +# define MALLOCMC_CP_SCATTER_ACCESSBLOCKSIZE (HeapProperties::accessblocksize) +#endif + static constexpr size_t accessblocksize = MALLOCMC_CP_SCATTER_ACCESSBLOCKSIZE; + +#ifndef MALLOCMC_CP_SCATTER_REGIONSIZE +# define MALLOCMC_CP_SCATTER_REGIONSIZE (HeapProperties::regionsize) +#endif + static constexpr uint32 regionsize = MALLOCMC_CP_SCATTER_REGIONSIZE; + +#ifndef MALLOCMC_CP_SCATTER_WASTEFACTOR +# define MALLOCMC_CP_SCATTER_WASTEFACTOR (HeapProperties::wastefactor) +#endif + static constexpr uint32 wastefactor = MALLOCMC_CP_SCATTER_WASTEFACTOR; + +#ifndef MALLOCMC_CP_SCATTER_RESETFREEDPAGES +# define MALLOCMC_CP_SCATTER_RESETFREEDPAGES (HeapProperties::resetfreedpages) +#endif + static constexpr bool resetfreedpages = MALLOCMC_CP_SCATTER_RESETFREEDPAGES; + + public: + static constexpr uint32 _pagesize = pagesize; + static constexpr size_t _accessblocksize = accessblocksize; + static constexpr uint32 _regionsize = regionsize; + static constexpr uint32 _wastefactor = wastefactor; + static constexpr bool _resetfreedpages = resetfreedpages; + + private: +#if _DEBUG || ANALYSEHEAP + + public: +#endif + /* HierarchyThreshold defines the largest chunk size which can be stored in a segment with hierarchy. + * 32 chunks can be stored without an on page bitmask, therefore a hierarchy is only useful if we store at + * least 33 chunks. For 33 chunks we need two bitmasks, each 32bit. + */ + static constexpr uint32 HierarchyThreshold = (pagesize - 2u * sizeof(uint32)) / 33u; + /* Calculate minimal chunk size which can fill a page, this avoids that small allocations + * fragment the heap and increases the possibility that a small allocation can reuse an + * existing chunk. + * Each page can have 32x32 chunks. To maintain 32 chunks we need 32 bitmask on the page (each 32bit) + * + * @note: There is no requirement that minChunksSize is a power of two. + */ + static constexpr uint32 minChunkSize = (pagesize - 32u * sizeof(uint32)) / (32u * 32u); + static constexpr uint32 minSegmentSize = 32u * minChunkSize + sizeof(uint32); + // Number of possible on page masks without taking the limit of 32 masks into account. + static constexpr uint32 onPageMasks + = minChunkSize > HierarchyThreshold ? 0u : (pagesize + (minSegmentSize - 1u)) / minSegmentSize; + // The scatter malloc hierarchy design allows only 32 on page bit masks. 
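+            // As a worked example with the default configuration (pagesize = 4096): minChunkSize
+            // = (4096 - 128) / 1024 = 3, minSegmentSize = 32 * 3 + 4 = 100 and HierarchyThreshold
+            // = (4096 - 8) / 33 = 123, so onPageMasks = (4096 + 99) / 100 = 41 and the limit below
+            // caps maxOnPageMasks at 32.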
+ static constexpr uint32 maxOnPageMasks = std::min(32u, onPageMasks); + +#ifndef MALLOCMC_CP_SCATTER_HASHINGK +# define MALLOCMC_CP_SCATTER_HASHINGK (HashingProperties::hashingK) +#endif + static constexpr uint32 hashingK = MALLOCMC_CP_SCATTER_HASHINGK; + +#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTMP +# define MALLOCMC_CP_SCATTER_HASHINGDISTMP (HashingProperties::hashingDistMP) +#endif + static constexpr uint32 hashingDistMP = MALLOCMC_CP_SCATTER_HASHINGDISTMP; + +#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWP +# define MALLOCMC_CP_SCATTER_HASHINGDISTWP (HashingProperties::hashingDistWP) +#endif + static constexpr uint32 hashingDistWP = MALLOCMC_CP_SCATTER_HASHINGDISTWP; + +#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWPREL +# define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL (HashingProperties::hashingDistWPRel) +#endif + static constexpr uint32 hashingDistWPRel = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL; + + /** Page Table Entry struct + * + * The PTE holds basic information about each page + */ + struct PTE + { + uint32 chunksize; + /** Counter for how many page table entries are used. + * + * This counter is used internally as lock, to guard a full PTE the value must be set to pagesize via + * atomic CAS. + */ + uint32 count; + uint32 bitmask; + + ALPAKA_FN_ACC void init() + { + chunksize = 0; + count = 0; + bitmask = 0; + } + }; + + /** + * Page struct + * The page struct is used to access the data on the page more + * efficiently and to clear the area on the page, which might hold + * bitsfields later one + */ + struct Page + { + char data[pagesize]; + + /** + * The pages init method + * This method initializes the region on the page which might + * hold bit fields when the page is used for a small chunk size + * @param previous_chunksize the chunksize which was uses for + * the page before + */ + ALPAKA_FN_ACC void init() + { + /* Clear the entire data which can hold bitfields. + * volatile avoids that the data is changed within L1 Cache and therefore is hidden for other + * threads. + */ + uint32 volatile* write = (uint32*) (data + pagesize - (int) (sizeof(uint32) * maxOnPageMasks)); + while(write < (uint32*) (data + pagesize)) + *write++ = 0; + } + }; + + // the data used by the allocator + + volatile PTE* _ptes; + uint32 volatile* _regions; + Page* _page; + size_t _memsize; + uint32 _numpages; + uint32 _accessblocks; + uint32 _pagebasedMutex; + uint32 volatile _firstFreePageBased; + uint32 volatile _firstfreeblock; + + /** + * randInit should create an random offset which can be used + * as the initial position in a bitfield + */ + static ALPAKA_FN_ACC inline auto randInit() -> uint32 + { + // start with the laneid offset + return laneid(); + } + + /** + * randInextspot delivers the next free spot in a bitfield + * it searches for the next unset bit to the left of spot and + * returns its offset. if there are no unset bits to the left + * then it wraps around + * @param bitfield the bitfield to be searched for + * @param spot the spot from which to search to the left, range [0,spots) + * @param spots number of bits that can be used + * @return next free spot in the bitfield + */ + static ALPAKA_FN_ACC inline auto nextspot(auto const& acc, uint32 bitfield, uint32 spot, uint32 spots) + -> uint32 + { + uint32 const low_part = (spot + 1) == sizeof(uint32) * CHAR_BIT ? 0u : (bitfield >> (spot + 1)); + uint32 const high_part = (bitfield << (spots - (spot + 1))); + uint32 const selection_mask = spots == sizeof(uint32) * CHAR_BIT ? 
+            // wrap around the bitfields from the current spot to the left
+            bitfield = (high_part | low_part) & selection_mask;
+            // compute the step from the current spot in the bitfield
+            uint32 const step = alpaka::ffs(acc, static_cast<std::make_signed_t<uint32>>(~bitfield));
+            // and return the new spot
+            return (spot + step) % spots;
+        }
+
+        /**
+         * onPageMasksPosition returns a pointer to the beginning of the
+         * on-page masks inside a page.
+         * @param page the page that holds the masks
+         * @param nMasks the number of hierarchical page table bitfields that
+         * are used inside this page
+         * @return pointer to the first address inside the page that holds
+         * metadata bitfields
+         */
+        ALPAKA_FN_ACC inline auto onPageMasksPosition(uint32 page, uint32 nMasks) -> uint32*
+        {
+            return (uint32*) (_page[page].data + pagesize - (int) sizeof(uint32) * nMasks);
+        }
+
+        /**
+         * usespot finds one free spot in the bitfield, marks it and
+         * returns its offset
+         * @param bitfield pointer to the bitfield to use
+         * @param spots overall number of spots the bitfield is responsible
+         * for
+         * @return if there is a free spot it returns the spot's offset,
+         * otherwise -1
+         */
+        template<typename AlpakaAcc>
+        static ALPAKA_FN_ACC inline auto usespot(AlpakaAcc const& acc, uint32* bitfield, uint32 spots) -> int
+        {
+            // get first spot
+            uint32 spot = randInit() % spots;
+            for(;;)
+            {
+                uint32 const mask = 1u << spot;
+                uint32 const old = alpaka::atomicOp<alpaka::AtomicOr>(acc, bitfield, mask);
+                if((old & mask) == 0)
+                    return spot;
+                // note: popc(old) == spots should be sufficient,
+                // but if someone corrupts the memory we end up in an
+                // endless loop in here...
+                if(alpaka::popcount(acc, old) >= static_cast<int>(spots))
+                    return -1;
+                spot = nextspot(acc, old, spot, spots);
+            }
+        }
+
+        /**
+         * calcAdditionalChunks determines the number of chunks that are
+         * contained in the last segment of a hierarchical page
+         *
+         * The additional checks are necessary to ensure correct results for
+         * very large pages and small chunksizes
+         *
+         * @param fullsegments the number of segments that can be completely
+         * filled in a page. This may NEVER be bigger than 32!
+         * @param segmentsize the number of bytes that are contained in a
+         * completely filled segment (32 chunks)
+         * @param chunksize the chosen allocation size within the page
+         * @return the number of additional chunks that will not fit in one
+         * of the fullsegments.
For any correct input, this number is + * smaller than 32 + */ + template + static ALPAKA_FN_ACC inline auto calcAdditionalChunks( + AlpakaAcc const& acc, + uint32 fullsegments, + uint32 segmentsize, + uint32 chunksize) -> uint32 + { + if(fullsegments != 32) + return alpaka::math::min( + acc, + 31U, + alpaka::math::max( + acc, + 0U, + (int) pagesize - (int) fullsegments * segmentsize - (int) sizeof(uint32)) + / chunksize); + else + return 0; + } + + /** + * addChunkHierarchy finds a free chunk on a page which uses bit + * fields on the page + * @param chunksize the chunksize of the page + * @param fullsegments the number of full segments on the page (a 32 + * bits on the page) + * @param additional_chunks the number of additional chunks in last + * segment (less than 32 bits on the page) + * @param page the page to use + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + template + ALPAKA_FN_ACC inline auto addChunkHierarchy( + AlpakaAcc const& acc, + uint32 chunksize, + uint32 fullsegments, + uint32 additional_chunks, + uint32 page) -> void* + { + uint32 const segments = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32 spot = randInit() % segments; + uint32 const mask = _ptes[page].bitmask; + if((mask & (1u << spot)) != 0) + spot = nextspot(acc, mask, spot, segments); + uint32 const tries = segments - alpaka::popcount(acc, mask); + uint32* onpagemasks = onPageMasksPosition(page, segments); + for(uint32 i = 0; i < tries; ++i) + { + int const hspot = usespot(acc, &onpagemasks[spot], spot < fullsegments ? 32 : additional_chunks); + if(hspot != -1) + return _page[page].data + (32 * spot + hspot) * chunksize; + alpaka::atomicOp(acc, (uint32*) &_ptes[page].bitmask, 1u << spot); + spot = nextspot(acc, mask, spot, segments); + } + return 0; + } + + /** + * addChunkNoHierarchy finds a free chunk on a page which uses the + * bit fields of the pte only + * @param chunksize the chunksize of the page + * @param page the page to use + * @param spots the number of chunks which fit on the page + * @return pointer to a free chunk on the page, 0 if we were unable + * to obtain a free chunk + */ + template + ALPAKA_FN_ACC inline auto addChunkNoHierarchy( + AlpakaAcc const& acc, + uint32 chunksize, + uint32 page, + uint32 spots) -> void* + { + int const spot = usespot(acc, (uint32*) &_ptes[page].bitmask, spots); + if(spot == -1) + return 0; // that should be impossible :) + return _page[page].data + spot * chunksize; + } + + /** + * tryUsePage tries to use the page for the allocation request + * @param page the page to use + * @param chunksize the chunksize of the page + * @param isChunkSizeInRange functor to validate if a given chunk size can be used even if the size is + * different to the parameter chunksize. 
Required interface: `bool operator()(uint32_t)`, returning true if
+         * the given chunk size is usable, else false
+         * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk
+         */
+        template<typename AlpakaAcc, typename T_ChunkSizeRangeCheck>
+        ALPAKA_FN_ACC inline auto tryUsePage(
+            AlpakaAcc const& acc,
+            uint32 page,
+            uint32 chunksize,
+            T_ChunkSizeRangeCheck&& isChunkSizeInRange) -> void*
+        {
+            void* chunk_ptr = nullptr;
+
+            // increase the fill level
+            uint32 const filllevel = alpaka::atomicOp<alpaka::AtomicAdd>(acc, (uint32*) &(_ptes[page].count), 1u);
+
+            // if resetfreedpages == false we do not need to re-check chunksize
+            bool tryAllocMem = !resetfreedpages;
+
+            if(filllevel < pagesize)
+            {
+                if constexpr(resetfreedpages)
+                {
+                    /* Re-check the chunk size (it could be that the page got freed in the meantime...).
+                     * Use an atomic to guarantee that no other thread deleted the page and reinitialized
+                     * it with another chunk size.
+                     *
+                     * In case the page is now free (chunksize == 0) we acquire the new chunk size.
+                     * In case the page already has a chunksize we test if that chunksize fits our needs.
+                     */
+                    uint32 const oldChunksize = alpaka::atomicOp<alpaka::AtomicCas>(
+                        acc,
+                        (uint32*) &_ptes[page].chunksize,
+                        0u,
+                        chunksize);
+                    if(oldChunksize == 0u || isChunkSizeInRange(oldChunksize))
+                        tryAllocMem = true;
+                    // update the chunk size used for the allocation if the PTE was not empty before.
+                    if(oldChunksize != 0)
+                        chunksize = oldChunksize;
+                }
+            }
+            else
+            {
+                // note: if filllevel >= pagesize then the page is currently being freed by another thread
+                tryAllocMem = false;
+            }
+
+            if(tryAllocMem)
+            {
+                if(chunksize <= HierarchyThreshold)
+                {
+                    // more chunks than can be covered by the PTE's single
+                    // bitfield can be used
+                    uint32 const segmentsize = chunksize * 32 + sizeof(uint32);
+                    uint32 const fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize);
+                    uint32 const additional_chunks
+                        = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize);
+                    if(filllevel < fullsegments * 32 + additional_chunks)
+                        chunk_ptr = addChunkHierarchy(acc, chunksize, fullsegments, additional_chunks, page);
+                }
+                else
+                {
+                    uint32 const chunksinpage = alpaka::math::min(acc, pagesize / chunksize, 32u);
+                    if(filllevel < chunksinpage)
+                        chunk_ptr = addChunkNoHierarchy(acc, chunksize, page, chunksinpage);
+                }
+            }
+
+            // this page is full or not usable
+            if(chunk_ptr == nullptr)
+            {
+                uint32_t oldFillLevel
+                    = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &(_ptes[page].count), 1u);
+                if(oldFillLevel == 1u)
+                {
+                    // we dropped the fill level back to zero, so try to clean up the page
+                    tryCleanPage(acc, page);
+                }
+            }
+
+            return chunk_ptr;
+        }
+
+        /**
+         * allocChunked tries to allocate the demanded number of bytes on
+         * one of the pages
+         * @param bytes the number of bytes to allocate, must be <= pagesize
+         * @return pointer to a free chunk on a page, 0 if we were unable to
+         * obtain a free chunk
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto allocChunked(AlpakaAcc const& acc, uint32 bytes) -> void*
+        {
+            // use the minimal allocation size to increase the hit rate for small allocations.
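+            // (illustrative, not from the original source: with minChunkSize = 3 on a 4096-byte page and an
+            // assumed alignment of 16 bytes, every request of 1..16 bytes is padded to the same chunk size,
+            // so small allocations of different sizes can share pages instead of formatting new ones)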
+            uint32 const paddedMinChunkSize = T_AlignmentPolicy::applyPadding(minChunkSize);
+            uint32 const minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize);
+            uint32 const numpages = _numpages;
+            uint32 const pagesperblock = numpages / _accessblocks;
+            uint32 const reloff = warpSize * minAllocation / pagesize;
+            uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid(acc)
+                                                + (hashingDistWP + hashingDistWPRel * reloff) * warpid(acc))
+                % pagesperblock;
+            uint32 const maxchunksize = alpaka::math::min(
+                acc,
+                +pagesize,
+                /* this clamping means that allocations of paddedMinChunkSize could have a waste exceeding the
+                 * wastefactor
+                 */
+                alpaka::math::max(acc, wastefactor * bytes, paddedMinChunkSize));
+
+            /* global page index
+             * - different for each thread to reduce memory read/write conflicts
+             * - index calculated by the hash function
+             */
+            uint32 const global_start_page = start_page_in_block + _firstfreeblock * pagesperblock;
+
+            uint32 checklevel = regionsize * 3 / 4;
+            /* Finding a free segment uses a two-step approach.
+             * In both steps each thread will start on a different region and page based on the hash function
+             * result; this scatters the memory accesses and reduces access conflicts. Both steps will in the
+             * worst case iterate over all heap access blocks and pages.
+             * - step I: search for a region which is filled less than 3/4
+             *   - if a free segment is found, return
+             * - step II: go to any region independent of the fill level
+             *   - if a free segment is found, return
+             */
+            for(uint32 finder = 0; finder < 2; ++finder)
+            {
+                uint32 global_page = global_start_page;
+                /* Loop over all pages until we find a free one or arrive at global_start_page again.
+                 * This and the following loop are done as do-while to potentially save registers by avoiding
+                 * an extra loop counter variable.
+                 */
+                do
+                {
+                    uint32 const region = global_page / regionsize;
+                    uint32 const regionfilllevel = _regions[region];
+                    uint32 const region_offset = region * regionsize;
+                    if(regionfilllevel < checklevel)
+                    {
+                        uint32 page_in_region = global_page;
+                        // loop over pages within a region
+                        do
+                        {
+                            // Set the chunk size to our needs. If the old chunk size is not zero we check if
+                            // we can still use the chunk even if memory is wasted.
+                            uint32 beforeChunkSize = alpaka::atomicOp<alpaka::AtomicCas>(
+                                acc,
+                                (uint32*) &_ptes[page_in_region].chunksize,
+                                0u,
+                                minAllocation);
+                            // Check if the chunk size can be used even if the size is not an exact match.
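+                            // (illustrative, not from the original source: with wastefactor = 2, a request of
+                            // bytes = 48 can reuse a page already formatted with chunksize 64, because
+                            // 48 <= 64 <= 2 * 48 = 96; a page with chunksize 32 is too small and one with
+                            // chunksize 128 would waste too much)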
+                            auto const isChunkSizeInRange = [&](uint32_t currentChunkSize)
+                            { return currentChunkSize >= bytes && currentChunkSize <= maxchunksize; };
+                            uint32_t useChunkSize = 0u;
+                            if(beforeChunkSize == 0u)
+                            {
+                                useChunkSize = minAllocation;
+                            }
+                            else if(isChunkSizeInRange(beforeChunkSize))
+                            {
+                                // someone else acquired the page, but we can also use it
+                                useChunkSize = beforeChunkSize;
+                            }
+                            if(useChunkSize != 0u)
+                            {
+                                void* res = tryUsePage(acc, page_in_region, useChunkSize, isChunkSizeInRange);
+                                if(res != nullptr)
+                                    return res;
+                            }
+                            page_in_region = region_offset + ((page_in_region + 1) % regionsize);
+                        } while(page_in_region != global_page);
+
+                        // could not alloc in this region, note that
+                        if(regionfilllevel + 1 <= regionsize)
+                            alpaka::atomicOp<alpaka::AtomicCas>(
+                                acc,
+                                (uint32*) (_regions + region),
+                                regionfilllevel,
+                                regionfilllevel + 1);
+                    }
+                    // go to the next region
+                    global_page = (global_page + regionsize) % numpages;
+                    // check if we jumped into the next access block
+                    if(global_page % pagesperblock == 0u)
+                    {
+                        uint32 const access_block_id = global_page / pagesperblock;
+                        // randomize the thread writing the info
+                        // Data races are not critical.
+                        if(access_block_id > _firstfreeblock)
+                            _firstfreeblock = access_block_id;
+                    }
+
+                } while(global_page != global_start_page);
+
+                // we are really full :/ so let's search every page for a segment!
+                checklevel = regionsize + 1;
+            }
+            return nullptr;
+        }
+
+        /** tries to clean up the page
+         *
+         * The last thread reducing the page count to zero should call this method.
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC void tryCleanPage(AlpakaAcc const& acc, uint32 page)
+        {
+            if constexpr(resetfreedpages)
+            {
+                /* Workaround for nvcc: the static constexpr variable defined in the class cannot be passed
+                 * into functions taking a constant reference.
+                 */
+                constexpr auto pageSize = pagesize;
+                /* Try to lock the PTE to clean up the meta data.
+                 * Only the last allocation within the PTE will successfully lock the PTE.
+                 * In case it is the last allocation on the page, the new value pagesize signals a full page
+                 * and nobody else is allowed to touch the meta data anymore.
+                 */
+                auto oldfilllevel
+                    = alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].count, 0u, pageSize);
+
+                if(oldfilllevel == 0)
+                {
+                    uint32 const chunksize
+                        = alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u);
+                    // if chunksize == 0 then another thread cleaned the page already
+                    if(chunksize != 0)
+                    {
+                        // clean the meta data bits on the PTE
+                        _page[page].init();
+
+                        /* Take care that the meta data changes where we did not use atomics are propagated
+                         * to all other threads.
+                         */
+                        alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
+                        /* Remove the chunk information.
+                         * It is important that this call happens after the page init because scatter malloc
+                         * updates the chunksize without announcing the action by increasing the page count
+                         * beforehand.
+                         */
+                        auto oldChunkSize = alpaka::atomicOp<alpaka::AtomicCas>(
+                            acc,
+                            (uint32*) &_ptes[page].chunksize,
+                            chunksize,
+                            0u);
+
+                        // CAUTION: This printf never fires but it is of utmost importance! Its existence has
+                        // a similar effect as the mem_fence in the FlatterScatter AccessBlock at this
+                        // position. Using the result of the atomic above implies that it has actually been
+                        // executed and observed by other threads. The otherwise unconditional release of the
+                        // filling-level lock cannot be observed before resetting the chunk size only due to
+                        // this `if` block.
+                        if(oldChunkSize != chunksize)
+                        {
+                            // The chunksize can only be changed if it was zero in between.
Therefore this code
+                            // should never be reached or we started this method with an outdated chunksize.
+                            printf(
+                                "%u != %u, %u unexpected behaviour during deallocation\n",
+                                oldChunkSize,
+                                chunksize,
+                                page);
+                        }
+                    }
+                    /* Unlock the PTE by reducing the counter.
+                     * In case another allocation is at the same moment trying to allocate memory in
+                     * tryUsePage(), the counter can be larger than zero after this dealloc reduces the
+                     * counter. This is no problem because if the chunk size in tryUsePage() does not fit,
+                     * the counter is reduced and the page is marked as free.
+                     */
+                    alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, pageSize);
+                }
+            }
+        }
+
+        /**
+         * deallocChunked frees the chunk on the page and updates all data
+         * accordingly
+         * @param mem pointer to the chunk
+         * @param page the page the chunk is on
+         * @param chunksize the chunksize used for the page
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC void deallocChunked(AlpakaAcc const& acc, void* mem, uint32 page, uint32 chunksize)
+        {
+            auto const inpage_offset = static_cast<uint32>((char*) mem - _page[page].data);
+            if(chunksize <= HierarchyThreshold)
+            {
+                // one more level in the hierarchy
+                uint32 const segmentsize = chunksize * 32 + sizeof(uint32);
+                uint32 const fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize);
+                uint32 const additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize);
+                uint32 const segment = inpage_offset / (chunksize * 32);
+                uint32 const withinsegment = (inpage_offset - segment * (chunksize * 32)) / chunksize;
+                // mark it as free
+                uint32 const nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0);
+                uint32* onpagemasks = onPageMasksPosition(page, nMasks);
+                /* currently unchecked:
+                 * uint32 old = */
+                alpaka::atomicOp<alpaka::AtomicAnd>(acc, &onpagemasks[segment], ~(1u << withinsegment));
+
+                // always do this, since it might fail due to a
+                // race condition with addChunkHierarchy
+                alpaka::atomicOp<alpaka::AtomicAnd>(acc, (uint32*) &_ptes[page].bitmask, ~(1u << segment));
+            }
+            else
+            {
+                uint32 const segment = inpage_offset / chunksize;
+                alpaka::atomicOp<alpaka::AtomicAnd>(acc, (uint32*) &_ptes[page].bitmask, ~(1u << segment));
+            }
+
+            uint32 oldfilllevel = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, 1u);
+
+            if(oldfilllevel == 1u)
+                tryCleanPage(acc, page);
+
+            // meta information counters ... should not be changed by too
+            // many threads, so..
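+            // (illustrative, not from the original source: with chunksize 128 on a 4096-byte page, the page
+            // holds 32 chunks; once the fill level drops to half of that, i.e. oldfilllevel == 16 here, the
+            // region fill counter is reset so that allocating threads will consider this region again)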
+            if(oldfilllevel == pagesize / 2 / chunksize)
+            {
+                uint32 const region = page / regionsize;
+                alpaka::atomicOp<alpaka::AtomicExch>(acc, (uint32*) (_regions + region), 0u);
+                uint32 const pagesperblock = _numpages / _accessblocks;
+                uint32 const block = page / pagesperblock;
+                if(warpid(acc) + laneid() == 0)
+                    alpaka::atomicOp<alpaka::AtomicMin>(acc, (uint32*) &_firstfreeblock, block);
+            }
+        }
+
+        /**
+         * markpages marks a fixed number of pages as used
+         * @param startpage first page to mark
+         * @param pages number of pages to mark
+         * @param bytes number of overall bytes to mark the pages for
+         * @return true on success, false if one of the pages is not free
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto markpages(AlpakaAcc const& acc, uint32 startpage, uint32 pages, uint32 bytes) -> bool
+        {
+            uint32 abortpage = std::numeric_limits<uint32>::max();
+            for(uint32 trypage = startpage; trypage < startpage + pages; ++trypage)
+            {
+                uint32 const old
+                    = alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[trypage].chunksize, 0u, bytes);
+                if(old != 0)
+                {
+                    abortpage = trypage;
+                    break;
+                }
+            }
+            if(abortpage == std::numeric_limits<uint32>::max())
+                return true;
+            for(uint32 trypage = startpage; trypage < abortpage; ++trypage)
+                alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[trypage].chunksize, bytes, 0u);
+            return false;
+        }
+
+        /**
+         * allocPageBasedSingleRegion tries to allocate the demanded number
+         * of bytes on a contiguous sequence of pages
+         * @param startpage first page to be used
+         * @param endpage last page to be used
+         * @param bytes number of overall bytes to mark the pages for
+         * @return pointer to the first page to use, 0 if we were unable to
+         * use all the requested pages
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto allocPageBasedSingleRegion(
+            AlpakaAcc const& acc,
+            uint32 startpage,
+            uint32 endpage,
+            uint32 bytes) -> void*
+        {
+            uint32 const pagestoalloc = ceilingDivision(bytes, pagesize);
+            uint32 freecount = 0;
+            bool left_free = false;
+            for(uint32 search_page = startpage + 1; search_page > endpage;)
+            {
+                --search_page;
+                if(_ptes[search_page].chunksize == 0)
+                {
+                    if(++freecount == pagestoalloc)
+                    {
+                        // try filling it up
+                        if(markpages(acc, search_page, pagestoalloc, bytes))
+                        {
+                            // mark that we filled up everything up to here
+                            if(!left_free)
+                                alpaka::atomicOp<alpaka::AtomicCas>(
+                                    acc,
+                                    (uint32*) &_firstFreePageBased,
+                                    startpage,
+                                    search_page - 1);
+                            return _page[search_page].data;
+                        }
+                    }
+                }
+                else
+                {
+                    left_free = true;
+                    freecount = 0;
+                }
+            }
+            return 0;
+        }
+
+        /**
+         * allocPageBasedSingle tries to allocate the demanded number of
+         * bytes on a contiguous sequence of pages
+         * @param bytes number of overall bytes to mark the pages for
+         * @return pointer to the first page to use, 0 if we were unable to
+         * use all the requested pages
+         * @pre only a single thread of a warp is allowed to call the
+         * function concurrently
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto allocPageBasedSingle(AlpakaAcc const& acc, uint32 bytes) -> void*
+        {
+            // acquire the mutex
+            while(alpaka::atomicOp<alpaka::AtomicExch>(acc, &_pagebasedMutex, 1u) != 0)
+                ;
+            // search for a free spot from the back
+            uint32 const spage = _firstFreePageBased;
+            void* res = allocPageBasedSingleRegion(acc, spage, 0, bytes);
+            if(res == 0)
+                // also check the rest of the pages
+                res = allocPageBasedSingleRegion(acc, _numpages, spage, bytes);
+
+            // release the mutex
+            alpaka::atomicOp<alpaka::AtomicExch>(acc, &_pagebasedMutex, 0u);
+            return res;
+        }
+
+        /**
+         * allocPageBased tries to allocate the demanded number of bytes on
+         * a contiguous sequence of pages
+         * @param bytes number of overall bytes to mark the pages for
+         * @return pointer to the first page to use, 0 if we were unable to
+         * use all the requested pages
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto allocPageBased(AlpakaAcc const& acc, uint32 bytes) -> void*
+        {
+            // this is rather slow, but we don't expect that to happen often
+            // anyway
+
+            // only one thread per warp can acquire the mutex
+            void* res = 0;
+            // based on the alpaka backend the lanemask type can be 64 bit
+            auto const mask = alpaka::warp::activemask(acc);
+            uint32_t const num = alpaka::popcount(acc, mask);
+            // based on the alpaka backend the lanemask type can be 64 bit
+            auto const lanemask = lanemask_lt(acc);
+            uint32_t const local_id = alpaka::popcount(acc, lanemask & mask);
+            for(unsigned int active = 0; active < num; ++active)
+                if(active == local_id)
+                    res = allocPageBasedSingle(acc, bytes);
+            return res;
+        }
+
+        /**
+         * deallocPageBased frees the memory placed on a sequence of pages
+         * @param page the first page
+         * @param bytes the number of bytes to be freed
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC void deallocPageBased(AlpakaAcc const& acc, uint32 page, uint32 bytes)
+        {
+            uint32 const pages = ceilingDivision(bytes, pagesize);
+            for(uint32 p = page; p < page + pages; ++p)
+                _page[p].init();
+
+            alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
+
+            for(uint32 p = page; p < page + pages; ++p)
+                alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[p].chunksize, bytes, 0u);
+            alpaka::atomicOp<alpaka::AtomicMax>(acc, (uint32*) &_firstFreePageBased, page + pages - 1);
+        }
+
+    public:
+        /**
+         * create allocates the requested number of bytes via the heap.
+         * Coalescing has to be done beforehand by another policy.
+         * @param bytes number of bytes to allocate
+         * @return pointer to the allocated memory
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) -> void*
+        {
+            if(bytes == 0)
+                return 0;
+            /* Take care of padding
+             * bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1);
+             * in the alignment policy.
+             * bytes == pagesize must be handled by allocChunked(), else the maxchunksize calculation based
+             * on the waste factor collides with the allocation schema in allocPageBased().
+             */
+            if(bytes <= pagesize)
+                // chunk based
+                return allocChunked(acc, bytes);
+            else
+                // allocate a range of pages
+                return allocPageBased(acc, bytes);
+        }
+
+        /**
+         * destroy frees the memory regions previously allocated via create
+         * @param mem pointer to the memory region to free
+         */
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC void destroy(AlpakaAcc const& acc, void* mem)
+        {
+            if(mem == 0)
+                return;
+
+            // CAUTION: This memfence is of utmost importance! As we are allowing a re-use of the chunk we're
+            // about to free, we need to make sure that any memory operation from the previous thread is
+            // executed before we can safely consider it free. If this is missing, an extended (non-atomic)
+            // write operation might not yet have finished when we unset the bit. In such a case, another
+            // thread might start using the memory while we're still writing to it, thus corrupting the new
+            // thread's data. It might even lead to us overwriting the bitmask itself, if the chunk size (and
+            // thereby the extent of the bitmask) changes before we finish. (The latter scenario might be
+            // excluded by other mem_fences in the code.) If a read is pending, the old thread might read data
+            // from the new thread leading to inconsistent information in the first thread.
+            alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
+
+            // let's see which page we are on
+            auto const page = static_cast<uint32>(((char*) mem - (char*) _page) / pagesize);
+            /* Emulate atomic read.
+ * In older implementations we read the chunksize without atomics which can result in data races. + */ + uint32 const chunksize + = alpaka::atomicOp(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u); + + // is the pointer the beginning of a chunk? + auto const inpage_offset = static_cast((char*) mem - _page[page].data); + uint32 const block = inpage_offset / chunksize; + uint32 const inblockoffset = inpage_offset - block * chunksize; + if(inblockoffset != 0) + { + uint32* counter = (uint32*) (_page[page].data + block * chunksize); + // coalesced mem free + + uint32 const old = alpaka::atomicOp(acc, counter, 1u); + if(old != 1) + return; + mem = (void*) counter; + } + + if(chunksize <= pagesize) + deallocChunked(acc, mem, page, chunksize); + else + deallocPageBased(acc, page, chunksize); + } + + /** + * init inits the heap data structures + * the init method must be called before the heap can be used. the + * method can be called with an arbitrary number of threads, which + * will increase the inits efficiency + * @param memory pointer to the memory used for the heap + * @param memsize size of the memory in bytes + */ + template + ALPAKA_FN_ACC void initDeviceFunction(AlpakaAcc const& acc, void* memory, size_t memsize) + { + auto const linid = alpaka::getIdx(acc).sum(); + auto const totalThreads = alpaka::getWorkDiv(acc).prod(); + + uint32 numregions = ((unsigned long long) memsize) + / (((unsigned long long) regionsize) * (sizeof(PTE) + pagesize) + sizeof(uint32)); + + uint32 numpages = numregions * regionsize; + // pointer is copied (copy is called page) + Page* page = (Page*) memory; + + // We have to calculate these values here, before using them for other things. + // First calculate how many blocks of the given size fit our memory pages in principle. + // However, we do not have to use the exact requested block size. + // So we redistribute actual memory between the chosen number of blocks + // and ensure that all blocks have the same number of regions. + auto const memorysize = static_cast(numpages) * pagesize; + auto const numblocks = memorysize / accessblocksize; + auto const memoryperblock = memorysize / numblocks; + auto const pagesperblock = memoryperblock / pagesize; + auto const regionsperblock = pagesperblock / regionsize; + numregions = numblocks * regionsperblock; + numpages = numregions * regionsize; + + PTE* ptes = (PTE*) (page + numpages); + uint32* regions = (uint32*) (ptes + numpages); + // sec check for mem size + // this check refers to the original memory-pointer, which was + // not adjusted! 
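+            /* Resulting layout of the heap inside the pool (illustrative summary, not from the original
+             * source):
+             *
+             *   [ Page 0 | ... | Page numpages-1 | PTE 0 | ... | PTE numpages-1 | region fill counters ]
+             *
+             * i.e. the page data comes first, followed by one PTE per page and one 32-bit fill counter per
+             * region, which is why the check below compares the end of the region counters against the end
+             * of the pool.
+             */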
+ if((char*) (regions + numregions) > (((char*) memory) + memsize)) + { + --numregions; + numpages = alpaka::math::min(acc, numregions * regionsize, numpages); + if(linid == 0) + printf("c Heap Warning: needed to reduce number of " + "regions to stay within memory limit\n"); + } + // Recalculate since numpages could have changed + ptes = (PTE*) (page + numpages); + regions = (uint32*) (ptes + numpages); + + for(uint32 i = linid; i < numpages; i += totalThreads) + { + ptes[i].init(); + page[i].init(); + } + for(uint32 i = linid; i < numregions; i += totalThreads) + regions[i] = 0; + + if(linid == 0) + { + _memsize = memsize; + _numpages = numpages; + _accessblocks = numblocks; + _ptes = (volatile PTE*) ptes; + _page = page; + _regions = regions; + _firstfreeblock = 0; + _pagebasedMutex = 0; + _firstFreePageBased = numpages - 1; + + if((char*) &_page[numpages] > (char*) memory + memsize) + printf("error in heap alloc: numpages too high\n"); + } + } + + static ALPAKA_FN_ACC auto isOOM(void* p, size_t s) -> bool + { + // one thread that requested memory returned null + return s && (p == nullptr); + } + + template + static void initHeap( + AlpakaDevice& dev, + AlpakaQueue& queue, + T_DeviceAllocator* heap, + void* pool, + size_t memsize) + { + if(pool == nullptr && memsize != 0) + { + throw std::invalid_argument("Scatter policy cannot use nullptr for non-empty " + "memory pools. " + "Maybe you are using an incompatible ReservePoolPolicy " + "or AlignmentPolicy."); + } + auto initKernel = [] ALPAKA_FN_ACC( + AlpakaAcc const& m_acc, + T_DeviceAllocator* m_heap, + void* m_heapmem, + size_t m_memsize) { m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize); }; + using Dim = typename alpaka::trait::DimType::type; + using Idx = typename alpaka::trait::IdxType::type; + using VecType = alpaka::Vec; + + auto threadsPerBlock = VecType::ones(); + + auto const devProps = alpaka::getAccDevProps(dev); + + threadsPerBlock[Dim::value - 1] + = std::min(static_cast(256u), static_cast(devProps.m_blockThreadCountMax)); + + auto const workDiv = alpaka::WorkDivMembers{ + VecType::ones(), + threadsPerBlock, + VecType::ones()}; // Dim may be any dimension, but workDiv is 1D + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, initKernel, heap, pool, memsize)); + } + + /** counts how many elements of a size fit inside a given page + * + * Examines a (potentially already used) page to find how many + * elements of size chunksize still fit on the page. This includes + * hierarchically organized pages and empty pages. The algorithm + * determines the number of chunks in the page in a manner similar + * to the allocation algorithm of CreationPolicies::Scatter. + * + * @param page the number of the page to examine. The page needs to + * be formatted with a chunksize and potentially a hierarchy. + * @param chunksize the size of element that should be placed inside + * the page. This size must be appropriate to the formatting of the + * page. 
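+         *
+         * A worked example (illustrative, not from the original docs; assuming pagesize = 4096): for
+         * chunksize = 16, segmentsize = 16 * 32 + 4 = 516 and fullsegments = min(32, 4096 / 516) = 7;
+         * calcAdditionalChunks() then yields (4096 - 7 * 516 - 4) / 16 = 30, so an empty page offers
+         * 7 * 32 + 30 = 254 chunks of 16 bytes.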
+ */ + template + ALPAKA_FN_ACC auto countFreeChunksInPage(AlpakaAcc const& acc, uint32 page, uint32 chunksize) -> unsigned + { + uint32 const filledChunks = _ptes[page].count; + if(chunksize <= HierarchyThreshold) + { + uint32 const segmentsize = chunksize * 32 + sizeof(uint32); // each segment can hold 32 + // 2nd-level chunks + uint32 const fullsegments = alpaka::math::min( + acc, + 32u, + pagesize / segmentsize); // there might be space for + // more than 32 segments + // with 32 2nd-level chunks + uint32 const additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize); + uint32 const level2Chunks = fullsegments * 32 + additional_chunks; + return level2Chunks - filledChunks; + } + else + { + uint32 const chunksinpage = alpaka::math::min( + acc, + pagesize / chunksize, + 32u); // without hierarchy, there can not be more than + // 32 chunks + return chunksinpage - filledChunks; + } + } + + /** counts the number of available slots inside the heap + * + * Searches the heap for all possible locations of an element with + * size slotSize. The used traversal algorithms are similar to the + * allocation strategy of CreationPolicies::Scatter, to ensure + * comparable results. There are 3 different algorithms, based on + * the size of the requested slot: 1 slot spans over multiple pages, + * 1 slot fits in one chunk within a page, 1 slot fits in a fraction + * of a chunk. + * + * @param slotSize the amount of bytes that a single slot accounts + * for + * @param gid the id of the thread. this id does not have to + * correspond with threadId.x, but there must be a continous range + * @param stride the stride should be equal to the number of + * different gids (and therefore of value max(gid)-1) + */ + template + ALPAKA_FN_ACC auto getAvailaibleSlotsDeviceFunction( + AlpakaAcc const& acc, + size_t slotSize, + uint32 gid, + uint32 stride) -> unsigned + { + unsigned slotcount = 0; + if(slotSize < pagesize) + { // multiple slots per page + for(uint32 currentpage = gid; currentpage < _numpages; currentpage += stride) + { + uint32 const maxchunksize = alpaka::math::min(acc, +pagesize, wastefactor * (uint32) slotSize); + + uint32 chunksize = _ptes[currentpage].chunksize; + if(chunksize >= slotSize && chunksize <= maxchunksize) + { // how many chunks left? (each chunk is big enough) + slotcount += countFreeChunksInPage(acc, currentpage, chunksize); + } + else if(chunksize == 0) + { + chunksize = alpaka::math::max( + acc, + (uint32) slotSize, + T_AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size + slotcount += countFreeChunksInPage( + acc, + currentpage, + chunksize); // how many chunks fit in one page? + } + else + { + continue; // the chunks on this page are too small + // for the request :( + } + } + } + else + { // 1 slot needs multiple pages + if(gid > 0) + return 0; // do this serially + uint32 const pagestoalloc = ceilingDivision((uint32) slotSize, pagesize); + uint32 freecount = 0; + for(uint32 currentpage = _numpages; currentpage > 0;) + { // this already includes all superblocks + --currentpage; + if(_ptes[currentpage].chunksize == 0) + { + if(++freecount == pagestoalloc) + { + freecount = 0; + ++slotcount; + } + } + else + { // the sequence of free pages was interrupted + freecount = 0; + } + } + } + return slotcount; + } + + /** Count, how many elements can be allocated at maximum + * + * Takes an input size and determines, how many elements of this + * size can be allocated with the CreationPolicy Scatter. 
This will
+         * return the maximum number of free slots of the indicated size. It
+         * is not guaranteed where these slots are (regarding
+         * fragmentation). Therefore, the practically usable number of slots
+         * might be smaller. This function is executed in parallel. Speedup
+         * can possibly be increased by a higher number of parallel workers.
+         *
+         * @param slotSize the size of allocatable elements to count
+         * @param heap pointer to the allocator instance (host-side)
+         */
+
+    public:
+        template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
+        static auto getAvailableSlotsHost(
+            AlpakaDevice& dev,
+            AlpakaQueue& queue,
+            size_t const slotSize,
+            T_DeviceAllocator* heap) -> unsigned
+        {
+            auto d_slots = alpaka::allocBuf<unsigned, int>(dev, 1);
+            alpaka::memset(queue, d_slots, 0, 1);
+
+            auto getAvailableSlotsKernel = [] ALPAKA_FN_ACC(
+                                               AlpakaAcc const& acc,
+                                               T_DeviceAllocator* heapPtr,
+                                               size_t numBytes,
+                                               unsigned* slots) -> void
+            {
+                auto const gid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc).sum();
+
+                auto const nWorker = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc).prod();
+                unsigned const temp
+                    = heapPtr->template getAvailaibleSlotsDeviceFunction<AlpakaAcc>(acc, numBytes, gid, nWorker);
+                if(temp)
+                    alpaka::atomicOp<alpaka::AtomicAdd>(acc, slots, temp);
+            };
+
+            using Dim = typename alpaka::trait::DimType<AlpakaAcc>::type;
+            using Idx = typename alpaka::trait::IdxType<AlpakaAcc>::type;
+
+            using VecType = alpaka::Vec<Dim, Idx>;
+
+            auto numBlocks = VecType::ones();
+            numBlocks[Dim::value - 1] = 64u;
+            auto threadsPerBlock = VecType::ones();
+
+            auto const devProps = alpaka::getAccDevProps<AlpakaAcc>(dev);
+
+            threadsPerBlock[Dim::value - 1]
+                = std::min(static_cast<Idx>(256u), static_cast<Idx>(devProps.m_blockThreadCountMax));
+
+            auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+                numBlocks,
+                threadsPerBlock,
+                VecType::ones()}; // Dim may be any dimension, but workDiv is 1D
+
+            alpaka::enqueue(
+                queue,
+                alpaka::createTaskKernel<AlpakaAcc>(
+                    workDiv,
+                    getAvailableSlotsKernel,
+                    heap,
+                    slotSize,
+                    alpaka::getPtrNative(d_slots)));
+
+            auto const platform = alpaka::Platform<alpaka::DevCpu>{};
+            auto const hostDev = alpaka::getDevByIdx(platform, 0);
+
+            auto h_slots = alpaka::allocBuf<unsigned, int>(hostDev, 1);
+            alpaka::memcpy(queue, h_slots, d_slots, 1);
+            alpaka::wait(queue);
+
+            return *alpaka::getPtrNative(h_slots);
+        }
+
+        /** Count, how many elements can be allocated at maximum
+         *
+         * Takes an input size and determines, how many elements of this
+         * size can be allocated with the CreationPolicy Scatter. This will
+         * return the maximum number of free slots of the indicated size. It
+         * is not guaranteed where these slots are (regarding
+         * fragmentation). Therefore, the practically usable number of slots
+         * might be smaller. This function is executed separately for each
+         * warp and does not cooperate with other warps. Maximum speed is
+         * expected if every thread in the warp executes the function. Uses
+         * 256 bytes of shared memory.
+ * + * @param slotSize the size of allocatable elements to count + */ + template + ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(AlpakaAcc const& acc, size_t slotSize) -> unsigned + { + int const wId = warpid_withinblock(acc); // do not use warpid-function, since + // this value is not guaranteed to + // be stable across warp lifetime + + uint32 const activeThreads = alpaka::popcount(acc, alpaka::warp::activemask(acc)); + + constexpr auto warpsize = warpSize; + auto& activePerWarp = alpaka::declareSharedVar< + std::uint32_t[maxThreadsPerBlock / warpsize], + __COUNTER__>(acc); // maximum number of warps in a block + + auto& warpResults + = alpaka::declareSharedVar], __COUNTER__>(acc); + + warpResults[wId] = 0; + activePerWarp[wId] = 0; + + // wait that all shared memory is initialized + alpaka::syncBlockThreads(acc); + + // the active threads obtain an id from 0 to activeThreads-1 + if(slotSize == 0) + return 0; + auto const linearId = alpaka::atomicOp(acc, &activePerWarp[wId], 1u); + + // printf("Block %d, id %d: activeThreads=%d + // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); + unsigned const temp + = this->template getAvailaibleSlotsDeviceFunction(acc, slotSize, linearId, activeThreads); + if(temp) + alpaka::atomicOp(acc, &warpResults[wId], temp); + + alpaka::syncBlockThreads(acc); + alpaka::mem_fence(acc, alpaka::memory_scope::Block{}); + + return warpResults[wId]; + } + + static auto classname() -> std::string + { + std::stringstream ss; + ss << "Scatter["; + ss << "pagesize=" << pagesize << ","; + ss << "accessblocksize=" << accessblocksize << ","; + ss << "regionsize=" << regionsize << ","; + ss << "wastefactor=" << wastefactor << ","; + ss << "resetfreedpages=" << resetfreedpages << ","; + ss << "minChunkSize=" << minChunkSize << ","; + ss << "HierarchyThreshold=" << HierarchyThreshold << ","; + ss << "hashingK=" << hashingK << ","; + ss << "hashingDistMP=" << hashingDistMP << ","; + ss << "hashingDistWP=" << hashingDistWP << ","; + ss << "hashingDistWPRel=" << hashingDistWPRel << "]"; + return ss.str(); + } + }; + + template + struct Scatter + { + template + using AlignmentAwarePolicy = ScatterImpl; + }; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/include/mallocMC/device_allocator.hpp b/include/mallocMC/device_allocator.hpp new file mode 100644 index 0000000000..0f6fe090d0 --- /dev/null +++ b/include/mallocMC/device_allocator.hpp @@ -0,0 +1,122 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + https://www.hzdr.de/crp + + Copyright 2014 - 2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian J. Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "mallocMC_traits.hpp" + +#include + +#include +#include + +namespace mallocMC +{ + /** + * @brief "HostClass" that combines all policies to a useful allocator + * + * This class implements the necessary glue-logic to form an actual + * allocator from the provided policies. It implements the public interface + * and executes some constraint checking based on an instance of the class + * PolicyConstraints. + * + * @tparam T_CreationPolicy The desired type of a CreationPolicy + * @tparam T_DistributionPolicy The desired type of a DistributionPolicy + * @tparam T_OOMPolicy The desired type of a OOMPolicy + * @tparam T_ReservePoolPolicy The desired type of a ReservePoolPolicy + * @tparam T_AlignmentPolicy The desired type of a AlignmentPolicy + */ + template< + typename T_CreationPolicy, + typename T_DistributionPolicy, + typename T_OOMPolicy, + typename T_AlignmentPolicy> + class DeviceAllocator : public T_CreationPolicy::template AlignmentAwarePolicy + { + using uint32 = std::uint32_t; + + public: + using CreationPolicy = T_CreationPolicy; + using DistributionPolicy = T_DistributionPolicy; + using OOMPolicy = T_OOMPolicy; + using AlignmentPolicy = T_AlignmentPolicy; + + template + ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t bytes) -> void* + { + if(bytes == 0U) + { + return nullptr; + } + bytes = AlignmentPolicy::applyPadding(bytes); + DistributionPolicy distributionPolicy(acc); + uint32 const req_size = distributionPolicy.collect(acc, bytes); + void* memBlock = CreationPolicy::template AlignmentAwarePolicy::create(acc, req_size); + if(CreationPolicy::isOOM(memBlock, req_size)) + { + memBlock = OOMPolicy::handleOOM(memBlock); + } + return distributionPolicy.distribute(acc, memBlock); + } + + template + ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* pointer) + { + if(pointer != nullptr) + { + CreationPolicy::template AlignmentAwarePolicy::destroy(acc, pointer); + } + } + + /** Provide the number of available free slots. + * + * @tparam AlpakaAcc The type of the Allocator to be used + * @param acc alpaka accelerator + * @param slotSize assumed allocation size in bytes + * @return number of free slots of the given size, if creation policy is not providing the information on the + * device side 0 will be returned. + */ + template + ALPAKA_FN_ACC auto getAvailableSlots(AlpakaAcc const& acc, size_t slotSize) -> unsigned + { + slotSize = AlignmentPolicy::applyPadding(slotSize); + if constexpr(Traits::providesAvailableSlots) + { + return CreationPolicy::template AlignmentAwarePolicy::getAvailableSlotsAccelerator( + acc, + slotSize); + } + else + { + return 0U; + } + } + }; + +} // namespace mallocMC diff --git a/include/mallocMC/distributionPolicies/Noop.hpp b/include/mallocMC/distributionPolicies/Noop.hpp new file mode 100644 index 0000000000..98b2968e4d --- /dev/null +++ b/include/mallocMC/distributionPolicies/Noop.hpp @@ -0,0 +1,77 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. 
+ + Copyright 2014-2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "Noop.hpp" + +#include + +#include +#include + +namespace mallocMC +{ + namespace DistributionPolicies + { + /** + * @brief a policy that does nothing + * + * This DistributionPolicy will not perform any distribution, but only + * return its input (identity function) + */ + class Noop + { + using uint32 = std::uint32_t; + + public: + template + ALPAKA_FN_ACC Noop(AlpakaAcc const& /*acc*/) + { + } + + template + ALPAKA_FN_ACC auto collect(AlpakaAcc const& /*acc*/, uint32 bytes) const -> uint32 + { + return bytes; + } + + template + ALPAKA_FN_ACC auto distribute(AlpakaAcc const& /*acc*/, void* allocatedMem) const -> void* + { + return allocatedMem; + } + + static auto classname() -> std::string + { + return "Noop"; + } + }; + + } // namespace DistributionPolicies +} // namespace mallocMC diff --git a/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/include/mallocMC/distributionPolicies/XMallocSIMD.hpp new file mode 100644 index 0000000000..cd8a30eb0e --- /dev/null +++ b/include/mallocMC/distributionPolicies/XMallocSIMD.hpp @@ -0,0 +1,194 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + http://www.icg.tugraz.at/project/mvp + + Copyright (C) 2012 Institute for Computer Graphics and Vision, + Graz University of Technology + Copyright (C) 2014-2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at + Rene Widera - r.widera ( at ) hzdr.de + Axel Huebl - a.huebl ( at ) hzdr.de + Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "../mallocMC_utils.hpp" +#include "XMallocSIMD.hpp" + +#include +#include + +#include +#include +#include +#include + +namespace mallocMC +{ + namespace DistributionPolicies + { + namespace XMallocSIMDConf + { + struct DefaultXMallocConfig + { + static constexpr auto pagesize = 4096; + }; + } // namespace XMallocSIMDConf + + /** + * @brief SIMD optimized chunk resizing in the style of XMalloc + * + * This DistributionPolicy can take the memory requests from a group of + * worker threads and combine them, so that only one of the workers will + * allocate the whole request. Later, each worker gets an appropriate + * offset into the allocated chunk. This is beneficial for SIMD + * architectures since only one of the workers has to compete for the + * resource. This algorithm is inspired by the XMalloc memory allocator + * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1) + * and its implementation in ScatterAlloc + * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604) + * XMallocSIMD is inteded to be used with Nvidia CUDA capable + * accelerators that support at least compute capability 2.0 + * + * @tparam T_Config (optional) The configuration struct to overwrite + * default configuration. The default can be obtained through + * XMallocSIMD<>::Properties + */ + template + class XMallocSIMD + { + private: + using uint32 = std::uint32_t; + bool can_use_coalescing; + uint32 warpid; + uint32 myoffset; + uint32 threadcount; + uint32 req_size; + + public: + using Properties = T_Config; + + template + ALPAKA_FN_ACC XMallocSIMD(AlpakaAcc const& acc) + : can_use_coalescing(false) + , warpid(warpid_withinblock(acc)) + , myoffset(0) + , threadcount(0) + , req_size(0) + { + } + + private: +/** Allow for a hierarchical validation of parameters: + * + * shipped default-parameters (in the inherited struct) have lowest precedence. + * They will be overridden by a given configuration struct. However, even the + * given configuration struct can be overridden by compile-time command line + * parameters (e.g. 
-D MALLOCMC_DP_XMALLOCSIMD_PAGESIZE 1024) + * + * default-struct < template-struct < command-line parameter + */ +#ifndef MALLOCMC_DP_XMALLOCSIMD_PAGESIZE +# define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE (Properties::pagesize) +#endif + static constexpr uint32 pagesize = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE; + + public: + static constexpr uint32 _pagesize = pagesize; + + template + ALPAKA_FN_ACC auto collect(AlpakaAcc const& acc, uint32 bytes) -> uint32 + { + can_use_coalescing = false; + myoffset = 0; + threadcount = 0; + + // init with initial counter + auto& warp_sizecounter + = alpaka::declareSharedVar()], __COUNTER__>( + acc); + warp_sizecounter[warpid] = 16; + + // second half: make sure that all coalesced allocations can fit + // within one page necessary for offset calculation + bool const coalescible = bytes > 0 && bytes < (pagesize / 32); + +#if (MALLOCMC_DEVICE_COMPILE) + threadcount = alpaka::popcount(alpaka::warp::ballot(acc, coalescible)); +#else + threadcount = 1; // TODO +#endif + if(coalescible && threadcount > 1) + { + myoffset = alpaka::atomicOp(acc, &warp_sizecounter[warpid], bytes); + can_use_coalescing = true; + } + + req_size = bytes; + if(can_use_coalescing) + req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0; + + return req_size; + } + + template + ALPAKA_FN_ACC auto distribute(AlpakaAcc const& acc, void* allocatedMem) -> void* + { + auto& warp_res + = alpaka::declareSharedVar()], __COUNTER__>(acc); + + char* myalloc = (char*) allocatedMem; + if(req_size && can_use_coalescing) + { + warp_res[warpid] = myalloc; + if(myalloc != 0) + *(uint32*) myalloc = threadcount; + } + + threadfenceBlock(acc); + + void* myres = myalloc; + if(can_use_coalescing) + { + if(warp_res[warpid] != 0) + myres = warp_res[warpid] + myoffset; + else + myres = 0; + } + return myres; + } + + ALPAKA_FN_HOST + static auto classname() -> std::string + { + std::stringstream ss; + ss << "XMallocSIMD[" << pagesize << "]"; + return ss.str(); + } + }; + + } // namespace DistributionPolicies + +} // namespace mallocMC diff --git a/include/mallocMC/mallocMC.cuh b/include/mallocMC/mallocMC.cuh new file mode 100644 index 0000000000..eec79ac63c --- /dev/null +++ b/include/mallocMC/mallocMC.cuh @@ -0,0 +1,184 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + https://www.hzdr.de/crp + + Copyright 2025 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include "mallocMC/alignmentPolicies/Shrink.hpp" +#include "mallocMC/creationPolicies/FlatterScatter.hpp" +#include "mallocMC/reservePoolPolicies/AlpakaBuf.hpp" + +#include +#include + +#include + +namespace mallocMC +{ + // This namespace implements an alpaka-agnostic interface by choosing some reasonable defaults working fine for + // CUDA devices. Further below, we export the necessary names to the global mallocMC:: namespace. See below if + // you're only interested in usage. Look inside if you want to understand what we've done here or want to port this + // to other architectures. + namespace detail + { + using Dim = alpaka::DimInt<1>; + using Idx = std::uint32_t; + using Acc = alpaka::AccGpuCudaRt; + + // Hide the alpaka-specific Acc argument of `ReservePoolPolicies::AlpakaBuf`. + using CudaAlpakaBuf = ReservePoolPolicies::AlpakaBuf; + + /** + * @brief Allocator template with hidden alpaka-specifics. + */ + template< + typename T_CreationPolicy = CreationPolicies::FlatterScatter<>, + typename T_DistributionPolicy = DistributionPolicies::Noop, + typename T_OOMPolicy = OOMPolicies::ReturnNull, + typename T_ReservePoolPolicy = CudaAlpakaBuf, + typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>> + using CudaAllocator = Allocator< + alpaka::AccToTag, + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy>; + + /** + * @brief Host-side infrastructure needed for setting up everything. + * + * You need to create an instance of this on the host. It provides the alpaka infrastructure and sets up + * everything on the device side, so you can get started allocating stuff. + */ + template< + typename T_CreationPolicy = CreationPolicies::FlatterScatter<>, + typename T_DistributionPolicy = DistributionPolicies::Noop, + typename T_OOMPolicy = OOMPolicies::ReturnNull, + typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf, + typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>> + struct CudaHostInfrastructure + { + using MyAllocatorType = CudaAllocator< + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy>; + + // Keep this first, so compiler-generated constructors can be called as just + // CudaHostInfrastructure<>{heapSize}; + size_t heapSize{}; + + // All of this is necessary alpaka infrastructure. + alpaka::Platform const platform{}; + std::remove_cv_t const dev{alpaka::getDevByIdx(platform, 0)}; + alpaka::Queue queue{dev}; + + // This is our actual host-side instance of the allocator. It sets up everything on the device and provides + // the handle that we can pass to kernels. + MyAllocatorType hostInstance{dev, queue, heapSize}; + }; + + /** + * @brief Memory manager to pass to kernels. + * + * Create this on the host and pass it to your kernels. It's a lightweight object barely more than a pointer, + * so you can just copy it around as needed. Its main purpose is to provide an alpaka-agnostic interface by + * adding an accelerator internally before forwarding malloc/free calls to mallocMC. 
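+         *
+         * A minimal usage sketch (the kernel name and launch configuration are illustrative, not part of
+         * the original interface):
+         * @code
+         * __global__ void takeSomeMemory(mallocMC::CudaMemoryManager<> manager)
+         * {
+         *     int* numbers = static_cast<int*>(manager.malloc(10U * sizeof(int)));
+         *     // ... use numbers ...
+         *     manager.free(numbers);
+         * }
+         *
+         * mallocMC::CudaHostInfrastructure<> hostInfrastructure{1024U * 1024U}; // 1 MiB heap
+         * takeSomeMemory<<<1, 32>>>(mallocMC::CudaMemoryManager<>{hostInfrastructure});
+         * @endcode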
+ */ + template< + typename T_CreationPolicy = CreationPolicies::FlatterScatter<>, + typename T_DistributionPolicy = DistributionPolicies::Noop, + typename T_OOMPolicy = OOMPolicies::ReturnNull, + typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf, + typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>> + struct CudaMemoryManager + { + using MyHostInfrastructure = CudaHostInfrastructure< + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy>; + + /** + * @brief Construct the memory manager from the host infrastructure. + * + * @param hostInfrastructure Reference to the host infrastructure. + */ + explicit CudaMemoryManager(MyHostInfrastructure const& hostInfrastructure) + : deviceHandle(hostInfrastructure.hostInstance.getAllocatorHandle()) + { + } + + /** + * @brief Allocates memory on the device. + * + * @param size Size of the memory to allocate. + * @return Pointer to the allocated memory. + */ + __device__ __forceinline__ void* malloc(size_t size) + { + // This is cheating a tiny little bit. The accelerator could, in general, be a stateful object but + // concretely for CUDA and HIP it just forwards to the corresponding API calls, so it doesn't actually + // carry any information by itself. We're rather using it as a tag here. + std::array fakeAccMemory{}; + return deviceHandle.malloc(*reinterpret_cast(fakeAccMemory.data()), size); + } + + /** + * @brief Frees memory on the device. + * + * @param ptr Pointer to the memory to free. + */ + __device__ __forceinline__ void free(void* ptr) + { + std::array fakeAccMemory{}; + deviceHandle.free(*reinterpret_cast(fakeAccMemory.data()), ptr); + } + + /** + * @brief Handle to the device allocator. + * + * This is what actually does the work in mallocMC. We forward all our calls to this. + */ + MyHostInfrastructure::MyAllocatorType::AllocatorHandle deviceHandle; + }; + } // namespace detail + + // Use the following in your native CUDA code and you are good to go! All alpaka-specific interfaces are patched + // away. + using detail::CudaAllocator; + using detail::CudaHostInfrastructure; + using detail::CudaMemoryManager; + + namespace ReservePoolPolicies + { + // This is provided because the original ReservePoolPolicies::AlpakaBuf takes an alpaka::Acc tag as template + // argument. In contrast, this is alpaka-agnostic. + using detail::CudaAlpakaBuf; + } // namespace ReservePoolPolicies +} // namespace mallocMC diff --git a/include/mallocMC/mallocMC.hpp b/include/mallocMC/mallocMC.hpp new file mode 100644 index 0000000000..41bf488209 --- /dev/null +++ b/include/mallocMC/mallocMC.hpp @@ -0,0 +1,59 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. 
+ http://www.icg.tugraz.at/project/mvp + https://www.hzdr.de/crp + + Copyright (C) 2012 Institute for Computer Graphics and Vision, + Graz University of Technology + Copyright (C) 2014-2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at + Bernhard Kainz - kainz ( at ) icg.tugraz.at + Michael Kenzel - kenzel ( at ) icg.tugraz.at + Rene Widera - r.widera ( at ) hzdr.de + Axel Huebl - a.huebl ( at ) hzdr.de + Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +// generic stuff +#include "version.hpp" + +// core functionality +#include "mallocMC_hostclass.hpp" + +// all the policies +#include "alignmentPolicies/Noop.hpp" +#include "alignmentPolicies/Shrink.hpp" +#include "creationPolicies/FlatterScatter.hpp" +#include "creationPolicies/GallatinCuda.hpp" +#include "creationPolicies/OldMalloc.hpp" +#include "creationPolicies/Scatter.hpp" +#include "distributionPolicies/Noop.hpp" +#include "distributionPolicies/XMallocSIMD.hpp" +#include "oOMPolicies/BadAllocException.hpp" +#include "oOMPolicies/ReturnNull.hpp" +#include "reservePoolPolicies/AlpakaBuf.hpp" +#include "reservePoolPolicies/CudaSetLimits.hpp" +#include "reservePoolPolicies/Noop.hpp" diff --git a/include/mallocMC/mallocMC_allocator_handle.hpp b/include/mallocMC/mallocMC_allocator_handle.hpp new file mode 100644 index 0000000000..1da222fa4d --- /dev/null +++ b/include/mallocMC/mallocMC_allocator_handle.hpp @@ -0,0 +1,65 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + https://www.hzdr.de/crp + + Copyright 2014 - 2015 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+namespace mallocMC
+{
+    template<typename T_HostAllocator>
+    struct AllocatorHandleImpl
+    {
+        using DevAllocator = typename T_HostAllocator::DevAllocator;
+
+        DevAllocator* devAllocator;
+
+        explicit AllocatorHandleImpl(DevAllocator* p) : devAllocator(p)
+        {
+        }
+
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t size) -> void*
+        {
+            return devAllocator->malloc(acc, size);
+        }
+
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* p)
+        {
+            devAllocator->free(acc, p);
+        }
+
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto getAvailableSlots(AlpakaAcc const& acc, size_t slotSize) -> unsigned
+        {
+            return devAllocator->getAvailableSlots(acc, slotSize);
+        }
+    };
+
+} // namespace mallocMC
diff --git a/include/mallocMC/mallocMC_constraints.hpp b/include/mallocMC/mallocMC_constraints.hpp
new file mode 100644
index 0000000000..0fb2099dfe
--- /dev/null
+++ b/include/mallocMC/mallocMC_constraints.hpp
@@ -0,0 +1,91 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2014-2024 Institute of Radiation Physics,
+                      Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "creationPolicies/Scatter.hpp"
+#include "distributionPolicies/XMallocSIMD.hpp"
+
+namespace mallocMC
+{
+    /** The default PolicyCheckers (they always succeed)
+     */
+    template<typename T_Policy1>
+    class PolicyCheck1
+    {
+    };
+
+    template<typename T_Policy1, typename T_Policy2>
+    class PolicyCheck2
+    {
+    };
+
+    template<typename T_Policy1, typename T_Policy2, typename T_Policy3>
+    class PolicyCheck3
+    {
+    };
+
+    template<typename T_Policy1, typename T_Policy2, typename T_Policy3, typename T_Policy4>
+    class PolicyCheck4
+    {
+    };
+
+    template<typename T_Policy1, typename T_Policy2, typename T_Policy3, typename T_Policy4, typename T_Policy5>
+    class PolicyCheck5
+    {
+    };
+
+    /** Enforces constraints on policies or combinations of policies
+     *
+     * Uses template specialization of PolicyChecker
+     */
+    template<
+        typename T_CreationPolicy,
+        typename T_DistributionPolicy,
+        typename T_OOMPolicy,
+        typename T_GetHeapPolicy,
+        typename T_AlignmentPolicy>
+    class PolicyConstraints : PolicyCheck2<T_CreationPolicy, T_DistributionPolicy>
+    {
+    };
+
+    /** Scatter and XMallocSIMD need the same pagesize!
+     *
+     * This constraint ensures that if the CreationPolicy "Scatter" and the
+     * DistributionPolicy "XMallocSIMD" are selected, they are configured to use
+     * the same value for their "pagesize"-parameter.
+     */
+    template<typename x, typename y, typename z>
+    class PolicyCheck2<CreationPolicies::Scatter<x, y>, DistributionPolicies::XMallocSIMD<z>>
+    {
+        static_assert(x::pagesize == z::pagesize, "Pagesize must be the same when combining Scatter and XMallocSIMD");
+    };
+
+} // namespace mallocMC
diff --git a/include/mallocMC/mallocMC_hostclass.hpp b/include/mallocMC/mallocMC_hostclass.hpp
new file mode 100644
index 0000000000..48bc1f748b
--- /dev/null
+++ b/include/mallocMC/mallocMC_hostclass.hpp
@@ -0,0 +1,33 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2014 - 2015 Institute of Radiation Physics,
+                        Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "allocator.hpp"
+#include "device_allocator.hpp"
+#include "mallocMC_traits.hpp"
diff --git a/include/mallocMC/mallocMC_traits.hpp b/include/mallocMC/mallocMC_traits.hpp
new file mode 100644
index 0000000000..091687e149
--- /dev/null
+++ b/include/mallocMC/mallocMC_traits.hpp
@@ -0,0 +1,39 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2014 - 2024 Institute of Radiation Physics,
+                        Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+namespace mallocMC
+{
+    template<typename T_Allocator>
+    struct Traits
+    {
+        static constexpr bool providesAvailableSlots = T_Allocator::CreationPolicy::providesAvailableSlots;
+    };
+} // namespace mallocMC
diff --git a/include/mallocMC/mallocMC_utils.hpp b/include/mallocMC/mallocMC_utils.hpp
new file mode 100644
index 0000000000..b8b6d49cc0
--- /dev/null
+++ b/include/mallocMC/mallocMC_utils.hpp
@@ -0,0 +1,216 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  http://www.icg.tugraz.at/project/mvp
+  https://www.hzdr.de/crp
+
+  Copyright (C) 2012 Institute for Computer Graphics and Vision,
+                     Graz University of Technology
+  Copyright (C) 2014-2024 Institute of Radiation Physics,
+                          Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
+             Michael Kenzel - kenzel ( at ) icg.tugraz.at
+             Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <cstdint>
+
+#ifdef _MSC_VER
+# include <intrin.h>
+#endif
+
+#include <iterator>
+#include <type_traits>
+
+/* HIP-clang wrongly takes the host path of the code when __HIP_DEVICE_COMPILE__
+ * alone is used to detect the device compile path.
+ * Since we require devices with support for ballot, we can hijack __HIP_ARCH_HAS_WARP_BALLOT__.
+ */
+#if (defined(__HIP_ARCH_HAS_WARP_BALLOT__) || defined(__CUDA_ARCH__) || __HIP_DEVICE_COMPILE__ == 1)
+# define MALLOCMC_DEVICE_COMPILE 1
+#endif
+
+namespace mallocMC
+{
+
+    template<typename T_AlpakaAcc>
+    constexpr uint32_t warpSize = 1U;
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename T_Dim, typename T_Idx>
+    constexpr uint32_t warpSize<alpaka::AccGpuCudaRt<T_Dim, T_Idx>> = 32U;
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+# if (HIP_VERSION_MAJOR >= 4)
+    template<typename T_Dim, typename T_Idx>
+    constexpr uint32_t warpSize<alpaka::AccGpuHipRt<T_Dim, T_Idx>> = __AMDGCN_WAVEFRONT_SIZE;
+# else
+    template<typename T_Dim, typename T_Idx>
+    constexpr uint32_t warpSize<alpaka::AccGpuHipRt<T_Dim, T_Idx>> = 64;
+# endif
+#endif
+
+    ALPAKA_FN_ACC inline auto laneid()
+    {
+#if defined(__CUDA_ARCH__)
+        std::uint32_t mylaneid;
+        asm("mov.u32 %0, %%laneid;" : "=r"(mylaneid));
+        return mylaneid;
+#elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__)
+        return __lane_id();
+#else
+        return 0U;
+#endif
+    }
+
+    /** warp index within a multiprocessor
+     *
+     * Index of the warp within the multiprocessor at the moment of the query.
+     * The result is volatile and can differ with each query.
+     *
+     * @return current index of the warp
+     */
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto warpid(TAcc const& /*acc*/) -> uint32_t
+    {
+        return 0U;
+    }
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    // ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is
+    // specific to the device and cannot be compiled on the host. So, we need an explicit `__device__` here.
+    inline __device__ auto warpid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        std::uint32_t mywarpid = 0;
+        asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid));
+        return mywarpid;
+    }
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto warpid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        // get wave id
+        // https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024
+        return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4));
+    }
+#endif
+
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto smid(TAcc const& /*acc*/) -> uint32_t
+    {
+        return 0U;
+    }
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    inline __device__ auto smid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        std::uint32_t mysmid = 0;
+        asm("mov.u32 %0, %%smid;" : "=r"(mysmid));
+        return mysmid;
+    }
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        return __smid();
+    }
+#endif
+
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto lanemask_lt(TAcc const& /*acc*/)
+    {
+        return 0U;
+    }
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    inline __device__ uint32_t lanemask_lt(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/)
+    {
+        std::uint32_t lanemask;
+        asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask));
+        return lanemask;
+    }
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/)
+    {
+        return __lanemask_lt();
+    }
+#endif
+
+
+    /** the maximal number of threads per block, valid for sm_2.X - sm_7.5
+     *
+     * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+     */
+    constexpr uint32_t maxThreadsPerBlock = 1024U;
+
+    /** warp id within a CUDA block
+     *
+     * The id is constant over the lifetime of the thread.
+     * The id is not equal to warpid().
+     *
+     * @return warp id within the block
+     */
+    template<typename AlpakaAcc>
+    ALPAKA_FN_ACC inline auto warpid_withinblock(AlpakaAcc const& acc) -> std::uint32_t
+    {
+        auto const localId = alpaka::mapIdx<1>(
+            alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc),
+            alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc))[0];
+        return localId / warpSize<AlpakaAcc>;
+    }
+
+    template<typename T, typename U, typename = std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto ceilingDivision(T const numerator, U const denominator) -> T
+    {
+        return (numerator + (denominator - 1)) / denominator;
+    }
+
+    template<typename T_size>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC auto indexOf(
+        void const* const pointer,
+        void const* const start,
+        T_size const stepSize) -> std::make_signed_t<T_size>
+    {
+        return std::distance(reinterpret_cast<char const*>(start), reinterpret_cast<char const*>(pointer)) / stepSize;
+    }
+
+    template<typename TAcc, typename T>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC auto atomicLoad(TAcc const& acc, T& target)
+    {
+        return alpaka::atomicCas(acc, &target, static_cast<T>(0U), static_cast<T>(0U));
+    }
+} // namespace mallocMC
diff --git a/include/mallocMC/oOMPolicies/BadAllocException.hpp b/include/mallocMC/oOMPolicies/BadAllocException.hpp
new file mode 100644
index 0000000000..7d7dfcad3a
--- /dev/null
+++ b/include/mallocMC/oOMPolicies/BadAllocException.hpp
@@ -0,0 +1,78 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2014 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <cassert>
+#include <new>
+#include <string>
+
+namespace mallocMC
+{
+    namespace OOMPolicies
+    {
+        /**
+         * @brief Throws a std::bad_alloc exception on OutOfMemory
+         *
+         * This OOMPolicy will throw a std::bad_alloc exception if the
+         * accelerator supports it. Currently, Nvidia CUDA does not support any
+         * form of exception handling, therefore handleOOM() does not have any
+         * effect on these accelerators. Using this policy on other types of
+         * accelerators that do not support exceptions results in undefined
+         * behaviour.
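+         *
+         * A sketch of how this policy would be selected (the accelerator tag and the
+         * remaining policies are simply the defaults used elsewhere in this repository,
+         * shown here for illustration):
+         *
+         * @code
+         * using MyAllocator = mallocMC::Allocator<
+         *     alpaka::TagGpuCudaRt,
+         *     mallocMC::CreationPolicies::FlatterScatter<>,
+         *     mallocMC::DistributionPolicies::Noop,
+         *     mallocMC::OOMPolicies::BadAllocException,
+         *     mallocMC::ReservePoolPolicies::AlpakaBuf<alpaka::TagGpuCudaRt>,
+         *     mallocMC::AlignmentPolicies::Shrink<>>;
+         * @endcode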
+         */
+        struct BadAllocException
+        {
+            ALPAKA_FN_ACC
+            static auto handleOOM(void* mem) -> void*
+            {
+#if BOOST_LANG_CUDA || BOOST_COMP_HIP
+// #if __CUDA_ARCH__ < 350
+# define PM_EXCEPTIONS_NOT_SUPPORTED_HERE
+// #endif
+#endif
+
+#ifdef PM_EXCEPTIONS_NOT_SUPPORTED_HERE
+# undef PM_EXCEPTIONS_NOT_SUPPORTED_HERE
+                assert(false);
+#else
+                throw std::bad_alloc{};
+#endif
+                return mem;
+            }
+
+            static auto classname() -> std::string
+            {
+                return "BadAllocException";
+            }
+        };
+
+    } // namespace OOMPolicies
+} // namespace mallocMC
diff --git a/include/mallocMC/oOMPolicies/ReturnNull.hpp b/include/mallocMC/oOMPolicies/ReturnNull.hpp
new file mode 100644
index 0000000000..72b041db26
--- /dev/null
+++ b/include/mallocMC/oOMPolicies/ReturnNull.hpp
@@ -0,0 +1,61 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2014 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <string>
+
+namespace mallocMC
+{
+    namespace OOMPolicies
+    {
+        /**
+         * @brief Returns a null pointer on OutOfMemory conditions
+         *
+         * This OOMPolicy returns nullptr when handleOOM() is called.
+         */
+        class ReturnNull
+        {
+        public:
+            ALPAKA_FN_ACC
+            static auto handleOOM([[maybe_unused]] void* mem) -> void*
+            {
+                return nullptr;
+            }
+
+            static auto classname() -> std::string
+            {
+                return "ReturnNull";
+            }
+        };
+
+    } // namespace OOMPolicies
+} // namespace mallocMC
diff --git a/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp b/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp
new file mode 100644
index 0000000000..4426b3c19e
--- /dev/null
+++ b/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp
@@ -0,0 +1,65 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2020-2024 Helmholtz-Zentrum Dresden - Rossendorf,
+                      CERN
+
+  Author(s): Bernhard Manfred Gruber
+             Julian J. Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <memory>
+#include <string>
+
+namespace mallocMC
+{
+    namespace ReservePoolPolicies
+    {
+        template<typename T_AlpakaAccTag>
+        struct AlpakaBuf
+        {
+            template<typename AlpakaDev>
+            auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void*
+            {
+                poolBuffer = std::make_unique<PoolBufferType>(alpaka::allocBuf<unsigned char, size_t>(dev, memsize));
+                return alpaka::getPtrNative(*poolBuffer);
+            }
+
+            void resetMemPool()
+            {
+                poolBuffer = {};
+            }
+
+            static auto classname() -> std::string
+            {
+                return "AlpakaBuf";
+            }
+
+        private:
+            using PoolBufferType = alpaka::Buf<
+                alpaka::Dev<alpaka::TagToAcc<T_AlpakaAccTag, alpaka::DimInt<1>, size_t>>,
+                unsigned char,
+                alpaka::DimInt<1>,
+                size_t>;
+            std::unique_ptr<PoolBufferType> poolBuffer; // FIXME(bgruber): replace by std::optional<>
+        };
+    } // namespace ReservePoolPolicies
+} // namespace mallocMC
diff --git a/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
new file mode 100644
index 0000000000..b94daed826
--- /dev/null
+++ b/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
@@ -0,0 +1,85 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2014-2024 Institute of Radiation Physics,
+                      Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+# include <cuda_runtime_api.h>
+
+# include <cstddef>
+# include <string>
+
+namespace mallocMC
+{
+    namespace ReservePoolPolicies
+    {
+        /**
+         * @brief sets the CUDA-internal heap for device-side malloc calls
+         *
+         * This ReservePoolPolicy is intended for use with CUDA-capable
+         * accelerators that support at least compute capability 2.0. It should
+         * be used in conjunction with a CreationPolicy that actually requires
+         * the CUDA-internal heap to be sized by calls to cudaDeviceSetLimit().
+         *
+         * This policy sets the cudaLimitMallocHeapSize device limit. This value
+         * can no longer be changed once a kernel using ::malloc()/::free() has
+         * been run. Subsequent attempts will result in errors unless the device
+         * is reset via cudaDeviceReset(). See:
+         * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g05956f16eaa47ef3a4efee84563ccb7d
+         */
+        // TODO alpaka
+        struct CudaSetLimits
+        {
+            template<typename AlpakaDev>
+            auto setMemPool(AlpakaDev const& /*dev*/, size_t memsize) -> void*
+            {
+                cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize);
+                return nullptr;
+            }
+
+            static void resetMemPool()
+            {
+                cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U);
+                cudaGetLastError(); // cudaDeviceSetLimit() usually fails if any
+                                    // earlier kernel used ::malloc(), so let's
+                                    // clear the error state
+            }
+
+            static auto classname() -> std::string
+            {
+                return "CudaSetLimits";
+            }
+        };
+
+    } // namespace ReservePoolPolicies
+} // namespace mallocMC
+
+#endif
diff --git a/include/mallocMC/reservePoolPolicies/Noop.hpp b/include/mallocMC/reservePoolPolicies/Noop.hpp
new file mode 100644
index 0000000000..57bf826149
--- /dev/null
+++ b/include/mallocMC/reservePoolPolicies/Noop.hpp
@@ -0,0 +1,60 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2014-2024 Institute of Radiation Physics,
+                      Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+
+namespace mallocMC
+{
+    namespace ReservePoolPolicies
+    {
+        /**
+         * @brief Does exactly nothing.
+         *
+         * This is intended for use with prototypes that were originally designed
+         * to handle these aspects on their own. Currently needed for GallatinCuda.
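+         *
+         * Sketch of the intended pairing (GallatinCuda manages its own pool, so the
+         * reserve-pool step is a no-op; GallatinCuda's template arguments are omitted
+         * here for brevity, assuming it provides defaults):
+         *
+         * @code
+         * using MyAllocator = mallocMC::Allocator<
+         *     alpaka::TagGpuCudaRt,
+         *     mallocMC::CreationPolicies::GallatinCuda<>,
+         *     mallocMC::DistributionPolicies::Noop,
+         *     mallocMC::OOMPolicies::ReturnNull,
+         *     mallocMC::ReservePoolPolicies::Noop,
+         *     mallocMC::AlignmentPolicies::Shrink<>>;
+         * @endcode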
+         */
+        struct Noop
+        {
+            template<typename AlpakaDev>
+            auto setMemPool(AlpakaDev const& /*dev*/, size_t /*memsize*/) -> void*
+            {
+                return nullptr;
+            }
+
+            static void resetMemPool()
+            {
+            }
+
+            static auto classname() -> std::string
+            {
+                return "Noop";
+            }
+        };
+
+    } // namespace ReservePoolPolicies
+} // namespace mallocMC
diff --git a/include/mallocMC/version.hpp b/include/mallocMC/version.hpp
new file mode 100644
index 0000000000..89b9424047
--- /dev/null
+++ b/include/mallocMC/version.hpp
@@ -0,0 +1,48 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  http://www.icg.tugraz.at/project/mvp
+  https://www.hzdr.de/crp
+
+  Copyright (C) 2012 Institute for Computer Graphics and Vision,
+                     Graz University of Technology
+  Copyright (C) 2014-2024 Institute of Radiation Physics,
+                          Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
+             Bernhard Kainz - kainz ( at ) icg.tugraz.at
+             Michael Kenzel - kenzel ( at ) icg.tugraz.at
+             Rene Widera - r.widera ( at ) hzdr.de
+             Axel Huebl - a.huebl ( at ) hzdr.de
+             Carlchristian Eckert - c.eckert ( at ) hzdr.de
+             Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/ + +#pragma once + +/** the mallocMC version: major API changes should be reflected here */ +#define MALLOCMC_VERSION_MAJOR 3 +#define MALLOCMC_VERSION_MINOR 0 +#define MALLOCMC_VERSION_PATCH 0 + +/** the mallocMC flavor is used to differentiate the releases of the + * Computational Radiation Physics group (crp) from other releases + * This should be useful to avoid versioning conflicts */ +#define MALLOCMC_FLAVOR "crp" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000000..8e2d32d641 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.14...3.22) + +project(Examples LANGUAGES CXX) + +add_subdirectory( + ${CMAKE_CURRENT_LIST_DIR}/unit + ${CMAKE_BINARY_DIR}/test/unit +) + +add_subdirectory( + ${CMAKE_CURRENT_LIST_DIR}/multithreaded + ${CMAKE_BINARY_DIR}/test/multithreaded +) + +add_custom_target( + mallocMCTests + DEPENDS mallocMCUnitTests mallocMCMultithreadedTests + COMMENT "Shortcut for building all tests" +) diff --git a/test/multithreaded/CMakeLists.txt b/test/multithreaded/CMakeLists.txt new file mode 100644 index 0000000000..76dc15e516 --- /dev/null +++ b/test/multithreaded/CMakeLists.txt @@ -0,0 +1,72 @@ +cmake_minimum_required(VERSION 3.14...3.22) + +project(mallocMCMultithreadedTests LANGUAGES CXX) + +# ---- Options ---- + +option(mallocMC_ENABLE_TEST_COVERAGE "Enable test coverage" OFF) +option(mallocMC_TEST_INSTALLED_VERSION "Test the version found by find_package" OFF) + +# --- Import tools ---- + +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake) + +# ---- Dependencies ---- + +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake) +CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake) + +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/add_controlled.cmake) + +add_controlled("alpaka" REQUIRED PREFIX mallocMC) +add_controlled("Catch2" REQUIRED PREFIX mallocMC) + +if(NOT TARGET mallocMC) + if(mallocMC_TEST_INSTALLED_VERSION) + find_package(mallocMC REQUIRED) + else() + CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..) + endif() +endif() + +# ---- Create binary ---- + +file(GLOB sources CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/source/*.cpp) +alpaka_add_executable(${PROJECT_NAME} ${sources}) +target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka Catch2::Catch2WithMain) +set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20) + +# enable compiler warnings +if(NOT mallocMC_TEST_INSTALLED_VERSION) + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") + target_compile_options( + mallocMC + INTERFACE + -Wall + # nvcc generate C code which uses directives GCC complains about like this: + # warning: style of line directive is a GCC extension + # So we can't use -pedantic here. + $<$>:-Wpedantic> + -Wextra + # Somehow, with the commandline that CMake composes nvcc misinterprets the flag + # after -Werror as an argument to -Werror leading to errors like + # nvcc fatal : Value '-Wpedantic' is not defined for option 'Werror' + # So, we can't compile with -Werror for nvcc. 
+ $<$>:-Werror> + ) + elseif(MSVC) + target_compile_options(mallocMC INTERFACE /W4 /WX) + endif() +endif() + +# ---- Add mallocMCTests ---- + +enable_testing() +add_test(${PROJECT_NAME} ${PROJECT_NAME}) + +# ---- code coverage ---- + +if(mallocMC_ENABLE_TEST_COVERAGE) + target_compile_options(mallocMC INTERFACE -O0 -g -fprofile-arcs -ftest-coverage) + target_link_options(mallocMC INTERFACE -fprofile-arcs -ftest-coverage) +endif() diff --git a/test/multithreaded/source/AccessBlock.cpp b/test/multithreaded/source/AccessBlock.cpp new file mode 100644 index 0000000000..8acfa6bc30 --- /dev/null +++ b/test/multithreaded/source/AccessBlock.cpp @@ -0,0 +1,927 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include "mallocMC/mallocMC_utils.hpp" +#include "mocks.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock; + +using Dim = alpaka::DimInt<1>; +using Idx = std::uint32_t; + + +constexpr uint32_t pageSize = 1024; +constexpr uint32_t numPages = 4; +// Page table entry size = sizeof(chunkSize) + sizeof(fillingLevel): +constexpr uint32_t pteSize = 4 + 4; +constexpr uint32_t blockSize = numPages * (pageSize + pteSize); + +using MyAccessBlock = AccessBlock, AlignmentPolicy>; +using std::span; + +// Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the +// behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen +// regardless of the underlying access optimisations etc. 
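+// Note: create() is allowed to fail and hand back nullptr (e.g. while a slot is contested), which is why FillWith
+// below retries in a loop until every requested slot has actually been handed out.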
+ +struct FillWith +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + AccessBlock, AlignmentPolicy>* accessBlock, + uint32_t const chunkSize, + void** result, + uint32_t const size) const -> void + { + std::generate( + result, + result + size, + [&acc, accessBlock, chunkSize]() + { + void* pointer{nullptr}; + while(pointer == nullptr) + { + pointer = accessBlock->create(acc, chunkSize); + } + return pointer; + }); + } +}; + +struct ContentGenerator +{ + uint32_t counter{0U}; + + ALPAKA_FN_ACC auto operator()() -> uint32_t + { + return counter++; + } +}; + +ALPAKA_FN_ACC auto forAll(auto const& acc, auto size, auto functor) +{ + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < size) + { + functor(idx); + } + } +} + +struct Create +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSize); }); + } + + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto* chunkSizes) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSizes[idx]); }); + } +}; + +struct CreateUntilSuccess +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct Destroy +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers) const + { + forAll(acc, pointers.size(), [&](auto idx) { accessBlock->destroy(acc, pointers[idx]); }); + } +}; + +struct IsValid +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + auto* accessBlock, + void** pointers, + bool* results, + uint32_t const size) const + { + std::span tmpPointers(pointers, size); + std::span tmpResults(results, size); + std::transform( + std::begin(tmpPointers), + std::end(tmpPointers), + std::begin(tmpResults), + [&acc, accessBlock](auto pointer) { return accessBlock->isValid(acc, pointer); }); + } +}; + +using Host = alpaka::AccCpuSerial; + +template +struct Buffer +{ + TDevAcc m_devAcc; + TDevHost m_devHost; + + alpaka::Vec m_extents; + + alpaka::Buf m_onDevice; + alpaka::Buf m_onHost; + + Buffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) + : m_devAcc{devAcc} + , m_devHost{devHost} + , m_extents{extents} + , m_onDevice(alpaka::allocBuf(devAcc, m_extents)) + , m_onHost(alpaka::allocBuf(devHost, m_extents)) + { + } +}; + +template +auto makeBuffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) +{ + return Buffer{devHost, devAcc, extents}; +} + +auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) +{ + auto chunkSizes = makeBuffer(devHost, devAcc, 2U); + chunkSizes.m_onHost[0] = 32U; + chunkSizes.m_onHost[1] = 512U; + alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); + return chunkSizes; +} + +auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) +{ + auto pointers = makeBuffer(devHost, devAcc, size); + std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); + alpaka::memcpy(queue, 
pointers.m_onDevice, pointers.m_onHost); + return pointers; +} + +template +auto setup() +{ + alpaka::Platform const platformAcc = {}; + alpaka::Platform> const platformHost = {}; + alpaka::Dev> const devAcc(alpaka::getDevByIdx(platformAcc, 0)); + alpaka::Dev> const devHost(alpaka::getDevByIdx(platformHost, 0)); + alpaka::Queue queue{devAcc}; + return std::make_tuple(platformAcc, platformHost, devAcc, devHost, queue); +} + +template +auto createWorkDiv(auto const& devAcc, auto const numElements, auto... args) -> alpaka::WorkDivMembers +{ + if constexpr(std::is_same_v, alpaka::TagCpuSerial>) + { + return {{1U}, {1U}, {numElements}}; + } + else + { + alpaka::KernelCfg const kernelCfg + = {numElements, 1, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + return alpaka::getValidWorkDiv(kernelCfg, devAcc, args...); + } +} + +template +auto fillWith(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) +{ + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::exec( + queue, + workDivSingleThread, + FillWith{}, + accessBlock, + chunkSize, + alpaka::getPtrNative(pointers.m_onDevice), + pointers.m_extents[0]); + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); +} + +template +auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) +{ + fillWith(queue, accessBlock, chunkSize, pointers); + auto* pointer1 = pointers.m_onHost[0]; + + // Destroy exactly one pointer (i.e. the first). This is non-destructive on the actual values in + // devPointers, so we don't need to wait for the copy before to finish. + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::exec( + queue, + workDivSingleThread, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 1U)); + alpaka::wait(queue); + return pointer1; +} + +template +auto freeAllButOneOnFirstPage( + auto& queue, + AccessBlock, AlignmentPolicy>* accessBlock, + auto& pointers) +{ + std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::sort(std::begin(tmp), std::end(tmp)); + // This points to the first chunk of page 0. + auto* pointer1 = tmp[0]; + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); + alpaka::wait(queue); + auto size + = pointers.m_extents[0] / AccessBlock, AlignmentPolicy>::numPages() - 1; + // Delete all other chunks on page 0. 
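+    // pointers.m_extents[0] is one pointer per slot in the whole access block, so dividing by numPages() gives the
+    // number of slots on a single page; `size` is that minus the one chunk we keep alive.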
+ customExec( + queue, + pointers.m_devAcc, + size, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice) + 1U, size)); + alpaka::wait(queue); + return pointer1; +} + +struct CheckContent +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* content, span pointers, auto* results, auto chunkSize) + const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + results[idx] = std::all_of(begin, end, [idx, content](auto val) { return val == content[idx]; }); + } + } + } +}; + +template +auto checkContent( + auto& devHost, + auto& devAcc, + auto& queue, + auto& pointers, + auto& content, + auto& workDiv, + auto const chunkSize) +{ + auto results = makeBuffer(devHost, devAcc, pointers.m_extents[0]); + alpaka::exec( + queue, + workDiv, + CheckContent{}, + alpaka::getPtrNative(content.m_onDevice), + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + alpaka::getPtrNative(results.m_onDevice), + chunkSize); + alpaka::wait(queue); + alpaka::memcpy(queue, results.m_onHost, results.m_onDevice); + alpaka::wait(queue); + + + std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); + auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); + + return writtenCorrectly; +} + +struct GetAvailableSlots +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, auto chunkSize, auto* result) const + { + *result = accessBlock->getAvailableSlots(acc, chunkSize); + }; +}; + +template +auto getAvailableSlots(auto* accessBlock, auto& queue, auto const& devHost, auto const& devAcc, auto chunkSize) +{ + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::wait(queue); + auto result = makeBuffer(devHost, devAcc, 1U); + alpaka::wait(queue); + alpaka::exec( + queue, + workDivSingleThread, + GetAvailableSlots{}, + accessBlock, + chunkSize, + alpaka::getPtrNative(result.m_onDevice)); + alpaka::wait(queue); + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + auto tmp = result.m_onHost[0]; + alpaka::wait(queue); + return tmp; +} + +template +auto pageIndex(AccessBlock, AlignmentPolicy>* accessBlock, auto* pointer) +{ + // This is a bit dirty: What we should do here is enqueue a kernel that calls accessBlock->pageIndex(). + // But we assume that the access block starts with the first page, so the pointer to the first page equals the + // pointer to the access block. Not sure if this is reliable if the pointers are device pointers. 
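+    // indexOf computes (pointer - start) / stepSize, so with stepSize = T_pageSize the result is the number of whole
+    // pages between the start of the access block and the given pointer, i.e. the page index.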
+ return mallocMC::indexOf(pointer, accessBlock, T_pageSize); +} + +struct FillAllUpAndWriteToThem +{ + ALPAKA_FN_ACC auto operator()( + auto const& acc, + auto* accessBlock, + auto* content, + span pointers, + auto chunkSize) const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + std::fill(begin, end, content[idx]); + } + } + } +}; + +struct CreateAndDestroMultipleTimes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + } + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct OversubscribedCreation +{ + uint32_t oversubscriptionFactor{}; + uint32_t availableSlots{}; + + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx + 1; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + + // CAUTION: The following lines have cost us more than a working day of debugging! + // If the hardware you're running on has a single program counter for the whole warp, the whole + // warp can't exit the while loop in case of even a single thread requesting another round. + // This implies that if we move the `.destroy()` out of the while loop, all the slots get + // filled up but the owning threads run idle instead of freeing them up again because they are + // waiting for their last companions to give their okay for exiting the loop. This is, of + // course, a hopeless endeavour because all slots are filled (we are vastly oversubscribed in + // this scenario). So, this loop deadlocks and no thread ever exits. + // + // ... at least that's what we believe. If you're reading this comment, we might have been + // wrong about this. + if(pointers[idx] != nullptr) + { + accessBlock->destroy(acc, pointers[idx]); + } + } + pointers[idx] = nullptr; + } + + // We only keep some of the memory. In particular, we keep one chunk less than is available, + // such that threads looking for memory after we've finished can still find some. 
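+                    // Only the threads with the highest indices pass the index check below, so the number of
+                    // allocations surviving this kernel stays safely below the number of available slots.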
+ while(pointers[idx] == nullptr and idx > (oversubscriptionFactor - 1) * availableSlots + 1) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct CreateAllChunkSizes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, span chunkSizes) + const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = accessBlock->create(acc, 1U); + + for(auto chunkSize : chunkSizes) + { + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + } + }); + } +}; + +template +auto customExec(auto& queue, auto const& devAcc, auto const numElements, auto... args) +{ + auto workDiv = createWorkDiv(devAcc, numElements, args...); + alpaka::exec(queue, workDiv, args...); + return workDiv; +} + +TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) +{ + using Acc = alpaka::TagToAcc; + auto [platformAcc, platformHost, devAcc, devHost, queue] = setup(); + auto accessBlockBuf = alpaka::allocBuf(devAcc, alpaka::Vec{1U}); + alpaka::memset(queue, accessBlockBuf, 0x00); + alpaka::wait(queue); + auto* accessBlock = alpaka::getPtrNative(accessBlockBuf); + auto const chunkSizes = createChunkSizes(devHost, devAcc, queue); + auto pointers = createPointers( + devHost, + devAcc, + queue, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + alpaka::wait(queue); + + SECTION("creates second memory somewhere else.") + { + uint32_t const size = 2U; + customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); + } + + SECTION("creates memory of different chunk size in different pages.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(accessBlock, pointers.m_onHost[0]) != pageIndex(accessBlock, pointers.m_onHost[1])); + } + + SECTION("creates partly for insufficient memory with same chunk size.") + { + uint32_t const size = 2U; + auto* lastFreeChunk = fillAllButOne(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + + // Okay, so here we start the actual test. The situation is the following: + // There is a single chunk available. + // We try to do two allocations. + // So, we expect one to succeed and one to fail. 
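+        // Both threads race for that single remaining chunk; exactly one create() can win it.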
+ customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK( + ((pointers.m_onHost[0] == lastFreeChunk and pointers.m_onHost[1] == nullptr) + or (pointers.m_onHost[1] == lastFreeChunk and pointers.m_onHost[0] == nullptr))); + } + + SECTION("does not race between clean up and create.") + { + fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + auto freePage = pageIndex(accessBlock, freeAllButOneOnFirstPage(queue, accessBlock, pointers)); + + // Now, pointer1 is the last valid pointer to page 0. Destroying it will clean up the page. + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + + alpaka::exec( + queue, + workDivSingleThread, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); + + alpaka::exec( + queue, + workDivSingleThread, + CreateUntilSuccess{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 1U), + chunkSizes.m_onHost[0]); + + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(accessBlock, pointers.m_onHost[0]) == freePage); + } + + SECTION("destroys two pointers of different size.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + customExec( + queue, + devAcc, + 2U, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto result = makeBuffer(devHost, devAcc, 2U); + customExec( + queue, + devAcc, + 1U, + IsValid{}, + accessBlock, + alpaka::getPtrNative(pointers.m_onDevice), + alpaka::getPtrNative(result.m_onDevice), + result.m_extents[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + + CHECK(not result.m_onHost[0]); + CHECK(not result.m_onHost[1]); + } + + SECTION("destroys two pointers of same size.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + customExec( + queue, + devAcc, + 2U, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto result = makeBuffer(devHost, devAcc, 2U); + result.m_onHost[0] = true; + result.m_onHost[1] = true; + alpaka::memcpy(queue, result.m_onDevice, result.m_onHost); + alpaka::wait(queue); + customExec( + queue, + devAcc, + 1U, + IsValid{}, + accessBlock, + alpaka::getPtrNative(pointers.m_onDevice), + alpaka::getPtrNative(result.m_onDevice), + result.m_extents[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + + CHECK(not result.m_onHost[0]); + CHECK(not result.m_onHost[1]); + } + + SECTION("fills up all chunks in parallel and writes to them.") + { + auto content = makeBuffer( + devHost, + devAcc, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); + std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); + alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); + alpaka::wait(queue); + + auto workDiv = 
customExec( + queue, + devAcc, + pointers.m_extents[0], + FillAllUpAndWriteToThem{}, + accessBlock, + alpaka::getPtrNative(content.m_onDevice), + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + chunkSizes.m_onHost[0]); + + alpaka::wait(queue); + + auto writtenCorrectly + = checkContent(devHost, devAcc, queue, pointers, content, workDiv, chunkSizes.m_onHost[0]); + CHECK(writtenCorrectly); + } + + SECTION("destroys all pointers simultaneously.") + { + auto const allSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + auto const allSlotsOfDifferentSize + = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); + fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + + customExec( + queue, + devAcc, + pointers.m_extents[0], + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + auto result = makeBuffer(devHost, devAcc, pointers.m_extents[0]); + customExec( + queue, + devAcc, + 1U, + IsValid{}, + accessBlock, + alpaka::getPtrNative(pointers.m_onDevice), + alpaka::getPtrNative(result.m_onDevice), + result.m_extents[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + + std::span tmpResults(alpaka::getPtrNative(result.m_onHost), result.m_extents[0]); + CHECK(std::none_of(std::cbegin(tmpResults), std::cend(tmpResults), [](auto const val) { return val; })); + + CHECK(getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots); + CHECK( + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]) + == allSlotsOfDifferentSize); + } + + SECTION("creates and destroys multiple times.") + { + customExec( + queue, + devAcc, + pointers.m_extents[0], + CreateAndDestroMultipleTimes{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::sort(std::begin(tmpPointers), std::end(tmpPointers)); + CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); + } + + SECTION("can handle oversubscription.") + { + uint32_t oversubscriptionFactor = 2U; + auto availableSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + + // This is oversubscribed but we will only hold keep less than 1/oversubscriptionFactor of the memory in the + // end. + auto manyPointers = makeBuffer(devHost, devAcc, oversubscriptionFactor * availableSlots); + customExec( + queue, + devAcc, + manyPointers.m_extents[0], + OversubscribedCreation{oversubscriptionFactor, availableSlots}, + accessBlock, + span(alpaka::getPtrNative(manyPointers.m_onDevice), manyPointers.m_extents[0]), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, manyPointers.m_onHost, manyPointers.m_onDevice); + alpaka::wait(queue); + + // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a + // nullptr. 
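+        // beginNonNull below marks that boundary: every entry before it must still be nullptr, every entry after it
+        // must be a unique, successfully created pointer.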
+ std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); + auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1; + + CHECK(std::all_of( + std::begin(tmpManyPointers), + beginNonNull, + [](auto const pointer) { return pointer == nullptr; })); + + std::sort(beginNonNull, std::end(tmpManyPointers)); + CHECK(std::unique(beginNonNull, std::end(tmpManyPointers)) == std::end(tmpManyPointers)); + } + + SECTION("can handle many different chunk sizes.") + { + auto chunkSizes = makeBuffer(devHost, devAcc, pageSize); + std::span chunkSizesSpan(alpaka::getPtrNative(chunkSizes.m_onHost), chunkSizes.m_extents[0]); + std::iota(std::begin(chunkSizesSpan), std::end(chunkSizesSpan), 1U); + alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); + alpaka::wait(queue); + + customExec( + queue, + devAcc, + MyAccessBlock::numPages(), + CreateAllChunkSizes{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), MyAccessBlock::numPages()), + std::span(alpaka::getPtrNative(chunkSizes.m_onDevice), chunkSizes.m_extents[0])); + + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), MyAccessBlock::numPages()); + std::sort(std::begin(tmpPointers), std::end(tmpPointers)); + CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); + } + + SECTION("creates second memory somewhere in multi-page mode.") + { + uint32_t const size = 2U; + customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + pageSize); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); + } + + alpaka::wait(queue); +} diff --git a/test/multithreaded/source/BitField.cpp b/test/multithreaded/source/BitField.cpp new file mode 100644 index 0000000000..3d65a047e1 --- /dev/null +++ b/test/multithreaded/source/BitField.cpp @@ -0,0 +1,92 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include "mocks.hpp" + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMask; +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskSize; +using namespace std::chrono_literals; + +// The following test is a particular regression test which (in its current form) requires to be able to stop a +// thread from the outside. This is not possible through the alpaka interface. Thus, we resort to running this with +// `std::jthread` but we have to ensure that the alpaka atomics work. Thus, the ifdef. +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + +TEST_CASE("Threaded BitMask") +{ + BitMask mask{}; + + SECTION("finds first free bit despite noise.") + { + // This is a regression test. An earlier version of this algorithm used to fail when other parts of the bit + // mask experienced frequent change during the search. We simulate this by letting a "noise thread" toggle + // unimportant bits while a "search thread" tries to find the first free bit. While the noise does not affect + // the result, a previous version of the algorithm does fail under these conditions (as verified by + // experiment). + + uint32_t const firstFreeIndex = GENERATE(0U, 1U, 10U); + for(uint32_t i = 0; i < firstFreeIndex; ++i) + { + mask.set(accSerial, i); + } + + uint32_t result = BitMaskSize; + auto noiseThread = std::jthread( + [&mask, firstFreeIndex](std::stop_token const& stopToken) + { + while(not stopToken.stop_requested()) + { + for(uint32_t i = firstFreeIndex + 1; i < BitMaskSize; ++i) + { + mask.flip(accSerial, i); + } + } + }); + std::thread([&mask, &result]() { result = mask.firstFreeBit(accSerial); }).join(); + std::this_thread::sleep_for(20ms); + CHECK(result == firstFreeIndex); + noiseThread.request_stop(); + } +} +#else +TEST_CASE("Threaded BitMask", "[!shouldfail]") +{ + FAIL("The Threaded BitMask regression test could not run because it is only available with the std::threads " + "backend enabled."); +} +#endif // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED diff --git a/test/multithreaded/source/Scatter.cpp b/test/multithreaded/source/Scatter.cpp new file mode 100644 index 0000000000..2328b1cfcd --- /dev/null +++ b/test/multithreaded/source/Scatter.cpp @@ -0,0 +1,859 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + + +#include "mallocMC/creationPolicies/Scatter.hpp" + +#include "mallocMC/alignmentPolicies/Shrink.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" +#include "mallocMC/device_allocator.hpp" +#include "mallocMC/distributionPolicies/Noop.hpp" +#include "mallocMC/mallocMC_utils.hpp" +#include "mallocMC/oOMPolicies/ReturnNull.hpp" +#include "mocks.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Dim = alpaka::DimInt<1>; +using Idx = std::uint32_t; + + +constexpr uint32_t pageSize = 1024; +constexpr uint32_t numPages = 4; +// Page table entry size = sizeof(chunkSize) + sizeof(fillingLevel): +constexpr uint32_t pteSize = 8 + 4 + 4; +constexpr uint32_t blockSize = numPages * (pageSize + pteSize); + +template +struct ScatterHeapConfig +{ + static constexpr uint32_t const accessblocksize = T_blockSize; + static constexpr uint32_t const pagesize = T_pageSize; + static constexpr uint32_t const wastefactor = T_wasteFactor; + static constexpr uint32_t const regionsize = 1U; + static constexpr bool const resetfreedpages = true; +}; + +using MyScatter = mallocMC::CreationPolicies::Scatter< + ScatterHeapConfig>::AlignmentAwarePolicy>; +using MyDeviceAllocator = mallocMC::DeviceAllocator< + MyScatter, + mallocMC::DistributionPolicies::Noop, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::AlignmentPolicies::Shrink<>>; + +using std::span; + +// Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the +// behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen +// regardless of the underlying access optimisations etc. 
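+// (A word on mechanics: the functor below is launched in a single-threaded kernel and its while loop retries
+// create() until it returns non-null, so after completion all `size` entries of `result` point to occupied chunks.)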
+ +struct FillWith +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + auto* accessBlock, + uint32_t const chunkSize, + void** result, + uint32_t const size) const -> void + { + std::generate( + result, + result + size, + [&acc, accessBlock, chunkSize]() + { + void* pointer{nullptr}; + while(pointer == nullptr) + { + pointer = accessBlock->create(acc, chunkSize); + } + return pointer; + }); + } +}; + +struct ContentGenerator +{ + uint32_t counter{0U}; + + ALPAKA_FN_ACC auto operator()() -> uint32_t + { + return counter++; + } +}; + +ALPAKA_FN_ACC auto forAll(auto const& acc, auto size, auto functor) +{ + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < size) + { + functor(idx); + } + } +} + +struct Create +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSize); }); + } + + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto* chunkSizes) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSizes[idx]); }); + } +}; + +struct CreateUntilSuccess +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct Destroy +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers) const + { + forAll(acc, pointers.size(), [&](auto idx) { accessBlock->destroy(acc, pointers[idx]); }); + } +}; + +using Host = alpaka::AccCpuSerial; + +template +struct Buffer +{ + TDevAcc m_devAcc; + TDevHost m_devHost; + + alpaka::Vec m_extents; + + alpaka::Buf m_onDevice; + alpaka::Buf m_onHost; + + Buffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) + : m_devAcc{devAcc} + , m_devHost{devHost} + , m_extents{extents} + , m_onDevice(alpaka::allocBuf(devAcc, m_extents)) + , m_onHost(alpaka::allocBuf(devHost, m_extents)) + { + } +}; + +template +auto makeBuffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) +{ + return Buffer{devHost, devAcc, extents}; +} + +auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) +{ + auto chunkSizes = makeBuffer(devHost, devAcc, 2U); + chunkSizes.m_onHost[0] = 32U; + chunkSizes.m_onHost[1] = 512U; + alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); + return chunkSizes; +} + +auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) +{ + auto pointers = makeBuffer(devHost, devAcc, size); + std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); + alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); + return pointers; +} + +template +auto setup() +{ + alpaka::Platform const platformAcc = {}; + alpaka::Platform> const platformHost = {}; + alpaka::Dev> const devAcc(alpaka::getDevByIdx(platformAcc, 0)); + alpaka::Dev> const devHost(alpaka::getDevByIdx(platformHost, 0)); + alpaka::Queue queue{devAcc}; + return std::make_tuple(platformAcc, platformHost, devAcc, devHost, queue); +} + +template +auto createWorkDiv(auto const& 
devAcc, auto const numElements, auto... args) -> alpaka::WorkDivMembers
+{
+    if constexpr(std::is_same_v, alpaka::TagCpuSerial>)
+    {
+        return {{1U}, {1U}, {numElements}};
+    }
+    else
+    {
+        alpaka::KernelCfg const kernelCfg
+            = {numElements, 1, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+        return alpaka::getValidWorkDiv(kernelCfg, devAcc, args...);
+    }
+}
+
+template
+auto fillWith(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers)
+{
+    alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}};
+    alpaka::exec(
+        queue,
+        workDivSingleThread,
+        FillWith{},
+        accessBlock,
+        chunkSize,
+        alpaka::getPtrNative(pointers.m_onDevice),
+        pointers.m_extents[0]);
+    alpaka::wait(queue);
+    alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
+    alpaka::wait(queue);
+}
+
+template
+auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers)
+{
+    fillWith(queue, accessBlock, chunkSize, pointers);
+    auto* pointer1 = pointers.m_onHost[0];
+
+    // Destroy exactly one pointer (i.e. the first). This is non-destructive on the actual values in
+    // devPointers, so we don't need to wait for the preceding copy to finish.
+    alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}};
+    alpaka::exec(
+        queue,
+        workDivSingleThread,
+        Destroy{},
+        accessBlock,
+        span(alpaka::getPtrNative(pointers.m_onDevice), 1U));
+    alpaka::wait(queue);
+    return pointer1;
+}
+
+template
+auto freeAllButOneOnFirstPage(auto& queue, auto* accessBlock, auto& pointers)
+{
+    std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+    std::sort(std::begin(tmp), std::end(tmp));
+    // This points to the first chunk of page 0.
+    auto* pointer1 = tmp[0];
+    alpaka::wait(queue);
+    alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost);
+    alpaka::wait(queue);
+    auto size = pointers.m_extents[0] / numPages - 1;
+    // Delete all other chunks on page 0.
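+    // (With a uniform chunk size, every page holds m_extents[0] / numPages allocations; after the sort above they
+    // are contiguous, so the device span below skips tmp[0] and destroys exactly the rest of page 0.)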
+ customExec( + queue, + pointers.m_devAcc, + size, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice) + 1U, size)); + alpaka::wait(queue); + return pointer1; +} + +struct CheckContent +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* content, span pointers, auto* results, auto chunkSize) + const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + results[idx] = std::all_of(begin, end, [idx, content](auto val) { return val == content[idx]; }); + } + } + } +}; + +template +auto checkContent( + auto& devHost, + auto& devAcc, + auto& queue, + auto& pointers, + auto& content, + auto& workDiv, + auto const chunkSize) +{ + auto results = makeBuffer(devHost, devAcc, pointers.m_extents[0]); + alpaka::exec( + queue, + workDiv, + CheckContent{}, + alpaka::getPtrNative(content.m_onDevice), + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + alpaka::getPtrNative(results.m_onDevice), + chunkSize); + alpaka::wait(queue); + alpaka::memcpy(queue, results.m_onHost, results.m_onDevice); + alpaka::wait(queue); + + + std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); + auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); + + return writtenCorrectly; +} + +struct GetAvailableSlots +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, auto chunkSize, auto* result) const + { + *result = accessBlock->getAvailableSlots(acc, chunkSize); + }; +}; + +template +auto getAvailableSlots(auto* accessBlock, auto& queue, auto const& devHost, auto const& devAcc, auto chunkSize) +{ + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::wait(queue); + auto result = makeBuffer(devHost, devAcc, 1U); + alpaka::wait(queue); + alpaka::exec( + queue, + workDivSingleThread, + GetAvailableSlots{}, + accessBlock, + chunkSize, + alpaka::getPtrNative(result.m_onDevice)); + alpaka::wait(queue); + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + auto tmp = result.m_onHost[0]; + alpaka::wait(queue); + return tmp; +} + +auto pageIndex(auto accessBlock, auto* pointer) +{ + // This is a bit dirty: What we should do here is enqueue a kernel that calls accessBlock->pageIndex(). + // But we assume that the access block starts with the first page, so the pointer to the first page equals the + // pointer to the access block. Not sure if this is reliable if the pointers are device pointers. 
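+    // (mallocMC::indexOf effectively computes (pointer - start) / pageSize with char-pointer arithmetic, so under
+    // this assumption the result is the zero-based index of the page the pointer lives on.)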
+ return mallocMC::indexOf(pointer, alpaka::getPtrNative(accessBlock), pageSize); +} + +struct FillAllUpAndWriteToThem +{ + ALPAKA_FN_ACC auto operator()( + auto const& acc, + auto* accessBlock, + auto* content, + span pointers, + auto chunkSize) const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + std::fill(begin, end, content[idx]); + } + } + } +}; + +struct CreateAndDestroMultipleTimes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + } + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct OversubscribedCreation +{ + uint32_t oversubscriptionFactor{}; + uint32_t availableSlots{}; + + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx + 1; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + + // CAUTION: The following lines have cost us more than a working day of debugging! + // If the hardware you're running on has a single program counter for the whole warp, the whole + // warp can't exit the while loop in case of even a single thread requesting another round. + // This implies that if we move the `.destroy()` out of the while loop, all the slots get + // filled up but the owning threads run idle instead of freeing them up again because they are + // waiting for their last companions to give their okay for exiting the loop. This is, of + // course, a hopeless endeavour because all slots are filled (we are vastly oversubscribed in + // this scenario). So, this loop deadlocks and no thread ever exits. + // + // ... at least that's what we believe. If you're reading this comment, we might have been + // wrong about this. + if(pointers[idx] != nullptr) + { + accessBlock->destroy(acc, pointers[idx]); + } + } + pointers[idx] = nullptr; + } + + // We only keep some of the memory. In particular, we keep one chunk less than is available, + // such that threads looking for memory after we've finished can still find some. 
+ while(pointers[idx] == nullptr and idx > (oversubscriptionFactor - 1) * availableSlots + 1) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct CreateAllChunkSizes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, span chunkSizes) + const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = accessBlock->create(acc, 1U); + + for(auto chunkSize : chunkSizes) + { + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + } + }); + } +}; + +template +auto customExec(auto& queue, auto const& devAcc, auto const numElements, auto... args) +{ + auto workDiv = createWorkDiv(devAcc, numElements, args...); + alpaka::exec(queue, workDiv, args...); + return workDiv; +} + +TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags) +{ + using Acc = alpaka::TagToAcc; + auto [platformAcc, platformHost, devAcc, devHost, queue] = setup(); + auto accessBlockBuf = alpaka::allocBuf(devAcc, alpaka::Vec{1U}); + auto dataBuf = alpaka::allocBuf, Idx>( + devAcc, + alpaka::Vec{1U}); + MyScatter::initHeap( + devAcc, + queue, + alpaka::getPtrNative(accessBlockBuf), + static_cast(alpaka::getPtrNative(dataBuf)), + blockSize); + alpaka::wait(queue); + auto* accessBlock = alpaka::getPtrNative(accessBlockBuf); + auto const chunkSizes = createChunkSizes(devHost, devAcc, queue); + auto pointers = createPointers( + devHost, + devAcc, + queue, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + alpaka::wait(queue); + + SECTION("creates second memory somewhere else.") + { + uint32_t const size = 2U; + customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); + } + + SECTION("creates memory of different chunk size in different pages.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(dataBuf, pointers.m_onHost[0]) != pageIndex(dataBuf, pointers.m_onHost[1])); + } + + SECTION("creates partly for insufficient memory with same chunk size.") + { + uint32_t const size = 2U; + auto* lastFreeChunk = fillAllButOne(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + + // Okay, so here we start the actual test. The situation is the following: + // There is a single chunk available. + // We try to do two allocations. + // So, we expect one to succeed and one to fail. 
+ customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK( + ((pointers.m_onHost[0] == lastFreeChunk and pointers.m_onHost[1] == nullptr) + or (pointers.m_onHost[1] == lastFreeChunk and pointers.m_onHost[0] == nullptr))); + } + + SECTION("does not race between clean up and create.") + { + fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + auto freePage = pageIndex(dataBuf, freeAllButOneOnFirstPage(queue, accessBlock, pointers)); + + // Now, pointer1 is the last valid pointer to page 0. Destroying it will clean up the page. + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + + alpaka::exec( + queue, + workDivSingleThread, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); + + alpaka::exec( + queue, + workDivSingleThread, + CreateUntilSuccess{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 1U), + chunkSizes.m_onHost[0]); + + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(dataBuf, pointers.m_onHost[0]) == freePage); + } + + SECTION("destroys two pointers of different size.") + { + auto workDiv = customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + auto const beforeDestroy0 + = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + auto const beforeDestroy1 + = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); + + alpaka::exec( + queue, + workDiv, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto const afterDestroy0 = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + auto const afterDestroy1 = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); + + CHECK(beforeDestroy0 < afterDestroy0); + CHECK(beforeDestroy1 < afterDestroy1); + } + + SECTION("destroys two pointers of same size.") + { + auto workDiv = customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + auto const beforeDestroy = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + + alpaka::exec( + queue, + workDiv, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto const afterDestroy = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + CHECK(beforeDestroy == afterDestroy - 2U); + } + + SECTION("fills up all chunks in parallel and writes to them.") + { + auto content = makeBuffer( + devHost, + devAcc, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); + std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); + alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); + alpaka::wait(queue); + + auto workDiv = customExec( + queue, + devAcc, + pointers.m_extents[0], + FillAllUpAndWriteToThem{}, + accessBlock, + alpaka::getPtrNative(content.m_onDevice), + 
span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]),
+            chunkSizes.m_onHost[0]);
+
+        alpaka::wait(queue);
+
+        auto writtenCorrectly
+            = checkContent(devHost, devAcc, queue, pointers, content, workDiv, chunkSizes.m_onHost[0]);
+        CHECK(writtenCorrectly);
+    }
+
+    SECTION("destroys all pointers simultaneously.")
+    {
+        auto const allSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]);
+        auto const allSlotsOfDifferentSize
+            = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]);
+        fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers);
+
+        customExec(
+            queue,
+            devAcc,
+            pointers.m_extents[0],
+            Destroy{},
+            accessBlock,
+            span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]));
+        alpaka::wait(queue);
+
+        alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
+        alpaka::wait(queue);
+
+        CHECK(getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots);
+        CHECK(
+            getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1])
+            == allSlotsOfDifferentSize);
+    }
+
+    SECTION("creates and destroys multiple times.")
+    {
+        customExec(
+            queue,
+            devAcc,
+            pointers.m_extents[0],
+            CreateAndDestroMultipleTimes{},
+            accessBlock,
+            span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]),
+            chunkSizes.m_onHost[0]);
+        alpaka::wait(queue);
+        alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
+        alpaka::wait(queue);
+
+        std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+        std::sort(std::begin(tmpPointers), std::end(tmpPointers));
+        CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers));
+    }
+
+    SECTION("can handle oversubscription.")
+    {
+        uint32_t oversubscriptionFactor = 2U;
+        auto availableSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]);
+
+        // This is oversubscribed but we will only keep less than 1/oversubscriptionFactor of the memory in the
+        // end.
+        auto manyPointers = makeBuffer(devHost, devAcc, oversubscriptionFactor * availableSlots);
+        customExec(
+            queue,
+            devAcc,
+            manyPointers.m_extents[0],
+            OversubscribedCreation{oversubscriptionFactor, availableSlots},
+            accessBlock,
+            span(alpaka::getPtrNative(manyPointers.m_onDevice), manyPointers.m_extents[0]),
+            chunkSizes.m_onHost[0]);
+        alpaka::wait(queue);
+
+        alpaka::memcpy(queue, manyPointers.m_onHost, manyPointers.m_onDevice);
+        alpaka::wait(queue);
+
+        // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a
+        // nullptr.
+        std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]);
+        auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1;
+
+        CHECK(std::all_of(
+            std::begin(tmpManyPointers),
+            beginNonNull,
+            [](auto const pointer) { return pointer == nullptr; }));
+
+        std::sort(beginNonNull, std::end(tmpManyPointers));
+        CHECK(std::unique(beginNonNull, std::end(tmpManyPointers)) == std::end(tmpManyPointers));
+    }
+
+    SECTION("creates second memory somewhere in multi-page mode.")
+    {
+        uint32_t const size = 2U;
+        customExec(
+            queue,
+            devAcc,
+            size,
+            Create{},
+            accessBlock,
+            span(alpaka::getPtrNative(pointers.m_onDevice), size),
+            pageSize);
+        alpaka::wait(queue);
+
+        alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
+        alpaka::wait(queue);
+
+        CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]);
+    }
+
+    alpaka::wait(queue);
+}
diff --git a/test/multithreaded/source/mocks.hpp b/test/multithreaded/source/mocks.hpp
new file mode 100644
index 0000000000..b1764d1302
--- /dev/null
+++ b/test/multithreaded/source/mocks.hpp
@@ -0,0 +1,76 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Julian Johannes Lenz
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// This is very hacky: AccCpuSerial (and in general all accelerators) are very reluctant to be instantiated, so we do
+// it the old-school way and simply malloc some memory pretending to be that accelerator. Let's hope that null-ing it
+// is a valid initialisation. The final class only has one mutable data member, so that's probably not half bad but I
+// didn't go through all those hundreds of base classes. Usually, we only need the time anyway.
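+// (This zeroed dummy is an assumption, not a guarantee; should it ever break, the fallback would be to run these
+// helpers inside a real single-threaded alpaka kernel, where a genuine `acc` instance is available.)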
+inline auto constructAcc()
+{
+    using Acc = alpaka::AccCpuSerial<alpaka::DimInt<1>, size_t>;
+    void* myPointer = malloc(sizeof(Acc));
+    memset(myPointer, 0U, sizeof(Acc));
+    return static_cast<Acc*>(myPointer);
+}
+
+//
+static inline auto const accPointer = constructAcc();
+static inline auto const& accSerial = *accPointer;
+
+template<uint32_t T_blockSize, uint32_t T_pageSize, uint32_t T_wasteFactor = 1U, bool T_resetfreedpages = true>
+struct HeapConfig
+{
+    static constexpr auto const accessblocksize = T_blockSize;
+    static constexpr auto const pagesize = T_pageSize;
+    static constexpr auto const wastefactor = T_wasteFactor;
+    static constexpr auto const resetfreedpages = T_resetfreedpages;
+
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange(
+        auto const& /*acc*/,
+        uint32_t const chunkSize,
+        uint32_t const numBytes)
+    {
+        return (chunkSize >= numBytes && chunkSize <= T_wasteFactor * numBytes);
+    }
+};
+
+struct AlignmentPolicy
+{
+    struct Properties
+    {
+        static constexpr uint32_t const dataAlignment = 1U;
+    };
+};
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
new file mode 100644
index 0000000000..8f8345eb22
--- /dev/null
+++ b/test/unit/CMakeLists.txt
@@ -0,0 +1,72 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+project(mallocMCUnitTests LANGUAGES CXX)
+
+# ---- Options ----
+
+option(mallocMC_ENABLE_TEST_COVERAGE "Enable test coverage" OFF)
+option(mallocMC_TEST_INSTALLED_VERSION "Test the version found by find_package" OFF)
+
+# --- Import tools ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)
+
+# ---- Dependencies ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
+CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/add_controlled.cmake)
+
+add_controlled("alpaka" REQUIRED PREFIX mallocMC)
+add_controlled("Catch2" REQUIRED PREFIX mallocMC)
+
+if(NOT TARGET mallocMC)
+    if(mallocMC_TEST_INSTALLED_VERSION)
+        find_package(mallocMC REQUIRED)
+    else()
+        CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+    endif()
+endif()
+
+# ---- Create binary ----
+
+file(GLOB sources CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/source/*.cpp)
+alpaka_add_executable(${PROJECT_NAME} ${sources})
+target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka Catch2::Catch2WithMain)
+set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
+
+# enable compiler warnings
+if(NOT mallocMC_TEST_INSTALLED_VERSION)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+        target_compile_options(
+            mallocMC
+            INTERFACE
+            -Wall
+            # nvcc generates C code which uses directives GCC complains about like this:
+            # warning: style of line directive is a GCC extension
+            # So we can't use -pedantic here.
+            $<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wpedantic>
+            -Wextra
+            # Somehow, with the commandline that CMake composes nvcc misinterprets the flag
+            # after -Werror as an argument to -Werror leading to errors like
+            # nvcc fatal : Value '-Wpedantic' is not defined for option 'Werror'
+            # So, we can't compile with -Werror for nvcc.
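+            # Hence the guard below, which is intended to apply -Werror only when not compiling CUDA sources.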
+            $<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Werror>
+        )
+    elseif(MSVC)
+        target_compile_options(mallocMC INTERFACE /W4 /WX)
+    endif()
+endif()
+
+# ---- Add mallocMCTests ----
+
+enable_testing()
+add_test(${PROJECT_NAME} ${PROJECT_NAME})
+
+# ---- code coverage ----
+
+if(mallocMC_ENABLE_TEST_COVERAGE)
+    target_compile_options(mallocMC INTERFACE -O0 -g -fprofile-arcs -ftest-coverage)
+    target_link_options(mallocMC INTERFACE -fprofile-arcs -ftest-coverage)
+endif()
diff --git a/test/unit/source/AccessBlock.cpp b/test/unit/source/AccessBlock.cpp
new file mode 100644
index 0000000000..d335829868
--- /dev/null
+++ b/test/unit/source/AccessBlock.cpp
@@ -0,0 +1,655 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Julian Johannes Lenz, Rene Widera
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/ + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" +#include "mallocMC/mallocMC_utils.hpp" +#include "mocks.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +template +struct TestableAccessBlock + : mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock +{ +public: + explicit TestableAccessBlock(auto const& acc) + : mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock(acc) {}; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::blockSize; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::pageSize; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::wasteFactor; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock:: + resetfreedpages; +}; + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskStorageType; +using mallocMC::CreationPolicies::FlatterScatterAlloc::PageInterpretation; + +constexpr uint32_t const pageTableEntrySize = 8U; +constexpr uint32_t const pageSize1 = 1024U; +constexpr uint32_t const pageSize2 = 4096U; + +using AccessBlocks = std::tuple< + TestableAccessBlock, AlignmentPolicy>, + TestableAccessBlock, AlignmentPolicy>, + TestableAccessBlock, AlignmentPolicy>, + TestableAccessBlock, AlignmentPolicy>>; + +template +auto fillWith(TestableAccessBlock& accessBlock, uint32_t const chunkSize) + -> std::vector +{ + std::vector pointers(accessBlock.getAvailableSlots(accSerial, chunkSize)); + std::generate( + std::begin(pointers), + std::end(pointers), + [&accessBlock, chunkSize]() + { + void* pointer = accessBlock.create(accSerial, chunkSize); + REQUIRE(pointer != nullptr); + return pointer; + }); + return pointers; +} + +template +struct SelectivelyWastedHeapConfig : HeapConfig +{ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( + auto const& /*acc*/, + uint32_t const chunkSize, + uint32_t const numBytes) + { + auto currentWasteFactor = (numBytes == T_allowedToWasteNumBytes) ? T_wasteFactor : 1U; + return (chunkSize >= numBytes && chunkSize <= currentWasteFactor * numBytes); + } +}; + +TEMPLATE_LIST_TEST_CASE("AccessBlock", "", AccessBlocks) +{ + using AccessBlock = TestType; + constexpr auto const blockSize = AccessBlock::blockSize; + constexpr auto const pageSize = AccessBlock::pageSize; + + AccessBlock accessBlock{accSerial}; + + SECTION("knows its number of pages.") + { + // The overhead from the metadata is small enough that this just happens to round down to the correct values. + // If you choose weird numbers, it might no longer. + CHECK(accessBlock.numPages() == blockSize / pageSize); + } + + SECTION("knows its available slots.") + { + uint32_t const chunkSize = GENERATE(1U, 2U, 32U, 57U, 1024U); + // This is not exactly true. It is only true because the largest chunk size we chose above is exactly the size + // of one page. In general, this number would be fractional for larger than page size chunks but I don't want + // to bother right now: + uint32_t slotsPerPage = chunkSize < pageSize ? 
PageInterpretation::numChunks(chunkSize) : 1U; + + uint32_t numOccupied = GENERATE(0U, 1U, 10U); + uint32_t actualNumOccupied = numOccupied; + for(uint32_t i = 0; i < numOccupied; ++i) + { + if(accessBlock.create(accSerial, chunkSize) == nullptr) + { + actualNumOccupied--; + } + } + + auto totalSlots = accessBlock.numPages() * slotsPerPage; + if(totalSlots > actualNumOccupied) + { + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == totalSlots - actualNumOccupied); + } + else + { + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == 0U); + } + } + + constexpr uint32_t const chunkSize = 32U; + + SECTION("creates") + { + SECTION("no nullptr if memory is available.") + { + // This is not a particularly hard thing to do because any uninitialised pointer that could be returned is + // most likely not exactly the nullptr. We just leave this in as it currently doesn't hurt anybody to keep + // it. + CHECK(accessBlock.create(accSerial, chunkSize) != nullptr); + } + + SECTION("memory that can be written to and read from.") + { + uint32_t const arbitraryValue = 42; + auto* ptr = static_cast(accessBlock.create(accSerial, chunkSize)); + REQUIRE(ptr != nullptr); + *ptr = arbitraryValue; + CHECK(*ptr == arbitraryValue); + } + + SECTION("second memory somewhere else.") + { + CHECK(accessBlock.create(accSerial, chunkSize) != accessBlock.create(accSerial, chunkSize)); + } + + SECTION("memory of different chunk size in different pages.") + { + constexpr uint32_t const chunkSize2 = 512U; + REQUIRE(chunkSize != chunkSize2); + // To be precise, the second call will actually return a nullptr if there is only a single page (which is + // one of the test cases at the time of writing). But that technically passes this test, too. + + CHECK( + accessBlock.pageIndex(accessBlock.create(accSerial, chunkSize)) + != accessBlock.pageIndex(accessBlock.create(accSerial, chunkSize2))); + } + + SECTION("nullptr if there's no page with fitting chunk size") + { + // This requests one chunk of a different chunk size for each page. As a new page is required each time, + // all pages have a chunk size set at the end. And none of those is `chunkSize`. 
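+            // (For example, with chunkSize == 32 the loop assigns chunk sizes 33, 34, 35, ... to the pages.)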
+ for(uint32_t index = 0; index < accessBlock.numPages(); ++index) + { + auto const differentChunkSize = chunkSize + 1U + index; + REQUIRE(chunkSize != differentChunkSize); + accessBlock.create(accSerial, differentChunkSize); + } + + CHECK(accessBlock.create(accSerial, chunkSize) == nullptr); + } + + SECTION("nullptr if all pages have full filling level.") + { + fillWith(accessBlock, chunkSize); + CHECK(accessBlock.create(accSerial, chunkSize) == nullptr); + } + + SECTION("last remaining chunk.") + { + auto pointers = fillWith(accessBlock, chunkSize); + uint32_t const index = GENERATE(0U, 1U, 42U); + void* pointer = pointers[std::min(index, static_cast(pointers.size()) - 1)]; + accessBlock.destroy(accSerial, pointer); + CHECK(accessBlock.create(accSerial, chunkSize) == pointer); + } + + SECTION("memory larger than page size.") + { + if(accessBlock.numPages() >= 2U) + { + CHECK(accessBlock.isValid(accSerial, accessBlock.create(accSerial, 2U * pageSize))); + } + } + + SECTION("nullptr if chunkSize is larger than total available memory in pages.") + { + // larger than the available memory but in some cases smaller than the block size even after subtracting + // the space for the page table: + uint32_t const excessiveChunkSize = accessBlock.numPages() * pageSize + 1U; + CHECK(accessBlock.create(accSerial, excessiveChunkSize) == nullptr); + } + + SECTION("in the correct place for larger than page size.") + { + // we want to allocate 2 pages: + if(accessBlock.numPages() > 1U) + { + auto pointers = fillWith(accessBlock, pageSize); + std::sort(std::begin(pointers), std::end(pointers)); + + // Now, we free two contiguous chunks such that there is one deterministic spot wherefrom our request + // can be served. + uint32_t index = GENERATE(0U, 1U, 5U); + index = std::min(index, static_cast(pointers.size()) - 2U); + accessBlock.destroy(accSerial, pointers[index]); + accessBlock.destroy(accSerial, pointers[index + 1]); + + // Must be exactly where we free'd the pages: + CHECK( + accessBlock.pageIndex(accessBlock.create(accSerial, 2U * pageSize)) + == static_cast(index)); + } + } + + SECTION("a pointer and knows it's valid afterwards.") + { + void* pointer = accessBlock.create(accSerial, chunkSize); + CHECK(accessBlock.isValid(accSerial, pointer)); + } + + SECTION("the last pointer in page and its allocation does not reach into the bit field.") + { + auto slots = accessBlock.getAvailableSlots(accSerial, chunkSize); + // Find the last allocation on the first page: + auto pointers = fillWith(accessBlock, chunkSize); + std::sort(std::begin(pointers), std::end(pointers)); + auto lastOfPage0 = pointers[slots / accessBlock.numPages() - 1]; + + // Free the first bit of the bit field by destroying the first allocation in the first page: + accessBlock.destroy(accSerial, pointers[0]); + REQUIRE(not accessBlock.isValid(accSerial, pointers[0])); + + // Write all ones to the last of the first page: If there is an overlap between the region of the last + // chunk and the bit field, our recently free'd first chunk will have its bit set by this operation. + char* begin = reinterpret_cast(lastOfPage0); + auto* end = begin + chunkSize; + std::fill(begin, end, 255U); + + // Now, we try to allocate one more chunk. It must be the one we free'd before. + CHECK(accessBlock.create(accSerial, chunkSize) == pointers[0]); + REQUIRE(accessBlock.isValid(accSerial, pointers[0])); + } + + SECTION("and writes something very close to page size.") + { + // This is a regression test. 
The original version of the code started to use multi-page mode when numBytes + // >= pageSize. That is too late because if we're not in multi-page mode, we need to leave some space for + // the bit mask. Thus, the following test would corrupt the bit mask, if we were to allocate this in + // chunked mode. + +#if (!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + REQUIRE(sizeof(BitMaskStorageType<>) > 1U); + auto localChunkSize = pageSize - 1U; + auto slots = accessBlock.getAvailableSlots(accSerial, localChunkSize); + auto pointer = accessBlock.create(accSerial, localChunkSize); + REQUIRE(slots == accessBlock.getAvailableSlots(accSerial, localChunkSize) + 1); + memset(pointer, 0, localChunkSize); + CHECK_NOTHROW(accessBlock.destroy(accSerial, pointer)); +#else + SUCCEED("This bug actually never had any observable behaviour in NDEBUG mode because the corrupted bit " + "mask is never read again."); +#endif // NDEBUG + } + + SECTION("with waste factor") + { + constexpr uint32_t const wastefactor = 3U; + TestableAccessBlock, AlignmentPolicy> wastedAccessBlock{ + accSerial}; + auto pointers = fillWith(wastedAccessBlock, chunkSize); + + auto smallerChunkSize = chunkSize / (wastefactor - 1U); + REQUIRE(smallerChunkSize < chunkSize); + + wastedAccessBlock.destroy(accSerial, pointers[0]); + + // Some consistency checks: Interpreting as an access block without waste factor, we'll surely have no + // available memory for this chunk size. + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, smallerChunkSize) + == 0U); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->create(accSerial, smallerChunkSize) == nullptr); + + SECTION("knows its available slots.") + { + CHECK(wastedAccessBlock.getAvailableSlots(accSerial, smallerChunkSize) == 1U); + } + + SECTION("creates a smaller chunk size.") + { + CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == pointers[0]); + } + + SECTION("fails to create too many smaller chunks.") + { + CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == pointers[0]); + CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == nullptr); + } + + SECTION("is not misled by mixing above and below multi-page threshold.") + { + auto const aboveMultiPageThreshold = pageSize - 2 * sizeof(BitMaskStorageType<>); + auto const belowMultiPageThreshold = aboveMultiPageThreshold / (wastefactor - 1U); + for(auto const pointer : pointers) + { + // free one page we want to operate on + if(wastedAccessBlock.isValid(accSerial, pointer) and wastedAccessBlock.pageIndex(pointer) == 0U) + { + wastedAccessBlock.destroy(accSerial, pointer); + } + } + REQUIRE(wastedAccessBlock.getAvailableSlots(accSerial, belowMultiPageThreshold) == 2U); + REQUIRE(wastedAccessBlock.getAvailableSlots(accSerial, aboveMultiPageThreshold) == 1U); + + // This allocates in multi-page mode. + CHECK(wastedAccessBlock.pageIndex(wastedAccessBlock.create(accSerial, aboveMultiPageThreshold)) == 0U); + // This tries to allocate in chunked mode but the waste factor would allow to create on the just + // allocated page. This is, of course, forbidden. 
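+                // (A page backing a multi-page allocation is owned exclusively by that allocation, so no chunked
+                // create may be served from it, waste factor or not.)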
+ CHECK(wastedAccessBlock.create(accSerial, aboveMultiPageThreshold) == nullptr); + } + } + + SECTION("with waste function") + { + constexpr uint32_t const wastefactor = 3U; + constexpr uint32_t const selectedNumBytes = mallocMC::ceilingDivision(chunkSize, wastefactor); + TestableAccessBlock< + SelectivelyWastedHeapConfig, + AlignmentPolicy> + wastedAccessBlock{accSerial}; + auto pointers = fillWith(wastedAccessBlock, chunkSize); + + auto notSelectedNumBytes = chunkSize / (wastefactor - 1U); + + // Okay, so we want a scenario where both selectedNumBytes and notSelectedNumBytes are within the range of + // the waste factor but only for selectedNumBytes we'll actually get a waste-factor-like behaviour. + REQUIRE(selectedNumBytes < chunkSize); + REQUIRE(selectedNumBytes * wastefactor >= chunkSize); + REQUIRE(selectedNumBytes < notSelectedNumBytes); + REQUIRE(notSelectedNumBytes < chunkSize); + + wastedAccessBlock.destroy(accSerial, pointers[0]); + + // Some consistency checks: Interpreting as an access block without waste factor, we'll surely have no + // available memory for these chunk sizes. + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, notSelectedNumBytes) + == 0U); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, selectedNumBytes) + == 0U); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->create(accSerial, selectedNumBytes) == nullptr); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->create(accSerial, notSelectedNumBytes) == nullptr); + + SECTION("knows its available slots.") + { + CHECK(wastedAccessBlock.getAvailableSlots(accSerial, selectedNumBytes) == 1U); + CHECK(wastedAccessBlock.getAvailableSlots(accSerial, notSelectedNumBytes) == 0U); + } + + SECTION("creates a smaller chunk size.") + { + CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); + CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == pointers[0]); + } + + SECTION("fails to create too many smaller chunks.") + { + CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); + CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); + CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == pointers[0]); + CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == nullptr); + } + } + } + + SECTION("destroys") + { + void* pointer = accessBlock.create(accSerial, chunkSize); + REQUIRE(accessBlock.isValid(accSerial, pointer)); + + SECTION("a pointer thereby invalidating it.") + { + accessBlock.destroy(accSerial, pointer); + CHECK(not accessBlock.isValid(accSerial, pointer)); + } + + SECTION("the whole page if last pointer is destroyed.") + { + REQUIRE(chunkSize != pageSize); + REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages() - 1); + accessBlock.destroy(accSerial, pointer); + CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); + } + + SECTION("not the whole page if there still exists a valid pointer.") + { + REQUIRE(chunkSize != pageSize); + auto unOccupiedPages = accessBlock.numPages(); + void* newPointer{nullptr}; + // We can't be sure which page is used for any allocation, so we allocate again and again until we have hit + // a page that already has an allocation: + while(accessBlock.getAvailableSlots(accSerial, pageSize) != unOccupiedPages) + { + unOccupiedPages = accessBlock.getAvailableSlots(accSerial, pageSize); + newPointer = accessBlock.create(accSerial, chunkSize); + } + accessBlock.destroy(accSerial, 
newPointer); + CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == unOccupiedPages); + } + + SECTION("one slot without touching the others.") + { + // this won't be touched: + accessBlock.create(accSerial, chunkSize); + auto originalSlots = accessBlock.getAvailableSlots(accSerial, chunkSize); + accessBlock.destroy(accSerial, pointer); + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == originalSlots + 1U); + } + + SECTION("no invalid pointer but throws instead.") + { +#if (!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + pointer = nullptr; + CHECK_THROWS( + accessBlock.destroy(accSerial, pointer), + std::runtime_error{"Attempted to destroy an invalid pointer!"}); +#endif // NDEBUG + } + + SECTION("pointer for larger than page size") + { + if(accessBlock.numPages() > 1U) + { + accessBlock.destroy(accSerial, pointer); + REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); + + pointer = accessBlock.create(accSerial, 2U * pageSize); + REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages() - 2); + REQUIRE(accessBlock.isValid(accSerial, pointer)); + + accessBlock.destroy(accSerial, pointer); + + SECTION("thereby invalidating it.") + { + CHECK(not accessBlock.isValid(accSerial, pointer)); + } + + SECTION("thereby freeing up their pages.") + { + CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); + } + } + } + + SECTION("and doesn't reset the page.") + { + auto& unresettingAccessBlock = *reinterpret_cast< + TestableAccessBlock, AlignmentPolicy>*>( + &accessBlock); + auto const differentChunkSize = GENERATE(17, 2048); + REQUIRE(differentChunkSize != chunkSize); + auto const slots = unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize); + + unresettingAccessBlock.destroy(accSerial, pointer); + CHECK(unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize) == slots); + } + + SECTION("and always resets the page for larger than page size.") + { + auto& unresettingAccessBlock = *reinterpret_cast< + TestableAccessBlock, AlignmentPolicy>*>( + &accessBlock); + auto const differentChunkSize = GENERATE(17, 2048); + auto const slots = unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize); + auto* largePointer = accessBlock.create(accSerial, pageSize); + if(largePointer != nullptr) + { + REQUIRE(differentChunkSize != chunkSize); + + unresettingAccessBlock.destroy(accSerial, largePointer); + CHECK(unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize) == slots); + } + } + } +} + +TEST_CASE("AccessBlock (Regression)") +{ + SECTION("Mutable lambda") + { + // We had reason to believe that the read-in chunk size in `thisPageIsSuitable` might not have been propagated + // up to `createChunk` correctly. This would have led to the page being interpreted as having the wrong chunk + // size in cases with a waste factor. The following scenario excludes this: + // + // Assume that we have one page with a chunk size such that it contains exactly one bit mask full of chunks and + // fill this with allocations. We write ones to every bit in those allocations, so every single bit on this + // page is set to one (because the mask is also completely filled). + // + // Now, we deallocate the pointer to the very first chunk. This unsets exactly one bit, namely the first of the + // bit mask. 
Next, we request one allocation with a slightly smaller chunk size such that we would need one
+        // more bit from a second bit mask and fill this allocation with zeros.
+        //
+        // The CORRECT behaviour is as follows:
+        // - For the smaller allocation, the page still gets interpreted with the larger chunk size such that it
+        // correctly interprets itself to have only a single bit mask.
+        // - Looking for a free chunk, it therefore finds the first bit to be unset and (re-)allocates the first chunk
+        // for the slightly smaller chunk size.
+        // - There is no overlap of allocations and all writes are independent. We expect all allocations of the larger
+        // chunk size to contain only set bits and the allocation with the smaller chunk size to only contain unset
+        // bits.
+        // - The pointer for the smaller-size allocation points to the same location as the freed pointer.
+        //
+        // The suspected WRONG behaviour could have been as follows:
+        // - For the smaller allocation, the page gets wrongly interpreted with the SMALLER chunk size such that it
+        // wrongly assumes itself to host two bit masks.
+        // - Looking for a free chunk, the "first bit mask" is completely set because it is actually the data region
+        // of the last chunk and we have set all bits in there. But what is considered as the "second bit mask" (but is
+        // actually the first and only one) has a free bit in the first position (which is the only allowed bit in that
+        // mask). Hence, the allocation takes place near the end of the page in the last chunk.
+        // - This allocation overlaps with a previous allocation and the original data get corrupted when we write our
+        // zeros.
+        //
+        // The wrong behaviour was not actually observed but excluded during debugging via this test.
+
+        using namespace mallocMC::CreationPolicies::FlatterScatterAlloc;
+
+        // For this to fail we would need a waste factor:
+        constexpr uint32_t wastefactor = 2U;
+
+        // Fits exactly onto one bit mask:
+        constexpr uint32_t numChunksOneMask = BitMaskSize;
+        // Needs a second bit mask:
+        constexpr uint32_t numChunksTwoMasks = BitMaskSize + 1;
+
+        // It was a bit of fiddling around to get these to do exactly what we want.
+        constexpr uint32_t chunkSizeTwoMasks = 2 * numChunksOneMask;
+        constexpr uint32_t chunkSizeOneMask = chunkSizeTwoMasks + (sizeof(BitMask) - 1);
+
+        // This data size is suited to fit both numbers of chunks:
+        constexpr uint32_t dataSize = numChunksOneMask * chunkSizeOneMask * AlignmentPolicy::Properties::dataAlignment;
+
+        // Don't forget to add some room for the bit mask:
+        constexpr uint32_t pageSize = dataSize + sizeof(BitMask);
+
+        // And the page table, also don't forget the page table.
+        using AccessBlock
+            = TestableAccessBlock, AlignmentPolicy>;
+
+        AccessBlock accessBlock{accSerial};
+
+        REQUIRE(accessBlock.getAvailableSlots(accSerial, chunkSizeOneMask) == numChunksOneMask);
+        REQUIRE(accessBlock.getAvailableSlots(accSerial, chunkSizeTwoMasks) == numChunksTwoMasks);
+
+        auto pointers = fillWith(accessBlock, chunkSizeOneMask);
+
+        // Fill all memory with ones.
+        for(void* pointer : pointers)
+        {
+            auto mem = std::span(static_cast(pointer), chunkSizeOneMask);
+            for(auto& byte : mem)
+            {
+                byte = std::numeric_limits::max();
+            }
+        }
+
+        // Free up the pointer to the first chunk.
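+        // (Sorting puts the lowest address first, so pointers[0] is the first chunk on the page.)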
+ std::ranges::sort(pointers); + auto freedPointer = pointers[0]; + accessBlock.destroy(accSerial, freedPointer); + + void* pointerTwoMasks = accessBlock.create(accSerial, chunkSizeTwoMasks); + for(auto& c : std::span(static_cast<char*>(pointerTwoMasks), chunkSizeTwoMasks)) + { + c = 0U; + } + + // Check for corrupted memory. We might have written zeros here: + for(void* pointer : pointers) + { + if(pointer != freedPointer) + { + auto mem = std::span(static_cast<char*>(pointer), chunkSizeOneMask); + CHECK(std::all_of( + mem.begin(), + mem.end(), + [](auto const val) + { return val == std::numeric_limits<std::remove_cvref_t<decltype(val)>>::max(); })); + } + } + + auto mem = std::span(static_cast<char*>(pointerTwoMasks), chunkSizeTwoMasks); + CHECK(std::all_of(mem.begin(), mem.end(), [](auto const val) { return val == 0U; })); + + // Now, we want to be really explicit: + CHECK(pointerTwoMasks == freedPointer); + + // This is a general check, but it should pass even if the previous checks fail, because only the filling + // level is considered here: + CHECK(accessBlock.getAvailableSlots(accSerial, AlignmentPolicy::Properties::dataAlignment) == 0U); + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSizeOneMask) == 0U); + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSizeTwoMasks) == 0U); + } +} diff --git a/test/unit/source/Allocator.cpp b/test/unit/source/Allocator.cpp new file mode 100644 index 0000000000..59aba16950 --- /dev/null +++ b/test/unit/source/Allocator.cpp @@ -0,0 +1,62 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2025 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE.
+*/ + +#include "mallocMC/allocator.hpp" + +#include "mallocMC/alignmentPolicies/Shrink.hpp" +#include "mallocMC/creationPolicies/FlatterScatter.hpp" +#include "mallocMC/distributionPolicies/Noop.hpp" +#include "mallocMC/oOMPolicies/ReturnNull.hpp" +#include "mallocMC/reservePoolPolicies/AlpakaBuf.hpp" + +#include + +#include +#include +using Dim = alpaka::DimInt<1>; +using Idx = std::size_t; + +// Define the device accelerator +using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>; + +TEST_CASE("Allocator") +{ + SECTION("can be initialised with 0 memory.") + { + auto const platform = alpaka::Platform<Acc>{}; + auto const dev = alpaka::getDevByIdx(platform, 0); + auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev}; + + mallocMC::Allocator< + alpaka::AccToTag<Acc>, + mallocMC::CreationPolicies::FlatterScatter<>, + mallocMC::DistributionPolicies::Noop, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>, + mallocMC::AlignmentPolicies::Shrink<>> + allocator{dev, queue, 0}; + } +} diff --git a/test/unit/source/BitField.cpp b/test/unit/source/BitField.cpp new file mode 100644 index 0000000000..e791289156 --- /dev/null +++ b/test/unit/source/BitField.cpp @@ -0,0 +1,247 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE.
+*/ + +#include "mallocMC/mallocMC_utils.hpp" +#include "mocks.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitFieldFlatImpl; +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskImpl; + +using BitMaskSizes = std::tuple< + std::integral_constant<uint32_t, 16U>, // NOLINT(*magic-number*) + std::integral_constant<uint32_t, 32U>, // NOLINT(*magic-number*) + std::integral_constant<uint32_t, 64U>>; // NOLINT(*magic-number*) + +TEMPLATE_LIST_TEST_CASE("BitMask", "", BitMaskSizes) +{ + constexpr uint32_t const BitMaskSize = TestType::value; + using BitMask = BitMaskImpl<BitMaskSize>; + BitMask mask{}; + + SECTION("is initialised to 0.") + { + CHECK(mask == 0U); + } + + SECTION("can have individual bits read.") + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + CHECK(mask(accSerial, i) == false); + } + } + + SECTION("allows to write individual bits.") + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + mask.set(accSerial, i); + CHECK(mask(accSerial, i)); + } + } + + SECTION("allows to unset individual bits afterwards.") + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + mask.set(accSerial, i); + for(uint32_t j = 0; j < BitMaskSize; ++j) + { + CHECK(mask(accSerial, j) == (i == j)); + } + mask.unset(accSerial, i); + } + } + + + SECTION("knows the first free bit.") + { + mask.flip(accSerial); + uint32_t const index = GENERATE(0, 3); + mask.flip(accSerial, index); + CHECK(mask.firstFreeBit(accSerial, BitMaskSize) == index); + } + + SECTION("returns BitMaskSize as first free bit if there is none.") + { + mask.flip(accSerial); + CHECK(mask.firstFreeBit(accSerial, BitMaskSize) == BitMaskSize); + } + + SECTION("knows the first free bit with startIndex.") + { + mask.set(accSerial); + uint32_t index1 = GENERATE(0, 5); + uint32_t index2 = GENERATE(0, 11); + if(index1 > index2) + { + std::swap(index1, index2); + } + uint32_t const startIndex = GENERATE(0, 4, 5, 6); + mask.unset(accSerial, index1); + mask.unset(accSerial, index2); + // This is the currently implemented algorithm and could be considered overspecifying the result. + // The minimal requirement we should have is that firstFreeBit is an element of {index1, index2}. + CHECK(mask.firstFreeBit(accSerial, BitMaskSize, startIndex) == ((startIndex == index2) ? index2 : index1)); + } +} + +TEMPLATE_LIST_TEST_CASE("BitFieldFlat", "", BitMaskSizes) +{ + constexpr uint32_t const BitMaskSize = TestType::value; + using BitMask = BitMaskImpl<BitMaskSize>; + using BitFieldFlat = BitFieldFlatImpl<BitMaskSize>; + + // This is potentially larger than we actually need but that's okay: + constexpr uint32_t const numChunks = 256U; + constexpr uint32_t const numMasks = mallocMC::ceilingDivision(numChunks, BitMaskSize); + BitMask data[numMasks]; + + SECTION("knows its only free bit.") + { + uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); + for(auto& mask : data) + { + mask.set(accSerial); + } + data[index / BitMaskSize].unset(accSerial, index % BitMaskSize); + + // Just to be sure: The masks look as expected.
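+ // Bit (j * BitMaskSize + i) is the global bit position; every position except `index` must be set.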
+ for(uint32_t j = 0; j < numMasks; ++j) + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + REQUIRE(data[j](accSerial, i) == (j * BitMaskSize + i != index)); + } + } + + BitFieldFlat field{data}; + + CHECK(field.firstFreeBit(accSerial, numChunks) == index); + } + + SECTION("knows a free bit if later ones are free, too.") + { + uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); + for(auto& mask : std::span{static_cast<BitMask*>(data), index / BitMaskSize}) + { + mask.set(accSerial); + } + for(uint32_t i = 0; i < index % BitMaskSize; ++i) + { + data[index / BitMaskSize].set(accSerial, i); + } + + BitFieldFlat field{data}; + + CHECK(field.firstFreeBit(accSerial, numChunks) >= index); + } + + SECTION("knows its first free bit for different numChunks.") + { + auto localNumChunks = numChunks / GENERATE(1, 2, 3); + std::span localData{static_cast<BitMask*>(data), mallocMC::ceilingDivision(localNumChunks, BitMaskSize)}; + uint32_t const index = GENERATE(0, 1, 10, 12); + for(auto& mask : localData) + { + mask.set(accSerial); + } + localData[index / BitMaskSize].unset(accSerial, index % BitMaskSize); + + BitFieldFlat field{localData}; + + CHECK(field.firstFreeBit(accSerial, numChunks) == index); + } + + SECTION("sets a bit.") + { + BitFieldFlat field{data}; + uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); + field.set(accSerial, index); + for(uint32_t i = 0; i < numChunks; ++i) + { + CHECK(field.get(accSerial, i) == (i == index)); + } + } + + SECTION("sets two bits.") + { + BitFieldFlat field{data}; + uint32_t const firstIndex = GENERATE(0, 1, numChunks / 2, numChunks - 1); + uint32_t const secondIndex = GENERATE(2, numChunks / 3, numChunks / 2, numChunks - 1); + field.set(accSerial, firstIndex); + field.set(accSerial, secondIndex); + for(uint32_t i = 0; i < numChunks; ++i) + { + CHECK(field.get(accSerial, i) == (i == firstIndex || i == secondIndex)); + } + } + + SECTION("returns numChunks if no free bit is found.") + { + BitFieldFlat field{data}; + for(uint32_t i = 0; i < numChunks; ++i) + { + field.set(accSerial, i); + } + CHECK(field.firstFreeBit(accSerial, numChunks) == numChunks); + } + + SECTION("returns numChunks if free bit is not valid.") + { + BitFieldFlat field{data}; + uint32_t const numValidBits = GENERATE(1, numChunks / 2, numChunks - 1); + for(uint32_t i = 0; i < numValidBits; ++i) + { + // We are filling up all valid bits. + field.set(accSerial, i); + } + CHECK(field.firstFreeBit(accSerial, numValidBits) == numChunks); + } +} diff --git a/test/unit/source/PageInterpretation.cpp b/test/unit/source/PageInterpretation.cpp new file mode 100644 index 0000000000..018269bf64 --- /dev/null +++ b/test/unit/source/PageInterpretation.cpp @@ -0,0 +1,316 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures.
+ + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" + +#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" +#include "mallocMC/mallocMC_utils.hpp" +#include "mocks.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMask; +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskSize; +using mallocMC::CreationPolicies::FlatterScatterAlloc::DataPage; +using mallocMC::CreationPolicies::FlatterScatterAlloc::PageInterpretation; +using std::distance; + +template<uint32_t T_BitMaskSize> +constexpr std::array<uint32_t, 9U> const + chunkSizesForReportingTests{1, 2, 4, 5, 10, 11, 31, 32, 512}; // NOLINT(*magic-number*) + +template<uint32_t T_BitMaskSize> +constexpr std::array<uint32_t, 9U> const expectedNumChunksForReportingTests{}; + +template<> +constexpr std::array<uint32_t, 9U> const + expectedNumChunksForReportingTests<32U>{908, 480, 248, 199, 100, 92, 32, 31, 1}; // NOLINT(*magic-number*) + +template<> +constexpr std::array<uint32_t, 9U> const + expectedNumChunksForReportingTests<64U>{904, 480, 248, 198, 100, 91, 32, 31, 1}; // NOLINT(*magic-number*) + +TEST_CASE("PageInterpretation") +{ + constexpr uint32_t const pageSize = 1024U; + constexpr uint32_t const chunkSize = 32U; + DataPage<pageSize> data{}; + PageInterpretation<pageSize> page{data, chunkSize}; + + SECTION("refers to the same data it was created with.") + { + CHECK(&data == page.chunkPointer(0)); + } + + SECTION("returns start of data as first chunk.") + { + CHECK(page.chunkPointer(0) == &data); + } + + SECTION("computes correct number of chunks.") + { + for(uint32_t i = 0U; i < chunkSizesForReportingTests<BitMaskSize>.size(); ++i) + { + CHECK( + PageInterpretation<pageSize>::numChunks(chunkSizesForReportingTests<BitMaskSize>[i]) + == expectedNumChunksForReportingTests<BitMaskSize>[i]); + } + } + + SECTION("jumps by chunkSize between indices.") + { + for(auto i = 0U; i < (pageSize / chunkSize) - 1; ++i) + { + CHECK( + distance( + reinterpret_cast<char*>(page.chunkPointer(i)), + reinterpret_cast<char*>(page.chunkPointer(i + 1))) + == chunkSize); + } + } + + SECTION("knows the maximal bit field size.") + { + CHECK( + page.maxBitFieldSize() + == mallocMC::ceilingDivision(PageInterpretation<pageSize>::numChunks(1U), BitMaskSize) + * sizeof(BitMask)); + CHECK( + PageInterpretation<pageSize, 32U>::maxBitFieldSize() +
== mallocMC::ceilingDivision(PageInterpretation<pageSize>::numChunks(32U), BitMaskSize) + * sizeof(BitMask)); + CHECK( + PageInterpretation<pageSize, 16U>::maxBitFieldSize() + == mallocMC::ceilingDivision(PageInterpretation<pageSize>::numChunks(16U), BitMaskSize) + * sizeof(BitMask)); + CHECK( + PageInterpretation<pageSize, 17U>::maxBitFieldSize() + == mallocMC::ceilingDivision(PageInterpretation<pageSize>::numChunks(17U), BitMaskSize) + * sizeof(BitMask)); + } + + SECTION("reports numChunks that fit the page.") + { + CHECK( + page.numChunks() * chunkSize + + static_cast<uint32_t>(mallocMC::ceilingDivision(page.numChunks(), BitMaskSize) * sizeof(BitMask)) + <= pageSize); + } + + SECTION("knows correct bit field size.") + { + uint32_t const numChunks = GENERATE(2, BitMaskSize - 1, BitMaskSize, 2 * BitMaskSize); + uint32_t localChunkSize = pageSize / numChunks; + PageInterpretation<pageSize> localPage{data, localChunkSize}; + CHECK(localPage.bitFieldSize() == sizeof(BitMask) * mallocMC::ceilingDivision(numChunks, BitMaskSize)); + } +} + +TEST_CASE("PageInterpretation.create") +{ + // Such that we can fit up to four levels of hierarchy in there: + constexpr uint32_t const pageSize + = BitMaskSize * BitMaskSize * BitMaskSize + static_cast<uint32_t>(BitMaskSize * sizeof(BitMask)); + // This might be a lot of memory, up to a typical stack's size. Let's save us some trouble and create it on the + // heap. + auto actualData = std::make_unique<DataPage<pageSize>>(); + DataPage<pageSize>& data{*actualData}; + + uint32_t numChunks = GENERATE(BitMaskSize, BitMaskSize * BitMaskSize); + // CAUTION: Only works for full bit masks: + uint32_t chunkSize = (pageSize - (numChunks / BitMaskSize) * sizeof(BitMask)) / numChunks; + PageInterpretation<pageSize> page{data, chunkSize}; + + SECTION("returns a pointer to within the data.") + { + auto* pointer = page.create(accSerial); + CHECK( + std::distance(reinterpret_cast<char*>(page.chunkPointer(0)), reinterpret_cast<char*>(pointer)) + < std::distance( + reinterpret_cast<char*>(page.chunkPointer(0)), + reinterpret_cast<char*>(page.bitFieldStart()))); + } + + SECTION("returns a pointer to the start of a chunk.") + { + auto* pointer = page.create(accSerial); + CHECK( + std::distance(reinterpret_cast<char*>(page.chunkPointer(0)), reinterpret_cast<char*>(pointer)) % chunkSize + == 0U); + } + + SECTION("returns nullptr if everything is full.") + { + for(auto& mask : page.bitField()) + { + mask.set(accSerial); + } + auto* pointer = page.create(accSerial); + CHECK(pointer == nullptr); + } + + SECTION("can provide numChunks pieces of memory and returns nullptr afterwards.") + { + for(uint32_t i = 0; i < page.numChunks(); ++i) + { + auto* pointer = page.create(accSerial); + CHECK(pointer != nullptr); + } + auto* pointer = page.create(accSerial); + CHECK(pointer == nullptr); + } + + SECTION("updates bit field.") + { + BitMask& mask{page.bitField().getMask(0)}; + REQUIRE(mask.none()); + auto* pointer = page.create(accSerial); + auto const index = page.chunkNumberOf(pointer); + CHECK(mask(accSerial, index)); + } +} + +TEST_CASE("PageInterpretation.destroy") +{ + // Such that we can fit up to four levels of hierarchy in there: + constexpr uint32_t const pageSize + = BitMaskSize * BitMaskSize * BitMaskSize * BitMaskSize + + BitMaskSize * BitMaskSize * BitMaskSize * static_cast<uint32_t>(sizeof(BitMask)); + // This is more than 8MB, which is a typical stack's size. Let's save us some trouble and create it on the heap.
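+ // A unique_ptr keeps this large DataPage off the stack and releases it automatically at the end of the test.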
+ std::unique_ptr<DataPage<pageSize>> actualData{new DataPage<pageSize>}; + DataPage<pageSize>& data{*actualData}; + + uint32_t numChunks = GENERATE(BitMaskSize * BitMaskSize, BitMaskSize); + uint32_t chunkSize = pageSize / numChunks; + PageInterpretation<pageSize> page{data, chunkSize}; + auto* pointer = page.create(accSerial); + +#if (!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + SECTION("throws if given an invalid pointer.") + { + pointer = nullptr; + CHECK_THROWS( + page.destroy(accSerial, pointer), + std::runtime_error{"Attempted to destroy an invalid pointer! Either the pointer does not point " + "to a valid chunk or it is not marked as allocated."}); + } + + SECTION("allows pointers to anywhere in the chunk.") + { + // This test documents the state as is. We haven't defined this outcome as a requirement but if we change + // it, we might still want to be aware of this because users might want to be informed. + pointer = reinterpret_cast<void*>(reinterpret_cast<char*>(pointer) + chunkSize / 2); + CHECK_NOTHROW(page.destroy(accSerial, pointer)); + } +#endif // NDEBUG + + SECTION("only ever unsets (and never sets) bits in top-level bit mask.") + { + // We extract the position of the mask before destroying the pointer because technically speaking the whole + // concept of a mask doesn't apply anymore after that pointer was destroyed because that will automatically + // free the page. + auto mask = page.bitField().getMask(0); + auto value = mask; + page.destroy(accSerial, pointer); + CHECK(mask <= value); + } + + + SECTION("cleans up in bit field region of page") + { + // This is larger than any thread would be allowed to write. Threads would only write in the region up to + // `page.numChunks() * chunkSize` not up until `pageSize`. We still do that to have a better overview over + // what was actually deleted. + memset(std::begin(data.data), std::numeric_limits<char>::max(), pageSize); + + uint32_t maxBitFieldSize = 0U; + uint32_t uncleanedSize = 0U; + SECTION("without explicit minimal chunk size") + { + maxBitFieldSize = page.maxBitFieldSize(); // NOLINT(*static*) + + SECTION("fully.") + { + uncleanedSize = 0U; + page.cleanupFull(); + } + + SECTION("only unused.") + { + uncleanedSize = page.bitFieldSize(); + page.cleanupUnused(); + } + } + + SECTION("with explicit minimal chunk size") + { + auto* localPage = reinterpret_cast*>(&page); // NOLINT(*magic-number*) + maxBitFieldSize = localPage->maxBitFieldSize(); // NOLINT(*static*) + + SECTION("fully.") + { + uncleanedSize = 0U; + localPage->cleanupFull(); + } + + SECTION("only unused.") + { + uncleanedSize = localPage->bitFieldSize(); + localPage->cleanupUnused(); + } + } + + for(uint32_t i = 0; i < pageSize; ++i) + { + CHECK( + data.data[i] + == ((i < pageSize - maxBitFieldSize) or (i >= pageSize - uncleanedSize) + ? std::numeric_limits<char>::max() + : 0)); + } + } +} + +// NOLINTEND(*widening*) diff --git a/test/unit/source/PageTable.cpp b/test/unit/source/PageTable.cpp new file mode 100644 index 0000000000..b0ea806cd7 --- /dev/null +++ b/test/unit/source/PageTable.cpp @@ -0,0 +1,54 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures.
+ + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::PageTable; + +constexpr uint32_t const numPages = 3; + +TEST_CASE("PageTable") +{ + PageTable<numPages> pageTable{}; + + SECTION("initialises chunk sizes to 0.") + { + for(auto const& chunkSize : pageTable.chunkSizes) + { + CHECK(chunkSize == 0U); + } + } + + SECTION("initialises filling levels to 0.") + { + for(auto const& fillingLevel : pageTable.fillingLevels) + { + CHECK(fillingLevel == 0U); + } + } +} diff --git a/test/unit/source/mocks.hpp b/test/unit/source/mocks.hpp new file mode 100644 index 0000000000..b1764d1302 --- /dev/null +++ b/test/unit/source/mocks.hpp @@ -0,0 +1,76 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// This is very hacky: AccCpuSerial (and in general all Accelerators) are very reluctant to be instantiated, so we do +// it the old-school way and simply malloc some memory pretending to be that accelerator. Let's hope that null-ing it +// is a valid initialisation.
The final class only has one mutable data member, so that's probably not half bad but I +// didn't go through all those hundreds of base classes. Usually, we only need the time anyway. +inline auto constructAcc() +{ + using Acc = alpaka::AccCpuSerial<alpaka::DimInt<1>, size_t>; + void* myPointer = malloc(sizeof(Acc)); + memset(myPointer, 0U, sizeof(Acc)); + return static_cast<Acc*>(myPointer); +} + +// +static inline auto const accPointer = constructAcc(); +static inline auto const& accSerial = *accPointer; + +template<uint32_t T_blockSize, uint32_t T_pageSize, uint32_t T_wasteFactor, bool T_resetfreedpages> +struct HeapConfig +{ + static constexpr auto const accessblocksize = T_blockSize; + static constexpr auto const pagesize = T_pageSize; + static constexpr auto const wastefactor = T_wasteFactor; + static constexpr auto const resetfreedpages = T_resetfreedpages; + + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( + auto const& /*acc*/, + uint32_t const chunkSize, + uint32_t const numBytes) + { + return (chunkSize >= numBytes && chunkSize <= T_wasteFactor * numBytes); + } +}; + +struct AlignmentPolicy +{ + struct Properties + { + static constexpr uint32_t const dataAlignment = 1U; + }; +}; From dbbec816766e81440ec4617e9fed44506adaf3c6 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Mon, 10 Feb 2025 13:23:28 +0100 Subject: [PATCH 3/4] Use new mallocMC CMake infrastructure --- include/pmacc/PMaccConfig.cmake | 19 ++- thirdParty/cmake-modules/FindmallocMC.cmake | 158 -------------------- 2 files changed, 12 insertions(+), 165 deletions(-) delete mode 100644 thirdParty/cmake-modules/FindmallocMC.cmake diff --git a/include/pmacc/PMaccConfig.cmake b/include/pmacc/PMaccConfig.cmake index 377ff06d22..44945db57f 100644 --- a/include/pmacc/PMaccConfig.cmake +++ b/include/pmacc/PMaccConfig.cmake @@ -97,6 +97,12 @@ set(_PMACC_MAX_ALPAKA_VERSION 1.2.0) # do not search for alpaka if it already exists # for example, a project that includes alpaka via add_subdirectory before including pmacc via add_subdirectory if(NOT TARGET alpaka::alpaka) + + # mallocMC needs this and it must be set before alpaka is added + if(alpaka_ACC_GPU_CUDA_ENABLE OR alpaka_ACC_GPU_HIP_ENABLE) + set(alpaka_INSTALL ON CACHE BOOL "" FORCE) + endif() + # the alpaka provider for the internal alpaka is only available, # if pmacc is used via add_subdirectory in another project # or examples are build @@ -378,18 +384,17 @@ endif() ################################################################################ if(alpaka_ACC_GPU_CUDA_ENABLE OR alpaka_ACC_GPU_HIP_ENABLE) - set(mallocMC_alpaka_PROVIDER "extern" CACHE STRING "Select which alpaka is used for mallocMC") - find_package(mallocMC 2.6.0 QUIET) + if(PMACC_alpaka_PROVIDER STREQUAL "intern") + set(mallocMC_USE_alpaka "${PMacc_DIR}/../../thirdParty/alpaka" CACHE STRING "Select which alpaka is used for mallocMC") + endif() + find_package(mallocMC 3.0.0 QUIET) if(NOT mallocMC_FOUND) message(STATUS "Using mallocMC from thirdParty/ directory") - set(MALLOCMC_ROOT "${PMacc_DIR}/../../thirdParty/mallocMC") - find_package(mallocMC 2.6.0 REQUIRED) + add_subdirectory("${PMacc_DIR}/../../thirdParty/mallocMC" ${CMAKE_BINARY_DIR}/mallocMC) endif(NOT mallocMC_FOUND) - target_include_directories(pmacc PUBLIC ${mallocMC_INCLUDE_DIRS}) - target_link_libraries(pmacc PUBLIC ${mallocMC_LIBRARIES}) - target_compile_definitions(pmacc PUBLIC ${mallocMC_DEFINITIONS}) + target_link_libraries(pmacc PUBLIC mallocMC::mallocMC) endif() diff --git a/thirdParty/cmake-modules/FindmallocMC.cmake b/thirdParty/cmake-modules/FindmallocMC.cmake deleted file mode 100644 index 704aae5656..0000000000 ---
a/thirdParty/cmake-modules/FindmallocMC.cmake +++ /dev/null @@ -1,158 +0,0 @@ -# - Find mallocMC library, -# Memory Allocator for Many Core Architectures -# https://github.com/ComputationalRadiationPhysics/mallocMC -# -# Use this module by invoking find_package with the form: -# find_package(mallocMC -# [version] [EXACT] # Minimum or EXACT version, e.g. 2.0.0 -# [REQUIRED] # Fail with an error if mallocMC or a required -# # component is not found -# [QUIET] # Do not warn if this module was not found -# ) -# -# To provide a hint to this module where to find the mallocMC installation, -# set the MALLOCMC_ROOT environment variable. You can also set the -# MALLOCMC_ROOT CMake variable, which will take precedence over the environment -# variable. Both hints are preferred over the DEFAULT_PATHS. -# -# This module requires CUDA and Boost. When calling it, make sure to call -# find_package(CUDA) and find_package(Boost) first. -# -# This module will define the following variables: -# mallocMC_INCLUDE_DIRS - Include directories for the mallocMC headers -# mallocMC_FOUND - TRUE if FindmallocMC found a working install -# mallocMC_VERSION - Version in format Major.Minor.Patch -# and for individual COMPONENTS: -# mallocMC__FOUND - TRUE if FindmallocMC found that component. -# The is written in original case, -# e.g., mallocMC_halloc_FOUND. -# -# The following variables are optional and only defined if the selected -# components require them: -# mallocMC_LIBRARIES - mallocMC libraries for dynamic linking using -# target_link_libraries(${mallocMC_LIBRARIES}) -# mallocMC_DEFINITIONS - Compiler definitions you should add with -# add_definitions(${mallocMC_DEFINITIONS}) -# - - -############################################################################### -# Copyright 2014-2015 Axel Huebl, Felix Schmitt, Rene Widera, -# Carlchristian Eckert -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER -# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE -# USE OR PERFORMANCE OF THIS SOFTWARE. 
-############################################################################### - - -# Required cmake version ###################################################### -# -cmake_minimum_required(VERSION 3.15.0) - - -# dependencies ################################################################ -# -set(mallocMC_ALPAKA_PROVIDER "intern" CACHE STRING "Select which alpaka is used for mallocMC") -set_property(CACHE mallocMC_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern") -mark_as_advanced(mallocMC_ALPAKA_PROVIDER) - -find_package(Boost 1.65.1 REQUIRED) - -# find mallocMC installation ################################################## -# -find_path(mallocMC_ROOT_DIR - NAMES include/mallocMC/mallocMC.hpp - PATHS ${MALLOCMC_ROOT} ENV MALLOCMC_ROOT - PATH_SUFFIXES "src" - DOC "mallocMC ROOT location" - NO_DEFAULT_PATH -) -find_path(mallocMC_ROOT_DIR - NAMES include/mallocMC/mallocMC.hpp - PATH_SUFFIXES "src" - DOC "mallocMC ROOT location" -) - -set(mallocMC_REQUIRED_VARS_LIST mallocMC_ROOT_DIR mallocMC_INCLUDE_DIRS) -mark_as_advanced(mallocMC_ROOT_DIR) - -if(mallocMC_ROOT_DIR) - # if alpaka is already available do not search again for the dependency - if(NOT TARGET alpaka::alpaka) - if(${mallocMC_ALPAKA_PROVIDER} STREQUAL "intern") - set(alpaka_BUILD_EXAMPLES OFF) - set(BUILD_TESTING OFF) - add_subdirectory(${mallocMC_ROOT_DIR}/../alpaka ${CMAKE_BINARY_DIR}/alpaka) - else() - find_package(alpaka HINTS $ENV{ALPAKA_ROOT}) - endif() - endif() - - # find version ############################################################## - # - # to do: regex me - file(STRINGS "${mallocMC_ROOT_DIR}/include/mallocMC/version.hpp" - mallocMC_VERSION_MAJOR_HPP REGEX "#define MALLOCMC_VERSION_MAJOR ") - file(STRINGS "${mallocMC_ROOT_DIR}/include/mallocMC/version.hpp" - mallocMC_VERSION_MINOR_HPP REGEX "#define MALLOCMC_VERSION_MINOR ") - file(STRINGS "${mallocMC_ROOT_DIR}/include/mallocMC/version.hpp" - mallocMC_VERSION_PATCH_HPP REGEX "#define MALLOCMC_VERSION_PATCH ") - string(REGEX MATCH "([0-9]+)" mallocMC_VERSION_MAJOR - ${mallocMC_VERSION_MAJOR_HPP}) - string(REGEX MATCH "([0-9]+)" mallocMC_VERSION_MINOR - ${mallocMC_VERSION_MINOR_HPP}) - string(REGEX MATCH "([0-9]+)" mallocMC_VERSION_PATCH - ${mallocMC_VERSION_PATCH_HPP}) - unset(mallocMC_VERSION_MAJOR_HPP) - unset(mallocMC_VERSION_MINOR_HPP) - unset(mallocMC_VERSION_PATCH_HPP) - - # mallocMC variables ######################################################## - # - set(mallocMC_VERSION "${mallocMC_VERSION_MAJOR}.${mallocMC_VERSION_MINOR}.${mallocMC_VERSION_PATCH}") - unset(mallocMC_VERSION_MAJOR) - unset(mallocMC_VERSION_MINOR) - unset(mallocMC_VERSION_PATCH) - - set(mallocMC_INCLUDE_DIRS ${mallocMC_ROOT_DIR}/include) - -endif(mallocMC_ROOT_DIR) - - -# handles the REQUIRED, QUIET and version-related arguments for find_package ## -# -list(REMOVE_DUPLICATES mallocMC_REQUIRED_VARS_LIST) -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(mallocMC - FOUND_VAR mallocMC_FOUND - REQUIRED_VARS ${mallocMC_REQUIRED_VARS_LIST} - VERSION_VAR mallocMC_VERSION - HANDLE_COMPONENTS -) - -# clean up -# -# unset cached variables in case we did not find a valid install -# (e.g., we only found an outdated version) -if(NOT mallocMC_FOUND) - # default vars - unset(mallocMC_VERSION) - foreach(REQ_VAR ${mallocMC_REQUIRED_VARS_LIST}) - unset(${REQ_VAR}) - unset(${REQ_VAR} CACHE) - endforeach() - -endif() - -# always clean internal required vars list -unset(mallocMC_REQUIRED_VARS_LIST) - From 3f3b6057bc0f304f0e776db40501cb98acb8ca13 Mon Sep 17 00:00:00 2001 From: 
Julian Lenz Date: Tue, 18 Feb 2025 11:02:29 +0100 Subject: [PATCH 4/4] Provide a tag instead of acc for mallocMC ci: full-compile --- include/picongpu/param/mallocMC.param | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/picongpu/param/mallocMC.param b/include/picongpu/param/mallocMC.param index 2e9867666e..36cccebc3e 100644 --- a/include/picongpu/param/mallocMC.param +++ b/include/picongpu/param/mallocMC.param @@ -59,7 +59,7 @@ namespace picongpu * algorithm. */ using DeviceHeap = mallocMC::Allocator< - pmacc::Acc, + alpaka::AccToTag>, mallocMC::CreationPolicies::Scatter, mallocMC::DistributionPolicies::Noop, mallocMC::OOMPolicies::ReturnNull,