diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 00000000000..5893b3c2374
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,12 @@
+## Summary
+
+## Additional background
+
+## Checklist
+
+The proposed changes:
+- [ ] fix a bug or incorrect behavior in AMReX
+- [ ] add new capabilities to AMReX
+- [ ] change answers in the test suite to more than roundoff level
+- [ ] are likely to significantly affect the results of downstream AMReX users
+- [ ] are described in the proposed changes to the AMReX documentation, if appropriate
diff --git a/.github/workflows/dependencies/dependencies_clang6.sh b/.github/workflows/dependencies/dependencies_clang6.sh
new file mode 100755
index 00000000000..19b348b920b
--- /dev/null
+++ b/.github/workflows/dependencies/dependencies_clang6.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#
+# Copyright 2020 The AMReX Community
+#
+# License: BSD-3-Clause-LBNL
+# Authors: Axel Huebl
+
+set -eu -o pipefail
+
+sudo apt-get update
+
+sudo apt-get install -y \
+    build-essential \
+    clang gfortran
diff --git a/.github/workflows/dependencies/dependencies_dpcpp.sh b/.github/workflows/dependencies/dependencies_dpcpp.sh
new file mode 100755
index 00000000000..53f29b45084
--- /dev/null
+++ b/.github/workflows/dependencies/dependencies_dpcpp.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+#
+# Copyright 2020 The AMReX Community
+#
+# License: BSD-3-Clause-LBNL
+# Authors: Axel Huebl
+
+set -eu -o pipefail
+
+# Ref.: https://github.com/rscohn2/oneapi-ci
+# intel-basekit intel-hpckit are too large in size
+wget -q -O - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB \
+    | sudo apt-key add -
+echo "deb https://apt.repos.intel.com/oneapi all main" \
+    | sudo tee /etc/apt/sources.list.d/oneAPI.list
+
+sudo apt-get update
+
+sudo apt-get install -y --no-install-recommends \
+    build-essential \
+    intel-oneapi-dpcpp-cpp-compiler intel-oneapi-mkl-devel \
+    g++ gfortran \
+    libopenmpi-dev \
+    openmpi-bin
diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh
new file mode 100755
index 00000000000..3c120487d69
--- /dev/null
+++ b/.github/workflows/dependencies/dependencies_hip.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# Copyright 2020 The AMReX Community
+#
+# License: BSD-3-Clause-LBNL
+# Authors: Axel Huebl
+
+set -eu -o pipefail
+
+# Ref.: https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#ubuntu
+wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key \
+    | sudo apt-key add -
+echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main' \
+    | sudo tee /etc/apt/sources.list.d/rocm.list
+
+echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin' \
+    | sudo tee -a /etc/profile.d/rocm.sh
+# we should not need to export HIP_PATH=/opt/rocm/hip with those installs
+
+sudo apt-get update
+
+# Ref.: https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#installing-development-packages-for-cross-compilation
+# meta-package: rocm-dkms
+# OpenCL: rocm-opencl
+# other: rocm-dev rocm-utils
+sudo apt-get install -y --no-install-recommends \
+    build-essential \
+    gfortran \
+    libnuma-dev \
+    libopenmpi-dev \
+    openmpi-bin \
+    rocm-dev rocrand
+
+# activate
+#
+source /etc/profile.d/rocm.sh
+hipcc --version
+
+# cmake-easyinstall
+#
+sudo curl -L -o /usr/local/bin/cmake-easyinstall https://git.io/JvLxY
+sudo chmod a+x /usr/local/bin/cmake-easyinstall
+export CEI_SUDO="sudo"
diff --git a/.github/workflows/dependencies/dependencies_mac.sh b/.github/workflows/dependencies/dependencies_mac.sh
index 7fa6878dbb9..6226a4baadd 100755
--- a/.github/workflows/dependencies/dependencies_mac.sh
+++ b/.github/workflows/dependencies/dependencies_mac.sh
@@ -7,6 +7,10 @@
 
 set -eu -o pipefail
 
+brew uninstall openssl@1.0.2t
+brew uninstall python@2.7.17
+brew untap local/openssl
+brew untap local/python2
 brew update
 brew install libomp
 brew install open-mpi
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 26ca812a498..e54fccad944 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -5,8 +5,9 @@ on: [push, pull_request]
 jobs:
   # Build and install libamrex as AMReX CMake project
   library:
-    name: GNU@7.5 C++17 [lib]
+    name: GNU@7.5 C++17 Release [lib]
     runs-on: ubuntu-latest
+    env: {CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wunreachable-code"}
     steps:
     - uses: actions/checkout@v2
     - name: Dependencies
@@ -16,15 +17,44 @@
         mkdir build
         cd build
         cmake .. \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
            -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex \
            -DCMAKE_CXX_STANDARD=17
-        make -j 2 VERBOSE=ON
+        make -j 2
+        make install
+
+  library_clang:
+    name: Clang@6.0 C++14 SP NOMPI Debug [lib]
+    runs-on: ubuntu-latest
+    env: {CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code"}
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies_clang6.sh
+    - name: Build & Install
+      run: |
+        mkdir build
+        cd build
+        cmake .. \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex \
+            -DAMReX_MPI=OFF \
+            -DAMReX_PARTICLES=ON \
+            -DAMReX_PRECISION=SINGLE \
+            -DAMReX_PARTICLES_PRECISION=SINGLE \
+            -DCMAKE_CXX_STANDARD=14 \
+            -DCMAKE_C_COMPILER=$(which clang) \
+            -DCMAKE_CXX_COMPILER=$(which clang++) \
+            -DCMAKE_Fortran_COMPILER=$(which gfortran)
+        make -j 2
         make install
 
   # Build libamrex and all tutorials
   tutorials:
     name: GNU@7.5 C++14 [tutorials]
     runs-on: ubuntu-latest
+    env: {CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wunreachable-code"}
     steps:
     - uses: actions/checkout@v2
     - name: Dependencies
@@ -33,13 +63,18 @@
       run: |
         mkdir build
         cd build
-        cmake .. -DENABLE_TUTORIALS=ON
-        make -j 2 tutorials
+        cmake .. \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_PARTICLES=ON
+        make -j 2
 
   # Build libamrex and all tutorials
   tutorials_cxx20:
     name: GNU@10.1 C++20 [tutorials]
     runs-on: ubuntu-latest
+    env: {CXXFLAGS: "-Werror -Wno-error=deprecated-declarations -Wshadow -Woverloaded-virtual -Wunreachable-code"}
     steps:
     - uses: actions/checkout@v2
     - name: Dependencies
@@ -48,18 +83,22 @@
       run: |
         mkdir build
         cd build
-        cmake .. \
-            -DENABLE_TUTORIALS=ON \
-            -DCMAKE_CXX_STANDARD=20 \
+        cmake .. \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_PARTICLES=ON \
+            -DCMAKE_CXX_STANDARD=20 \
             -DCMAKE_C_COMPILER=$(which gcc-10) \
             -DCMAKE_CXX_COMPILER=$(which g++-10) \
             -DCMAKE_Fortran_COMPILER=$(which gfortran-10)
-        make -j 2 tutorials
+        make -j 2
 
   # Build libamrex and all tutorials w/o MPI
   tutorials-nonmpi:
-    name: GNU@7.5 C++14 non-MPI [tutorials]
+    name: GNU@7.5 C++14 NOMPI [tutorials]
     runs-on: ubuntu-latest
+    env: {CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wunreachable-code"}
     steps:
     - uses: actions/checkout@v2
     - name: Dependencies
@@ -68,13 +107,19 @@
       run: |
         mkdir build
         cd build
-        cmake .. -DENABLE_TUTORIALS=ON -DENABLE_MPI=OFF
-        make -j 2 tutorials
+        cmake .. \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_MPI=OFF \
+            -DAMReX_PARTICLES=ON
+        make -j 2
 
   # Build libamrex and all tutorials
   tutorials-nofortran:
     name: GNU@7.5 C++11 w/o Fortran [tutorials]
     runs-on: ubuntu-latest
+    env: {CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wunreachable-code"}
     steps:
     - uses: actions/checkout@v2
     - name: Dependencies
@@ -83,15 +128,18 @@
       run: |
         mkdir build
         cd build
-        cmake .. \
-            -DENABLE_TUTORIALS=ON \
-            -DENABLE_FORTRAN=OFF \
+        cmake .. \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_PARTICLES=ON \
+            -DAMReX_FORTRAN=OFF \
             -DCMAKE_CXX_STANDARD=11
-        make -j 2 tutorials
+        make -j 2
 
   # Build libamrex and all tutorials with CUDA
   tutorials-cuda:
-    name: CUDA@9.1.85 GNU@4.8.5 C++11 [tutorials]
+    name: CUDA@9.1.85 GNU@4.8.5 C++11 Release [tutorials]
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
@@ -102,11 +150,143 @@
         mkdir build
         cd build
         cmake .. \
-            -DENABLE_TUTORIALS=ON \
-            -DENABLE_CUDA=ON \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_PARTICLES=ON \
+            -DAMReX_GPU_BACKEND=CUDA \
             -DCMAKE_C_COMPILER=$(which gcc-4.8) \
             -DCMAKE_CXX_COMPILER=$(which g++-4.8) \
            -DCMAKE_CUDA_HOST_COMPILER=$(which g++-4.8) \
             -DCMAKE_Fortran_COMPILER=$(which gfortran-4.8) \
-            -DCUDA_ARCH=6.0
-        make -j 2 tutorials
+            -DAMReX_CUDA_ARCH=6.0
+        make -j 2
+
+  tutorials-dpcpp:
+    name: DPCPP@PubBeta GFortran@7.5 C++17 [tutorials]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies_dpcpp.sh
+    - name: Build & Install
+      run: |
+        set +e
+        source /opt/intel/oneapi/setvars.sh
+        set -e
+        mkdir build
+        cd build
+        cmake .. \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_PARTICLES=ON \
+            -DAMReX_GPU_BACKEND=SYCL \
+            -DCMAKE_C_COMPILER=$(which clang) \
+            -DCMAKE_CXX_COMPILER=$(which dpcpp) \
+            -DCMAKE_Fortran_COMPILER=$(which gfortran)
+        make -j 2
+
+  tutorials-hip:
+    name: HIP ROCm@3.8 GFortran@9.3 C++17 [tutorials]
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies_hip.sh
+    - name: Build & Install
+      run: |
+        source /etc/profile.d/rocm.sh
+        hipcc --version
+        mkdir build
+        cd build
+        cmake .. \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_PARTICLES=ON \
+            -DAMReX_FORTRAN=ON \
+            -DAMReX_LINEAR_SOLVERS=ON \
+            -DAMReX_GPU_BACKEND=HIP \
+            -DAMReX_AMD_ARCH=gfx900 \
+            -DCMAKE_C_COMPILER=$(which hipcc) \
+            -DCMAKE_CXX_COMPILER=$(which hipcc) \
+            -DCMAKE_Fortran_COMPILER=$(which gfortran)
+        make -j 2
+
+  # Build 1D libamrex with configure
+  configure-1d:
+    name: GNU@7.5 Release [configure 1D]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies.sh
+    - name: Build & Install
+      run: |
+        ./configure --dim 1
+        make -j2
+        make install
+
+  # Build 2D libamrex with configure
+  configure-2d:
+    name: Clang@6.0 NOMPI Release [configure 2D]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies_clang6.sh
+    - name: Build & Install
+      run: |
+        ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no
+        make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE
+        make install
+
+  # Build 3D libamrex with configure
+  configure-3d:
+    name: GNU@7.5 Release [configure 3D]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies.sh
+    - name: Build & Install
+      run: |
+        ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes
+        make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE
+        make install
+
+  # Build 3D libamrex debug omp build with configure
+  configure-3d-omp-debug:
+    name: GNU@7.5 OMP Debug [configure 3D]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies.sh
+    - name: Build & Install
+      run: |
+        ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --with-omp yes --debug yes
+        make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE
+        make install
+
+  # Build libamrex and run all tests
+  tests:
+    name: GNU@7.5 C++14 [tests]
+    runs-on: ubuntu-latest
+    env: {CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wunreachable-code"}
+    steps:
+    - uses: actions/checkout@v2
+    - name: Dependencies
+      run: .github/workflows/dependencies/dependencies.sh
+    - name: Build & Install
+      run: |
+        mkdir build
+        cd build
+        cmake .. \
+            -DAMReX_OMP=ON \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_ENABLE_TESTS=ON \
+            -DAMReX_PARTICLES=ON
+        make -j 2
+    - name: Run tests
+      run: |
+        cd build
+        ctest --output-on-failure -R
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 71fe768e220..4a37ba1d906 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -2,6 +2,9 @@ name: macos
 
 on: [push, pull_request]
 
+env:
+  CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code"
+
 jobs:
   # Build libamrex and all tutorials
   tutorials-macos:
@@ -15,5 +18,9 @@
       run: |
         mkdir build
         cd build
-        cmake .. -DENABLE_TUTORIALS=ON
-        make -j 2 tutorials
+        cmake .. \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_BUILD_TUTORIALS=ON \
+            -DAMReX_PARTICLES=ON
+        make -j 2
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
new file mode 100644
index 00000000000..f81564346e6
--- /dev/null
+++ b/.github/workflows/windows.yml
@@ -0,0 +1,17 @@
+name: windows
+
+on: [push, pull_request]
+
+jobs:
+  # Build libamrex and all tutorials
+  tutorials:
+    name: MSVC C++17 w/o Fortran w/o MPI
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Build & Install
+      run: |
+        mkdir build
+        cd build
+        cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON -DAMReX_BUILD_TUTORIALS=ON -DAMReX_FORTRAN=OFF -DAMReX_MPI=OFF
+        cmake --build . --config Debug
diff --git a/.gitignore b/.gitignore
index 47f16d489b0..a879e781eeb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ nohup.out
 *.exe
 *.exe.dSYM
 *~
+build/
 tmp_build_dir/
 d/
 f/
diff --git a/CHANGES b/CHANGES
index e7a7b3e4563..4b23a0ba44b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,289 @@
+# 20.11
+
+ -- The default hypre interface in MLMG is now IJ matrix for both EB and
+    non-EB. Previously, it was semi-struct for non-EB. (#1492)
+
+ -- USE_SAVE_TEMPS option in GNU Make system for HIP. (#1492)
+
+ -- The device version of `Error`, `Abort` and `Warning` is a no-op, unless
+    `NDEBUG` is defined. (#1492)
+
+ -- Ascent: Use Default Name for Mesh Ghost (#1488)
+
+ -- Add fill method to GpuArray (#1472)
+
+ -- Add Gpu::Atomic::AddNoRet (#1469)
+
+ -- Option to make the code NOT enforce solvability in MLMG for singular
+    problems. (#1471)
+
+ -- CMake: improve HDF5 support (#1468)
+
+ -- CMake: FindPETSc must not overwrite CMAKE_Fortran_FLAGS (#1464)
+
+ -- CMake: prevent in-source builds (#1453)
+
+ -- change char * to char const* so SWFFT code compiles with USE_CUDA=TRUE
+    (#1454)
+
+ -- Fix when small_volfrac is used -- we need to adjust the data in cut
+    cell neighbors as well as regular neighbors. (#1451)
+
+ -- Allow users to set a Geometry object for ParticleContainer
+    independently of the AmrCore / AmrLevel object. (#1446)
+
+ -- HDF5 plotfile write bug fix and performance improvement (#1448)
+
+ -- CMake: add HIP support (#1316)
+
+ -- NodalProjector: use volume-weighted average down (#1444)
+
+ -- Fix race conditions in EB interpolation from cell centers to faces
+    (#1443)
+
+ -- Allow tagging value to vary by level (#1441)
+
+ -- Hypre with overset (#1439)
+
+ -- Hypre IJ interface: Enable access to additional solvers and
+    preconditioners available in Hypre (#1437)
+
+ -- Fix CPU version of uninitializedFillNImpl in PODVector (#1435)
+
+ -- CMake: Fix missing compile time definitions for HYPRE/PETSc (#1436)
+
+ -- Fix bug in EB extdir slopes (#1434)
+
+ -- Make the id and cpu members of amrex_particle private, as they should
+    no longer be accessed directly. (#1433)
+
+ -- Disable OpenMPI C++ binding in GNU make system (#1398)
+
+ -- Fix a long standing bug in eb levelset (#1432)
+
+# 20.10
+
+ -- CMake: fix ENABLE_PROFPARSER and ENABLE_SENSEI options (#1428)
+
+ -- Remove EB LSCore and levelset, and reimplement signed distance function
+    capability. (#1429 & #1425)
+
+ -- Remove flags setCGVerbose and setCGMaxIter since those are more
+    accurately called setBottomVerbose and setBottomMaxIter. The
+    "setCG..." flags are misleading since they actually apply to non-CG
+    bottom solvers as well. (#1413)
+
+ -- Remove GetPosition and SetPosition from ParIter. These were added for
+    WarpX, but they are not used any more. (#1408)
+
+ -- Only check the requested MPI Threading level if AMReX also initialized
+    MPI. (#1406)
+
+ -- DPCPP support for multiple GPUs (#1392)
+
+ -- Random Number Generation on Device (#1363) In order to support RNG on
+    device with DPC++, we have to change the API because DPC++ does not
+    support global device variables.
+
+ -- Remove redundant particle tests and old/obsolete/non-test tests (#1389
+    & #1381)
+
+ -- Gpu kernel fusing is integrated in MFIter. (#1332)
+
+ -- New runtime parameter, amrex.max_gpu_streams. (#1386)
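
As a minimal sketch of how such a runtime parameter is consumed (our illustration, not code from the PR; the fallback value of 4 below is made up, not AMReX's actual default):

```
#include <AMReX_ParmParse.H>

// Hedged sketch: read amrex.max_gpu_streams through ParmParse.
void read_max_gpu_streams ()
{
    amrex::ParmParse pp("amrex");       // "amrex." namespace prefix
    int max_gpu_streams = 4;            // illustrative fallback only
    pp.query("max_gpu_streams", max_gpu_streams); // inputs file wins if set
}
```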
+
+ -- Switch to `use mpi` from `include 'mpif.h'` (#1385)
+
+ -- Remove reliance on managed memory from
+    AMReX_ParticleCommunication.H/.cpp (#1380)
+
+ -- CMake: re-organize tutorials (#1333)
+
+ -- Implement Particle in a way that does not involve UB. (#1337)
+
+ -- amrex::EB_average_down(): Fix indexing error. (#1360)
+
+ -- Only add one copy of the ghost particle regardless of how many isects
+    we have (#1359)
+
+ -- Add non-EB slopes and extend the generality of EB slope routines in 2D
+    and 3D (#1350)
+
+ -- Async IO: Check Runtime Threading (#1351)
+
+ -- Changes to make AMR codes compiled with EB but not building any
+    Geometry work as if it is all regular. (#1349)
+
+ -- Fix mkconfig.py (#1344)
+
+ -- Add -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored to
+    suppress spurious warnings from nvcc. (#1342)
+
+ -- EB extend_domain_face (#1321)
+
+ -- Add parameter that stores OpenMP support version (#1325)
+
+# 20.09
+
+ -- Fix interpolation coefficients for EB stencil in cell-centered linear
+    solvers when the EB surface at the domain boundary is tilted.
+
+ -- Add a parameter to `AMReX_omp_mod` that can be used by Fortran code to
+    check whether AMReX was built with OpenMP support, and, if yes, what
+    the version number (_OPENMP) was at the time AMReX was compiled.
+
+ -- CMake: re-write genex evaluation functions.
+
+ -- Extend the number of unique particles per cpu we can have at once.
+
+ -- Add a new method to the `amrex_distromap` type in
+    F_Interfaces. `get_pmap` fills a caller-owned array of PEs.
+
+ -- Fix a long standing bug in GNU make system on the use of HOST and
+    HOSTNAME.
+
+ -- Add Scan::InclusiveSum and ExclusiveSum for CPU to avoid ifdef.
+
+ -- Add check for empty probin_file in Amr::restart.
+
+ -- Update Sundials interface, documentation, and build to be version
+    agnostic.
+
+ -- Updates for ROCm 3.6 and 3.7.
+
+ -- Add section to Make.unknown for intel mpi.
+
+ -- CMake: re-factor third party libraries setup.
+
+ -- Port TagBoxArray to GPU.
+
+ -- Fix a bug in matching of COMP_VERSION to correctly treat CCE > 9.
+
+ -- By default, EB outside the domain will be generated by extending from
+    the domain faces.
+
+ -- Fix an issue due to roundoff error in determining whether particles
+    are inside the domain.
+
+ -- Remove Perilla because it's incompatible with our GPU strategy and it's
+    no longer being maintained.
+
+ -- Switch the overset mask convention so that 1 means unknown and 0 means
+    known. The new convention is more convenient for AMR-Wind.
+
+ -- Remove old solvers, Src/LinearSolvers/C_CellMG and C_TensorMG. They
+    have been superseded by Src/LinearSolvers/MLMG.
+
+ -- MacProjector: allow for re-use of the object and enhance multi-level
+    algorithm.
+
+ -- Add asserts for bounds checking to Array1D and Array2D.
+
+ -- Optimization of the construction of SFC.
+
+ -- Refactoring of particle buffer map.
+
+ -- Optimization of FPinfo and complementIn.
+
+ -- Make the Amr/Extrapolater used in IAMR/PeleLM C++/GPU compliant.
+
+ -- Less surprising behavior for the ok() method of FabArray. This makes
+    the `ok()` method of FabArray return `false` instead of crashing if the
+    `define()` method has yet to be called.
+
+ -- Optimization of regrid.
+
+ -- Special FillBoundary for Almo.
+
+ -- Fix the average_down_faces calls in linear solvers to see periodicity.
+
+ -- Make sure the calculation of divu at inflow face in nodal projection
+    does not use tangential velocities on an inflow face.
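
The Array1D/Array2D bounds-checking entry above is easiest to see with a small sketch (ours, not from the PR); `Array2D` takes compile-time lower and upper bounds, so indexing need not start at zero:

```
#include <AMReX_Array.H>

// Hedged sketch: a 3x3 fixed-size array indexed 1..3 in each dimension.
// In a debug build (assertions enabled), an out-of-range access such as
// a(0,0) should now trip the bounds-checking assert mentioned above.
amrex::Real sum_entries ()
{
    amrex::Array2D<amrex::Real, 1, 3, 1, 3> a;
    amrex::Real s = 0.0;
    for (int j = 1; j <= 3; ++j) {
        for (int i = 1; i <= 3; ++i) {
            a(i,j) = static_cast<amrex::Real>(i + 10*j);
            s += a(i,j);
        }
    }
    return s;
}
```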
+
+# 20.08
+
+ -- New average_down_faces functions that take periodic boundary into account.
+
+ -- Update the documentation for Nsight.
+
+ -- Fix a bug with HDF5 boxcenter data.
+
+ -- New runtime parameter, `eb2.extend_domain_face`, that can be used to extrapolate EB information
+    from domain faces.
+
+ -- New GNU Make options for GCC and Clang, `WARN_ALL` and `WARN_ERROR`. `WARN_ALL` turns on more
+    compiler warnings. `WARN_ERROR` turns warnings into errors.
+
+ -- Fix various compiler warnings.
+
+ -- Fix memory fragmentation issue with communication buffer.
+
+ -- Semi-coarsening support in linear solvers.
+
+ -- Fix virtual particle issues.
+
+ -- Capability to fuse GPU kernel launches.
+
+ -- Make ghost particles support SoA data.
+
+ -- CMake: ability to export build tree.
+
+ -- Reset amrex_geometry_module initialization state when finalizing.
+
+ -- Remove Long version of Gatherv.
+
+ -- Optimization of TagBoxArray::collate.
+
+ -- CMake: fix missing PETSc setup in Config file.
+
+ -- BndryFunc for nodal data.
+
+ -- MLMG Hypre fix for singular problems.
+
+ -- Add particle copy filters.
+
+ -- Async output support for particles.
+
+ -- BackgroundThread class.
+
+# 20.07
+
+ -- Nodal hypre solver now supports coarsening before hypre is called.
+
+ -- A new ParmParse parameter, `amrex.the_arena_is_managed`, is introduced to set the default memory
+    type of `The_Arena`.
+
+ -- MFIter is now OMP thread safe when it is compiled with GPU support.
+
+ -- Overset support is added to cell-center and tensor linear solvers.
+
+ -- Remove compile time flag, BACKTRACE, for simplicity.
+
+ -- Updates on Blueprint and Ascent.
+
+ -- HDF5 support in GNU Make and CMake.
+
+ -- CMake support for DPC++.
+
+ -- Add -DNDEBUG for non-debug build.
+
+ -- Support for Windows.
+
+ -- Support for ParallelContext in Particles.
+
+ -- Non const iterator for neighbor list.
+
+ -- Tool for comparing single-level plotfiles of all nodalities.
+
+ -- GNU Make: Use -M instead of the perl script to generate dependencies.
+
+ -- Particle: Remove direct access to the m_idata and m_rdata structs internally.
+
+ -- Extend the makeSFC method to take an additional argument where the user can override the number
+    of processes a BoxArray is distributed to.
+
+ -- Turn tiling off by default for gpu.
+
 # 20.06
 
  -- Set a non-default CMAKE_INSTALL_PREFIX only if AMReX is the
@@ -43,7 +329,7 @@
 
  -- Add CUPTI trace for CUDA kernel timing.
 
- -- Implement ENABLE_FORTRAN (BL_NO_FORT) option in CMake.
+ -- Implement AMReX_FORTRAN (BL_NO_FORT) option in CMake.
 
  -- Fix CUDA build of tools.
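
Two of the additions listed above lend themselves to a short illustration. The following is a hedged sketch (ours, not part of the PR) of `GpuArray::fill` (#1472) and `Gpu::Atomic::AddNoRet` (#1469); exact headers and signatures may differ slightly across AMReX versions:

```
#include <AMReX.H>
#include <AMReX_Array.H>
#include <AMReX_Gpu.H>
#include <AMReX_Print.H>

int main (int argc, char* argv[])
{
    amrex::Initialize(argc, argv);
    {
        // GpuArray::fill: set every element to one value (host or device).
        amrex::GpuArray<amrex::Real,3> v;
        v.fill(1.5);

        // Gpu::Atomic::AddNoRet: atomic add that does not return the old
        // value, which can be cheaper than Gpu::Atomic::Add on some backends.
        amrex::Gpu::DeviceScalar<amrex::Real> sum(0.0);
        amrex::Real* p = sum.dataPtr();
        amrex::ParallelFor(100, [=] AMREX_GPU_DEVICE (int /*i*/)
        {
            amrex::Gpu::Atomic::AddNoRet(p, v[0]); // accumulate 100 * 1.5
        });
        amrex::Print() << "sum = " << sum.dataValue() << "\n"; // expect 150
    }
    amrex::Finalize();
}
```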
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 43b8020b660..8ffb19eb045 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,49 +1,28 @@
 cmake_minimum_required(VERSION 3.14)
 
-########################################################################
 #
-# Set variables for AMReX versioning
+# Prevent in-source builds
 #
-########################################################################
-find_package (Git QUIET)
-
-set( _tmp "" )
-
-# Try to inquire software version from git
-if ( EXISTS ${CMAKE_CURRENT_LIST_DIR}/.git AND ${GIT_FOUND} )
-   execute_process ( COMMAND git describe --abbrev=12 --dirty --always --tags
-      WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
-      OUTPUT_VARIABLE _tmp )
-   string( STRIP ${_tmp} _tmp )
-   # filter invalid descriptions in shallow git clones
-   if (NOT _tmp MATCHES "^([0-9]+)\\.([0-9]+)(\\.([0-9]+))*(-.*)*$")
-      set( _tmp "")
-   endif ()
-endif()
-
-# Grep first line from file CHANGES if cannot find version from Git
-if (NOT _tmp)
-   file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/CHANGES ALL_VERSIONS REGEX "#")
-   list(GET ALL_VERSIONS 0 _tmp)
-   string(REPLACE "#" "" _tmp "${_tmp}")
-   string(STRIP "${_tmp}" _tmp )
-   set(_tmp "${_tmp}.0")
+if (CMAKE_BINARY_DIR STREQUAL CMAKE_SOURCE_DIR)
+   message(FATAL_ERROR
+      "\nin-source builds are not allowed: "
+      "build directory cannot be in the source directory path!\n"
+      "You MUST remove the file ${CMAKE_BINARY_DIR}/CMakeCache.txt and "
+      " the directory ${CMAKE_BINARY_DIR}/CMakeFiles/ to be able to build again.")
 endif ()
 
-set( AMREX_GIT_VERSION "${_tmp}" CACHE INTERNAL "" )
-unset(_tmp)
 
-# Package version is a modified form of AMREX_GIT_VERSION
-if (AMREX_GIT_VERSION)
-   string(FIND "${AMREX_GIT_VERSION}" "-" _idx REVERSE)
-   string(SUBSTRING "${AMREX_GIT_VERSION}" 0 "${_idx}" _pkg_version )
-   string(FIND "${_pkg_version}" "-" _idx REVERSE)
-   string(SUBSTRING "${_pkg_version}" 0 "${_idx}" _pkg_version )
-   string(REPLACE "-" "." _pkg_version "${_pkg_version}")
-endif ()
+#
+# Set search path for AMReX-specific CMake modules
+#
+set( AMREX_CMAKE_MODULES_PATH "${CMAKE_CURRENT_LIST_DIR}/Tools/CMake" CACHE INTERNAL "" )
+set( CMAKE_MODULE_PATH ${AMREX_CMAKE_MODULES_PATH} )
 
-set( AMREX_PKG_VERSION "${_pkg_version}" CACHE INTERNAL "" )
-unset(_pkg_version)
+#
+# Retrieve amrex version
+#
+include( AMReX_Utils )
+get_amrex_version()
 
 ########################################################################
@@ -60,11 +39,6 @@ project( AMReX
 
 message(STATUS "CMake version: ${CMAKE_VERSION}")
 
-#
-# Load required modules
-#
-set( AMREX_CMAKE_MODULES_PATH "${CMAKE_CURRENT_LIST_DIR}/Tools/CMake" CACHE INTERNAL "" )
-set( CMAKE_MODULE_PATH ${AMREX_CMAKE_MODULES_PATH} )
 
 #
 # Provide a default install directory
@@ -74,28 +48,52 @@ if ( CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR AND CMAKE_INSTALL_PREFIX_INITI
       CACHE PATH "AMReX installation directory" FORCE)
 endif ()
 
+message(STATUS "AMReX installation directory: ${CMAKE_INSTALL_PREFIX}")
+
+#
+# Check if CMAKE_BUILD_TYPE is given. If not, use default
+#
+if ( NOT CMAKE_BUILD_TYPE )
+   set(CMAKE_CONFIGURATION_TYPES "Release;Debug;MinSizeRel;RelWithDebInfo")
+   set(CMAKE_BUILD_TYPE Release
+       CACHE STRING
+       "Choose the build type, e.g. Release, Debug, or RelWithDebInfo." FORCE)
+else ()
+   message(STATUS "Build type set by user to '${CMAKE_BUILD_TYPE}'.")
+endif()
+
 #
 # Include options, utilities and other stuff we need
 #
-include( AMReX_Utils )
-include( AMReX_Options )
-include( AMReX_Machines )
+include( AMReXOptions )
 
 #
 # Enable Fortran if requested
 #
-if(ENABLE_FORTRAN)
+if(AMReX_FORTRAN)
    enable_language(Fortran)
 endif ()
 
 #
 # Enable CUDA if requested
 #
-if (ENABLE_CUDA)
+if (AMReX_CUDA)
+   # CMake 3.18+: CMAKE_CUDA_ARCHITECTURES
+   # https://cmake.org/cmake/help/latest/policy/CMP0104.html
+   if(POLICY CMP0104)
+      cmake_policy(SET CMP0104 OLD)
+   endif()
+
    enable_language(CUDA)
    include(AMReX_SetupCUDA)
 endif ()
 
+#
+# Check compiler version
+#
+set_mininum_cxx_compiler_version(GNU 4.8)
+set_mininum_cxx_compiler_version(MSVC 19.23)
+
 #
 # Set CMAKE_<LANG>_FLAGS_<CONFIG> if not already defined
 #
@@ -109,19 +107,30 @@ add_subdirectory(Src)
 
 #
 # Tutorials and "test_install" target
 #
-option(ENABLE_TUTORIALS "Enable Tutorials" NO)
+option(AMReX_BUILD_TUTORIALS "Build tutorials" NO)
 
-if (ENABLE_TUTORIALS)
+if (AMReX_BUILD_TUTORIALS)
+   message(STATUS "Enabling Tutorials")
    add_subdirectory(Tutorials)
 endif ()
 
 #
 # Plotfile tools
 #
-option(ENABLE_PLOTFILE_TOOLS "Enable Plotfile tools" NO)
+option(AMReX_PLOTFILE_TOOLS "Enable Plotfile tools" NO)
 
-if (ENABLE_PLOTFILE_TOOLS)
+if (AMReX_PLOTFILE_TOOLS)
    # If this get executed, it cannot be EXCLUDED_FROM_ALL
    # because it needs to get installed
    add_subdirectory(Tools/Plotfile)
 endif ()
+
+
+#
+# Enable CTests
+#
+option(AMReX_ENABLE_TESTS "Enable CTest suite for AMReX" NO)
+if (AMReX_ENABLE_TESTS)
+   enable_testing()
+   add_subdirectory(Tests)
+endif ()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 399f577efb9..93eb0a64337 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -8,9 +8,6 @@ Development generally follows the following ideas:
 
    Nightly regression testing is used to ensure that no answers
    change (or if they do, that the changes were expected).
 
-   If a change is critical, we can cherry-pick the commit from
-   `development` to `master`.
-
 * Bug fixes, questions and contributions of new features are welcome!
 
 * Bugs should be reported through GitHub issues
@@ -33,15 +30,10 @@ Development generally follows the following ideas:
   distribute, and sublicense such enhancements or derivative works
   thereof, in binary and source code form.
 
- * On the first workday of each month, we perform a merge of
-   `development` into `master`.  For this merge to take place, we
-   need to be passing the regression tests.
-
-   To accommodate this need, we close the merge window into
-   `development` a few days before the merge day.  While the merge
-   window is closed, only bug fixes should be pushed into
-   `development`.  Once the merge from `development` -> `master` is
-   done, the merge window reopens.
+ * On the first workday of each month, we make a tagged release.  The merge window into
+   `development` is closed a few days before the release day.  While the merge window is closed,
+   only bug fixes should be merged into `development`.  Once the release is done, the merge window
+   reopens.
 
 ## Git workflow
 
@@ -79,9 +71,8 @@ git remote add upstream https://github.com/AMReX-Codes/amrex
 git remote set-url --push upstream https://github.com/<myGithubUsername>/amrex.git
 git fetch upstream
 
-# We recommend setting your development and master branches to track the upstream ones instead of your fork:
+# We recommend setting your development branch to track the upstream one instead of your fork:
 git branch -u upstream/development
-git checkout -t -b master upstream/master
 ```
 
 Now you are free to play with your fork (for additional information, you can visit the
 [Github fork help page](https://help.github.com/en/articles/fork-a-repo)).
@@ -91,7 +82,7 @@ Now you are free to play with your fork (for additional information, you can vis
 > on your fork with
 > ```
 > git checkout development
-> git pull development
+> git pull
 > ```
 
 Make sure you are on the `development` branch with
@@ -177,6 +168,57 @@ and you can delete the remote one on your fork with
 git push origin --delete <branch_name>
 ```
 
+Generally speaking, you want to follow these rules.
+
+  * Do not merge your branch for PR into your local `development` branch that tracks AMReX
+    `development` branch.  Otherwise your local `development` branch will diverge from AMReX
+    `development` branch.
+
+  * Do not commit in your `development` branch that tracks AMReX `development` branch.
+
+  * Always create a new branch based off `development` branch for each pull request, unless you are
+    going to use git to fix it later.
+
+If you have accidentally committed in `development` branch, you can fix it as follows,
+```
+git checkout -b new_branch
+git checkout development
+git reset HEAD~2  # Here 2 is the number of commits you have accidentally committed in development
+git checkout .
+```
+After this, the local `development` should be in sync with AMReX `development` and your recent
+commits have been saved in `new_branch` branch.
+
+If for some reason your PR branch has diverged from AMReX, you can try to fix it as follows.  Before
+you try it, you should back up your code in case things go wrong.
+```
+git fetch upstream  # assuming upstream is the remote name for the official amrex repo
+git checkout -b xxx upstream/development  # replace xxx with whatever name you like
+git branch -D development
+git checkout -b development upstream/development
+git checkout xxx
+git merge yyy  # here yyy is your PR branch with unclean history
+git rebase -i upstream/development
+```
+You will see something like below in your editor,
+```
+pick 7451d9d commit message a
+pick c4c2459 commit message b
+pick 6fj3g90 commit message c
+```
+This now requires a bit of knowledge of what those commits are: which commits have been merged, and
+which commits are actually new.  However, you should only see your own commits, so it should be
+easy to figure out which commits have already been merged.  Assuming the first two commits have been
+merged, you can drop them by replacing `pick` with `drop`,
+```
+drop 7451d9d commit message a
+drop c4c2459 commit message b
+pick 6fj3g90 commit message c
+```
+After saving and then exiting the editor, `git log` should show a clean history on top of the
+`development` branch.  You can also do `git diff yyy..xxx` to make sure nothing new was dropped.  If
+all goes well, you can submit a PR using `xxx` branch.
+Don't worry: if something goes wrong during the rebase, you can always `git rebase --abort` and start over.
 
 ## Core Developers
 
 People who make a number of substantive contributions will be named
diff --git a/Docs/Doxygen/doxygen.conf b/Docs/Doxygen/doxygen.conf
index 1762f9005d3..93dc968056e 100644
--- a/Docs/Doxygen/doxygen.conf
+++ b/Docs/Doxygen/doxygen.conf
@@ -848,7 +848,7 @@ RECURSIVE = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE = ../../Src/F_Interfaces ../../Src/AmrTask
+EXCLUDE = ../../Src/F_Interfaces
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -1298,7 +1298,7 @@ CHM_FILE =
 HHC_LOCATION =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
diff --git a/Docs/Notes/DPCPPWishlist.md b/Docs/Notes/DPCPPWishlist.md
index 0c26f5712e8..fdaf1806a08 100644
--- a/Docs/Notes/DPCPPWishlist.md
+++ b/Docs/Notes/DPCPPWishlist.md
@@ -1,97 +1,120 @@
 # Critical
 
-* Global variables. Could DPC++ support global variables and add
+* [Feature Request] Global variables. Could DPC++ support global variables and add
   something similar to cudaMemcpyToSymbol?
   [oneAPI-spec issue #125](https://github.com/oneapi-src/oneAPI-spec/issues/125)
 
-* Device API for random number generator. Currently we can only use
+* [Feature Request] Device API for random number generator. Currently we can only use
   oneMKL's host API to generate random numbers.
-  [Intel oneAPI Base Toolkit Forum](https://software.intel.com/en-us/forums/intel-oneapi-base-toolkit/topic/856436)
+  [Intel oneAPI Base Toolkit Forum](https://software.intel.com/en-us/forums/intel-oneapi-base-toolkit/topic/856436), [oneAPI-spec issue #139](https://github.com/oneapi-src/oneAPI-spec/issues/139)
 
-* Recursive function call on device. This is very important for ECP
+* [Feature Request] Recursive function call on device. This is very important for ECP
   WarpX code.
   [oneAPI-spec issue #123](https://github.com/oneapi-src/oneAPI-spec/issues/123)
+  A test code is available at https://github.com/WeiqunZhang/dpcpp/tree/main/recursive
 
-* Memory fence. Could DPC++ privode a memory fence function for the
+* [Feature Request] Memory fence. Could DPC++ provide a memory fence function for the
   whole device (not just group)? Or is the CUDA distinction between
   `__threadfence` and `__thread_block` unnecessary for Intel GPUs?
   [oneAPI-spec issue #130](https://github.com/oneapi-src/oneAPI-spec/issues/130)
 
+  This has been partially resolved. SYCL 2020 has introduced `memory_scope::device` ordering.
+
-* The compiler has some troubles with some very big device functions
+* [Bug] The compiler has trouble with some very big device functions
   (e.g., `mlndlap_stencil_rap` in
   `Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H`). It hangs at JIT
-  compilation. We have to disable GPU launch for these functions and
-  run them on CPU.
+  compilation. We have to disable GPU launch for these functions and
+  run them on CPU.
+
+  This can be reproduced by the test code at
+  https://github.com/AMReX-Codes/amrex/blob/development/Tests/LinearSolvers/NodeEB/
+
+  ```
+  make -j8 USE_DPCPP=TRUE XTRA_CPPFLAGS=-DAMREX_DPCPP_STENCIL_RAP_ON_GPU
+  ./main3d.dpcpp.TEST.ex inputs.rt.3d.y
+  ```
+
+  If this is compiled without `XTRA_CPPFLAGS=-DAMREX_DPCPP_STENCIL_RAP_ON_GPU`, it runs fine by
+  putting function `mlndlap_stencil_rap` on CPU.
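
As a side note on the memory-fence item above: the `memory_scope::device` ordering that SYCL 2020 introduced can be expressed as in the sketch below. This is our illustrative code, not code from the wishlist, and the kernel body is deliberately trivial:

```
#include <CL/sycl.hpp>

int main ()
{
    sycl::queue q;
    q.parallel_for(sycl::range<1>(64), [=] (sycl::id<1>)
    {
        // Device-wide ordering point (SYCL 2020), roughly analogous to
        // CUDA's __threadfence as discussed above.
        sycl::atomic_fence(sycl::memory_order::acq_rel,
                           sycl::memory_scope::device);
    }).wait();
    return 0;
}
```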
 
 # Major
 
-* The maximum size of kernel parameter is 1KB on current Intel GPUs.
+* [Feature Request] The maximum size of kernel parameter is 1KB on current Intel GPUs.
   This is not sufficient for many of our kernels.
 
-* Sometimes the JIT compilation will raise floating-point exception at
+* [Bug] Sometimes the JIT compilation will raise a floating-point exception at
   runtime. This forces us to disable floating-point exception signal
-  handling that we often rely on for debugging.
+  handling that we often rely on for debugging. A bug reproducer is
+  available at https://github.com/WeiqunZhang/dpcpp/tree/main/jitfpe
 
-* Option to be less OOP. Could we have access to thread id, group id,
+* [Feature Request] Option to be less OOP. Could we have access to thread id, group id,
   memory fence, barrier functions, etc. without using an nd_item like
   object?
   [oneAPI-spec issue #118](https://github.com/oneapi-src/oneAPI-spec/issues/118)
 
-* Local memory. Could DPC++ support static local memory
+* [Feature Request] Local memory. Could DPC++ support static local memory
   (e.g. something like CUDA `__shared__ a[256]`) and dynamic local
   memory (e.g., something like CUDA `extern __shared__ a[]` with the
   amount of memory specified at runtime during kernel launch) from
   anywhere in device code?
   [oneAPI-spec issue #126](https://github.com/oneapi-src/oneAPI-spec/issues/126)
 
+* [Feature Request] DPC++ does not work with ccache.
+  [intel/llvm issue #1797](https://github.com/intel/llvm/issues/1797)
 
-# Minor
-
-* Compiler flag to make implicit capture of this pointer via `[=]` an
-  error. [Implicit capture of this pointer](http://eel.is/c++draft/depr#capture.this)
-  has been deprecated in C++ 20. For many codes, it's almost always a
-  bug when `this` is implicitly captured onto device via `[=]`.
-  [oneAPI-spec issue #127](https://github.com/oneapi-src/oneAPI-spec/issues/127)
+# Minor
 
-* Host callback. Could DPC++ support appending a host callback
+* [Feature Request] Host callback. Could DPC++ support appending a host callback
   function to an ordered queue?
   [oneAPI-spec issue #124](https://github.com/oneapi-src/oneAPI-spec/issues/124)
 
-* Subgroup size. Querying `sycl::info::device::sub_group_size` gives
-  several numbers. For example, we get 8, 16 and 32 for Gen9. We
-  would like to specify the sub group size and this feature is
-  supported. All three sizes seem to work except that subgroup
-  primitives such as `shuffle_down` do not work for all sizes. By try
-  and error, we have found that shuffle_down works for 16. Could
-  oneAPI provide a query function for returning the "primitive"
-  subgroup size?
-  [oneAPI-spec issue #118](https://github.com/oneapi-src/oneAPI-spec/issues/118)
+* [Bug] Subgroup size. Querying `sycl::info::device::sub_group_size` gives several numbers. For
+  example, we get 8, 16 and 32 for Gen9. We would like to specify the sub group size and this
+  feature is supported. All three sizes seem to work except that subgroup primitives such as
+  `shuffle_down` do not work with a size of 32. Could oneAPI provide a query function for returning
+  the "primitive" subgroup size? [oneAPI-spec issue #118](https://github.com/oneapi-src/oneAPI-spec/issues/118)
+  A bug reproducer is available at https://github.com/WeiqunZhang/dpcpp/tree/main/subgroupsize
 
-* `assert(0)`. `assert(0)` when called on device does not throw any
+* [Feature Request] `assert(0)`. `assert(0)` when called on device does not throw any
   errors or abort the run. Is it possible to make it abort or return
   an error code that can be checked on the host?
  In CUDA, the users can check an error code.
   [oneAPI-spec issue #128](https://github.com/oneapi-src/oneAPI-spec/issues/128)
 
-* `sycl::abs`. `sycl::abs(int)` returns an `unsigned int` in contrast to
+* [Defect] `sycl::abs`. `sycl::abs(int)` returns an `unsigned int` in contrast to
   `int std::abs(int)`. Currently `std::abs` does not work on device.
   If `std::abs` is made to work on device, could it return the same
   type as the C++ standard?
   [oneAPI-spec issue #129](https://github.com/oneapi-src/oneAPI-spec/issues/129)
 
+* [Defect] When `dpcpp -M` is used to generate dependencies for make files, the output is saved in a
+  file.  But for most if not all other compilers (including the Intel C/C++ compiler), the output
+  appears on stdout.  If there are no particular reasons for this, could the DPC++ compiler change
+  the behavior to match the other compilers?  It would simplify our make build system.  (This has
+  been reported to Intel via Intel Premier Support.)
+
 # Resolved
 
-* ~~Classes that are not standard layout. The current specification of
+* [Feature Request] Classes that are not standard layout. The current specification of
   oneAPI does not support the capture of objects that are not standard
-  layout. This includes the following example,~~
+  layout. This includes the following example,
 
   ```
   class A {int a;};
   class B {long B;};
   class C : A, B {};
   ```
 
-  ~~AMReX has a data structure called GpuTuple that is built with a
+  AMReX has a data structure called GpuTuple that is built with a
   pattern like the example shown above. It works in CUDA, but not in
-  DPC++. We wish this requirement can be relaxed.~~
+  DPC++. We wish this requirement could be relaxed. This restriction has been relaxed since beta5.
+
+* [Feature Request] Compiler flag to make implicit capture of this pointer via `[=]` an
+  error. [Implicit capture of this pointer](http://eel.is/c++draft/depr#capture.this)
+  has been deprecated in C++ 20. For many codes, it's almost always a
+  bug when `this` is implicitly captured onto device via `[=]`.
+  [oneAPI-spec issue #127](https://github.com/oneapi-src/oneAPI-spec/issues/127)
+
+  This has been implemented in the intel/llvm github repo.
diff --git a/Docs/Notes/Readme.backtrace b/Docs/Notes/Readme.backtrace
deleted file mode 100644
index 478cc5f69cb..00000000000
--- a/Docs/Notes/Readme.backtrace
+++ /dev/null
@@ -1,45 +0,0 @@
-To help debugging, AMReX handles various signals in the C standard
-library raised in the runs.  This gives us a chance to print out more
-information using Linux/Unix backtrace capability.  The signals
-include seg fault, interruption by the user (control-c), assertion
-errors, and floating point exceptions (NaNs, divided by zero and
-overflow).  The handling of seg fault, assertion errors and
-interruption by control-C are enabled by default.  (Note that in C++
-AMReX, AMREX_ASSERT() is only on when compiled with DEBUG=TRUE or
-USE_ASSERTION=TRUE.)  The trapping of floating point exceptions is not
-enabled by default unless the code is compiled with TEST=TRUE or
-DEBUG=TRUE in C++ AMReX, or TEST=t or NDEBUG= in Fortran AMReX.  For
-C++ AMReX codes, one can also use runtime parameters to control the
-handling of floating point exceptions: amrex.fpe_trap_invalid for
-NaNs, amrex.fpe_trap_zero for division by zero and
-amrex.fpe_trap_overflow for overflow.
-To more effectively trap the
-use of uninitialized values, AMReX also initializes MulitFabs and
-arrays allocated by bl_allocate to signaling NaNs when it is compiled
-with TEST=TRUE or DEBUG=TRUE in C++ AMReX, or TEST=t or NDEBUG= in
-Fortran AMReX.  In C++ AMReX, one can also control the setting using
-the runtime parameter, fab.init_snan.
-
-If it is compiled with BACKTRACE=TRUE, one can get more information
-than the backtrace of the call stack by instrumenting the code.  (This
-is in C++ code only.)  Here is an example.  You know the line "Real
-rho = state(cell,0);" is causing a segfault.  You could add a print
-statement before that.  But it might print out thousands (or even
-millions) of line before it hits the segfault.  With BACKTRACE, you
-could do
-
-    #include <AMReX_BLBackTrace.H>
-
-    #ifdef AMREX_BACKTRACING
-        std::ostringstream ss;
-        ss << "state.box() = " << state.box() << " cell = " << cell;
-        BL_BACKTRACE_PUSH(ss.str()); // PUSH takes std::string
-    #endif
-        Real rho = state(cell,0);  // state is a Fab, and cell is an IntVect.
-    #ifdef AMREX_BACKTRACING
-        BL_BACKTRACE_POP(); // One can omit this line.  In that case,
-                            // there is an implicit POP when "PUSH" is
-                            // out of scope.
-    #endif
-
-When it hits the segfault, you will only see the last pint out.
-
diff --git a/Docs/sphinx_documentation/make_api.py b/Docs/sphinx_documentation/make_api.py
index bd3b36da5ec..27362696127 100644
--- a/Docs/sphinx_documentation/make_api.py
+++ b/Docs/sphinx_documentation/make_api.py
@@ -32,8 +32,8 @@ def generate_filelist(rootdir, outfile, output_data, subdir_prefix=""):
             # found a subdirectory - create a new _files.rst file and call
             # generate_filelist on the subdir
 
-            # ignore AmrTask and F_Interfaces
-            if subdir.lower() in ['amrtask', 'f_interfaces']:
+            # ignore F_Interfaces
+            if subdir.lower() in ['f_interfaces']:
                 continue
 
             output_data += """{}_files
diff --git a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst
index 1e0c9cdf618..a61ce2464c4 100644
--- a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst
+++ b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst
@@ -26,10 +26,10 @@ in your GNUMakefile. If using cmake then set the following cmake flags
 
 ::
 
-    AMREX_ENABLE_TINY_PROFILE = ON
-    AMREX_ENABLE_BASE_PROFILE = OFF
+    AMReX_TINY_PROFILE = ON
+    AMReX_BASE_PROFILE = OFF
 
-Note that if you set ``PROFILE = TRUE`` (or ``AMREX_ENABLE_BASE_PROFILE =
+Note that if you set ``PROFILE = TRUE`` (or ``AMReX_BASE_PROFILE =
 ON``) then this will override the ``TINY_PROFILE`` flag and tiny profiling
 will be disabled.
 
@@ -245,4 +245,3 @@ parser itself. It has been integrated into Amrvis for visual interpretation
 of the data allowing Amrvis to open the bl_prof database like a plotfile but
 with interfaces appropriate to profiling data. AMRProfParser and Amrvis can
 be run in parallel both interactively and in batch mode.
-
diff --git a/Docs/sphinx_documentation/source/AmrCore.rst b/Docs/sphinx_documentation/source/AmrCore.rst
index 8bcd30e750e..53326901262 100644
--- a/Docs/sphinx_documentation/source/AmrCore.rst
+++ b/Docs/sphinx_documentation/source/AmrCore.rst
@@ -41,7 +41,7 @@ AmrCore Source Code: Details
 ============================
 
-Here we provide more information about the source code in ``amrex/Src/AmrCore.``
+Here we provide more information about the source code in ``amrex/Src/AmrCore``.
 
 AmrMesh and AmrCore
 -------------------
@@ -219,7 +219,7 @@ of using :cpp:`MultiFab::FillBoundary` and :cpp:`FillDomainBoundary()`.
 A :cpp:`FillPatchUtil` uses an :cpp:`Interpolater`. This is largely hidden
 from application codes. AMReX_Interpolater.cpp/H contains the virtual base
 class :cpp:`Interpolater`, which provides an interface for coarse-to-fine
 spatial interpolation operators. The fillpatch routines described
-above require an Interpolater for FillPatchTwoLevels()
+above require an Interpolater for FillPatchTwoLevels().
 
 Within AMReX_Interpolater.cpp/H are the derived classes:
 
 - :cpp:`NodeBilinear`
@@ -449,7 +449,7 @@ Here is a high-level pseudo-code of the flow of the program:
          AmrCoreAdv::MakeNewLevelFromScratch()
             /* allocate phi_old, phi_new, t_new, and flux registers */
             initdata()  // fill phi
-   } (while (finest_level < max_level);
+   } while (finest_level < max_level);
    }
    amr_core_adv.Evolve()
       loop over time steps {
diff --git a/Docs/sphinx_documentation/source/AmrLevel.rst b/Docs/sphinx_documentation/source/AmrLevel.rst
index 85dbcbf4404..de4180d25e3 100644
--- a/Docs/sphinx_documentation/source/AmrLevel.rst
+++ b/Docs/sphinx_documentation/source/AmrLevel.rst
@@ -125,7 +125,7 @@ cells, number of components, and the interlevel interpolation
 (See AMReX_Interpolater for various interpolation types. We also see how to
 specify physical boundary functions by providing a function (in this case,
 :cpp:`nullfill` since we are not using physical boundary conditions), where
-:cpp:`nullfill` is defined in a fortran routine in the tutorial source code.
+:cpp:`nullfill` is defined in a Fortran routine in the tutorial source code.
 
 Example: Advection_AmrLevel
 ===========================
diff --git a/Docs/sphinx_documentation/source/AsyncIter.rst b/Docs/sphinx_documentation/source/AsyncIter.rst
deleted file mode 100644
index 3742d629088..00000000000
--- a/Docs/sphinx_documentation/source/AsyncIter.rst
+++ /dev/null
@@ -1,95 +0,0 @@
-FillPatch Iterator
-==================
-
-FillPatch is an important operation commonly used in AMReX applications.
-This operation interpolates data in both space and time.
-Communication between AMR levels may incur when FillPatch interpolates data from a coarse AMR level and stores the result on the next finer level.
-This operation also results in communication within the same AMR level when the subcycling option is used, which requires data interpolation in time.
-
-We develop an asynchronous version of the FillPatch operation, called Asynchronous FillPatch Iterator.
-Each iterator takes care of the communication with the previous and next subcycles at the same AMR level (time) and between the current and the next finer AMR levels (space and time).
-The iterator first automatically prepares temporary data needed for these communication activities and the data connections (aka data paths or data dependencies) among them.
-
-Based on this setup, the programmer can design numerical solvers.
-This work is fairly simple.
-At a certain simulation time on an AMR level, the programmer can ask the runtime system which FABs have received sufficient data for advancing to the next time step.
-Although the FillPatch operation can be handled independently by the communication handler of the runtime system, this operation requires some computations such as packing/unpacking and extrapolation.
-The programmer has the freedom to dedicate a few threads from the pool of worker threads to parallelize those computations.
-This design choice may help the runtime process FillPatch operations faster, but may slow down the main computation.
-Thus, our advise to the programmer on using how many threads for the FillPatch is that it depends on the compute intensity of the actual workload.
-If the simulation is memory-bandwidth or network-bandwidth bound, the programmer can get the benefit from sparing more threads for doing FillPatch.
-
-RegionGraph Iterator
-====================
-
-We can simplify the programming work further with a new abstraction called RegionGraph Iterator a.k.a RGIter.
-This abstraction is a for loop (see the following code snippet), which can hide details of the asynchronous FillPatch Iterator in the init part and the graph traversal in the ++ operator.
-The only job required from the programmer is to specify the computations on the data, and they can easily place these computations in the loop body.
-
-.. highlight:: c++
-
-::
-
-    for (RGIter rgi(afpi_vec, upper_afpi_vec, ...); rgi.isValid(); ++rgi){
-        int f = rgi.currentRegion;
-        ...//computation on FAB f
-    }
-
-The execution of RGIter is as follows.
-Initially, an object of RGIter (i.e. rgi) is instantiated, taking vectors of FillPatch Iterators on the current and upper AMR levels as arguments (each element of the vector corresponds to a specific time).
-Based on these arguments, a task dependency graph spanning two AMR levels will be established.
-Next, isValid() asks the runtime system for FABs that have received all dependent data.
-When there is such a FAB, the computations in the loop body can execute on the FAB's data.
-When the computations on a FAB finish, the ++ operator is called.
-We overload this operator to traverse to the next runnable FAB.
-
-Note: RGIter also supports data tiling.
-Specifically, we overload the ++ operator so that it will traverse data tiles in a FAB before it goes to next FAB if the tiling flag in the FAB is enabled.
-Instead of applying the computations in the loop body on the entire FAB, it executes them on a single tile at a time.
-
-
-Generated Task Graph Code
-=========================
-
-The real input to the runtime system is an AMR program containing task dependency graphs (or task graph for short).
-Thus, the code written with the above asynchronous iterators will be transformed into a task graph form.
-The definition of a task dependency graph is as follows.
-Each task of a graph performs some computations on an FArrayBox (FAB).
-Tasks are connected with each other via edges, denoting the dependency on data.
-A task can be executed when all data dependencies have been satisfied.
-The code snippet below queries runnable tasks of a task dependency graph named regionGraph.
-Note that each task dependency graph is more or less a wrapper of a MultiFab.
-In this example, a task of regionGraph computes the body code of the while loop to update the associated FAB.
-Each task of this graph receives data arrived at the runtime system and injects the data into the associated FAB.
-After updating FAB, it lets the runtime know about the change.
-The runtime system uses AMR domain knowledge to establish data dependencies among tasks, and thus it can answer which tasks are runnable and how to update neighbor FABs when a current FAB changes.
-
-.. highlight:: c++
-
-::
-
-    while(!regionGraph->isGraphEmpty())
-    {
-        f = regionGraph->getAnyFireableRegion();
-        multifabCopyPull(..., f, ...); //inject arrived dependent data into the fab, if any
-        syncWorkerThreads();
-        ...//compute on the fab f of multifab associated with coarseRegionGraph
-        syncWorkerThreads();
-        multifabCopyPush(..., f, ...); //tell the runtime that data of Fab f changed
-        regionGraph->finalizeRegion(f)
-    }
-
-The process of learning the domain knowledge is as follows.
-At the beginning of the program, the runtime extracts the metadata needed for establishing data dependencies among tasks of the same graph or between two different graphs.
-Every time the AMR grid hierarchy changes (i.e. when a few or all AMR levels regrid), the runtime re-extracts the metadata to correct the task dependency graphs.
-Once the metadata extraction completes, the runtime system invokes the computation on AMR levels (e.g., timeStep, initTimeStep, and postTimeStep).
-
-Known Limitations
-=================
-
-To realize enough task parallelism, the runtime system constructs a task dependency graph for the whole coarse time step and executes it asynchronously to the completion of the step.
-As a result, any request to regrid an AMR level must be foreseen before the execution of a coarse time step.
-If there is a regridding request during the graph execution, the runtime system simply ignores it.
-In the future we may relax this constraint in the programming model.
-However, such a support would come at a significant performance cost due to the required checkpointing and rollback activities.
-
diff --git a/Docs/sphinx_documentation/source/AsyncIter_Chapter.rst b/Docs/sphinx_documentation/source/AsyncIter_Chapter.rst
deleted file mode 100644
index 446709442ef..00000000000
--- a/Docs/sphinx_documentation/source/AsyncIter_Chapter.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-.. _Chap:AsyncIter:
-
-Asynchronous Iterators (AmrTask)
-================================
-
-Hiding communication overheads via overlapping communication with computation requires a sufficiently large amount of task parallelism.
-This problem is even more challenging due to various types of tasks in an AMReX program, including data parallel tasks (same workload on different data partitions) and control parallel tasks (different types of workload).
-This chapter introduces the API of AMReX's asynchronous iterators that can facilitate the job of identifying tasks in the applications.
-We have developed two iterators called FillPatch and RegionGraph Iterators, which will be described later on in this chapter.
-We first show how the programmer can use a runtime system to execute application codes written with these iterators.
-
-In ``amrex/Src/AmrTask/rts_impls``, we implement RTS - a runtime system that can execute asynchronous AMReX applications efficiently on large-scale systems.
-RTS is a black box to the application developer as showed in the following code snippet, which is the main function of a typical AMReX application running asynchronously under the control of the runtime system.
-The programmer first needs to use the namespace ``perilla``, which covers all the C++ classes for the runtime system.
-To execute an AMR program (i.e. object of the Amr class), the programmer can simply create an object of RTS and pass the program object into the ``Iterate`` method.
-The runtime system will iteratively execute coarse time steps until the program completes.
-By default RTS links to MPI and Pthreads libraries.
-The programmer can also switch to other backends such as UPCXX (1-sided communication model compared to the common 2-sided model in MPI) without changing the application source code.
-
-.. highlight:: c++
-
-::
-
-    using namespace perilla;
-    int main (int argc, char* argv[])
-    {
-        amrex::Initialize(argc,argv);
-        ... //set up program input, e.g. start_time, stop_time, max_step
-        Amr amr;
-        amr.init(start_time,stop_time);
-        RTS rts;
-        rts.Iterate(&amr, max_step, stop_time);
-        amrex::Finalize();
-        return 0;
-    }
-
-In a few functions of the Amr class, the runtime exposes multiple threads per process.
-As a result, the programmer needs to place sufficient memory protection for shared data within the process, e.g. when updating the state data.  This multithreaded interface adds some programming cost, but is necessary for mitigating the task scheduling overhead.
-To avoid these programming details, the programmer can use built-in iterators, such as fillpatch iterator and task graph iterator that we next discuss.
-The API of these iterators is very simple, and the asynchronous code is very similar to the original code using the synchronous multifab iterator (MFIter) described earlier in chapter Basics.
-
-
-.. toctree::
-   :maxdepth: 1
-
-   AsyncIter
-
-
diff --git a/Docs/sphinx_documentation/source/Basics.rst b/Docs/sphinx_documentation/source/Basics.rst
index 15ace124b35..c1c8a30a9fc 100644
--- a/Docs/sphinx_documentation/source/Basics.rst
+++ b/Docs/sphinx_documentation/source/Basics.rst
@@ -38,7 +38,7 @@ It is used throughout AMReX, however its functions are not defined for
 device code. :cpp:`GpuArray` is AMReX's built-in alternative. It is a
 trivial type that works on both host and device. It also works when
 compiled just for CPU. Besides :cpp:`GpuArray`, AMReX also
-provides GPU safe :cpp:`Array1D`, :cpp:`Array2D` and :cpp:`Array3d` that are
+provides GPU safe :cpp:`Array1D`, :cpp:`Array2D` and :cpp:`Array3D` that are
 1, 2 and 3-dimensional fixed size arrays, respectively. These three class
 templates can have non-zero based indexing.
 
@@ -279,6 +279,34 @@ run with
 
 to change the value of :cpp:`ncells` and :cpp:`hydro.cfl`.
 
+Sometimes an application code may want to set a default that differs from the
+default in AMReX. In this case, it is often convenient to define a function that
+sets the variable(s), and pass the name of that function to :cpp:`amrex::Initialize`.
+As an example, we may define :cpp:`add_par` to set :cpp:`extend_domain_face`
+to false if it hasn't already been set in the inputs file.
+
+.. highlight:: c++
+
+::
+
+    void add_par () {
+        ParmParse pp("eb2");
+        if (not pp.contains("extend_domain_face")) {
+            pp.add("extend_domain_face", false);
+        }
+    }
+
+Then we would pass :cpp:`add_par` into :cpp:`amrex::Initialize`:
+
+.. highlight:: c++
+
+::
+
+    amrex::Initialize(argc, argv, true, MPI_COMM_WORLD, add_par);
+
+This value replaces the current default value of true in AMReX itself, but
+can still be overwritten by setting a value in the inputs file.
+
 
 .. _sec:basics:initialize:
 
@@ -347,7 +375,7 @@ arguments.
 
     main2d*.exe inputs amrex.v=1 amrex.fpe_trap_invalid=1 -- -tao_monitor
 
 then AMReX will parse the inputs file and the optional AMReX's command
-line arguments, but will ignore everything after "--".
+line arguments, but will ignore everything after the double dashes.
 
 .. _sec:basics:amrgrids:
 
@@ -624,8 +652,8 @@ the index type. Some examples are shown below.
Print() << facebx.coarsen(2); // ((16,16,16) (32,31,31) (1,0,0)) Box uncoarsenable ({16,16,16}, {30,30,30}); - print() << uncoarsenable.coarsen(2); // ({8,8,8}, {15,15,15}); - print() << uncoarsenable.refine(2); // ({16,16,16}, {31,31,31}); + Print() << uncoarsenable.coarsen(2); // ((8,8,8), (15,15,15)); + Print() << uncoarsenable.refine(2); // ((16,16,16), (31,31,31)); // Different from the original! Note that the behavior of refinement and coarsening depends on the @@ -1259,7 +1287,7 @@ will be :cpp:`Box{(6,6,6) (16,16,16)}` in this example. For cells in :cpp:`FArrayBox`, we call those in the original :cpp:`Box` **valid cells** and the grown part **ghost cells**. Note that :cpp:`FArrayBox` itself does not have the concept of ghost cells. Ghost cells are a key concept of -:cpp:`MultiFab`,however, that allows for local operations on ghost cell data +:cpp:`MultiFab`, however, that allows for local operations on ghost cell data originating from remote processes. We will discuss how to fill ghost cells with data from valid cells later in this section. :cpp:`MultiFab` also has a default constructor. One can define an empty :cpp:`MultiFab` first and then @@ -1335,7 +1363,7 @@ face averaged variables. MultiFab zflux(amrex::convert(ba, IntVect{0,0,1}), dm, ncomp, 0); Here all :cpp:`MultiFab`\ s use the same :cpp:`DistributionMapping`, but their -:cpp:`BoxArrays` have different index types. The state is cell-based, whereas +:cpp:`BoxArray`\ s have different index types. The state is cell-based, whereas the fluxes are on the faces. Suppose the cell based :cpp:`BoxArray` contains a :cpp:`Box{(8,8,16), (15,15,31)}`. The state on that :cpp:`Box` is conceptually a Fortran Array with the dimension of :fortran:`(8:15,8:15,16:31,0:2)`. The @@ -2470,13 +2498,51 @@ Debugging Debugging is an art. Everyone has their own favorite method. Here we offer a few tips we have found to be useful. -Compiling in debug mode (e.g., ``make DEBUG=TRUE``) and running with -``ParmParse`` parameter ``amrex.fpe_trap_invalid=1`` can be helpful. -In debug mode, many compiler debugging flags are turned on and all -``MultiFab`` data are initialized to signaling NaNs. The -``amrex.fpe_trap_invalid`` parameter will result in backtrace files -when floating point exception occurs. One can then examine those -files to track down the origin of the issue. +To help with debugging, AMReX handles various signals of the C standard +library that are raised during runs. This gives us a chance to print out more +information using the Linux/Unix backtrace capability. The signals +include segmentation faults, interruption by the user (control-C), assertion +errors, and floating point exceptions (NaNs, division by zero and +overflow). The handling of segmentation faults, assertion errors and +interruption by control-C is enabled by default. Note that +``AMREX_ASSERT()`` is only on when compiled with ``DEBUG=TRUE`` or +``USE_ASSERTION=TRUE`` in GNU make, or with ``-DCMAKE_BUILD_TYPE=Debug`` or +``-DAMReX_ASSERTIONS=YES`` in CMake. The trapping of floating point exceptions is not +enabled by default unless the code is compiled with ``DEBUG=TRUE`` in GNU make, or with +``-DCMAKE_BUILD_TYPE=Debug`` or ``-DAMReX_FPE=YES`` in CMake to turn on compiler flags +if supported. Alternatively, one can always use runtime parameters to control the +handling of floating point exceptions: ``amrex.fpe_trap_invalid`` for +NaNs, ``amrex.fpe_trap_zero`` for division by zero and +``amrex.fpe_trap_overflow`` for overflow.
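+For example, assuming a hypothetical executable named ``main3d.gnu.ex`` with an
+inputs file ``inputs`` (any AMReX application works the same way, since these
+are ordinary ``ParmParse`` parameters), all three traps can be turned on at
+run time without recompiling:
+
+.. highlight:: console
+
+::
+
+   ./main3d.gnu.ex inputs amrex.fpe_trap_invalid=1 amrex.fpe_trap_zero=1 amrex.fpe_trap_overflow=1
+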
+To more effectively trap the use of uninitialized values, AMReX also initializes ``FArrayBox``\ s in +``MultiFab``\ s and arrays allocated by ``bl_allocate`` to signaling NaNs when it is compiled +with ``TEST=TRUE`` or ``DEBUG=TRUE`` in GNU make, or with ``-DCMAKE_BUILD_TYPE=Debug`` in CMake. +One can also control the setting for ``FArrayBox`` using the runtime parameter ``fab.init_snan``. + +One can get more information than the backtrace of the call stack by +instrumenting the code. Here is an example. +Suppose you know the line ``Real rho = state(cell,0);`` is causing a segfault. You +could add a print statement before that. But it might print out +thousands (or even millions) of lines before it hits the segfault. What +you could do is the following: + +.. highlight:: c++ + +:: + + #include <sstream> + + std::ostringstream ss; + ss << "state.box() = " << state.box() << " cell = " << cell; + BL_BACKTRACE_PUSH(ss.str()); // PUSH takes std::string + + Real rho = state(cell,0); // state is a Fab, and cell is an IntVect. + + BL_BACKTRACE_POP(); // One can omit this line. In that case, + // there is an implicit POP when "PUSH" is + // out of scope. + +When it hits the segfault, you will only see the last print out. Writing a ``MultiFab`` to disk with @@ -2601,7 +2667,7 @@ domain, the physical coordinates of the box, and the periodicity: IntVect dom_hi(AMREX_D_DECL(n_cell-1, n_cell-1, n_cell-1)); Box domain(dom_lo, dom_hi); - // Initialize the boxarray "ba" from the single box "bx" + // Initialize the boxarray "ba" from the single box "domain" ba.define(domain); // Break up boxarray "ba" into chunks no larger than "max_grid_size" along a direction ba.maxSize(max_grid_size); @@ -2650,7 +2716,7 @@ We demonstrate how to build an array of face-based ``MultiFabs`` : flux[dir].define(edge_ba, dm, 1, 0); } -To access and/or modify data n a ``MultiFab`` we use the ``MFIter``, where each +To access and/or modify data in a ``MultiFab`` we use the ``MFIter``, where each processor loops over grids it owns to access and/or modify data on that grid: :: diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst index 32b9da1a48c..8731d2fbef8 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst @@ -28,34 +28,40 @@ list of important variables.
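+For reference, a typical invocation might combine several of the variables
+listed in the table below; a hypothetical example (the ``-j8`` parallelism is
+illustrative) would be:
+
+.. highlight:: console
+
+::
+
+   make -j8 COMP=gnu DIM=3 USE_MPI=TRUE DEBUG=FALSE
+
..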
table:: Important make variables - +------------+-------------------------------------+--------------------+ - | Variable | Value | Default | - +============+=====================================+====================+ - | AMREX_HOME | Path to amrex | environment | - +------------+-------------------------------------+--------------------+ - | COMP | gnu, cray, ibm, intel, llvm, or pgi | none | - +------------+-------------------------------------+--------------------+ - | CXXSTD | C++ standard (``c++11``, ``c++14``) | compiler default, | - | | | at least ``c++11`` | - +------------+-------------------------------------+--------------------+ - | DEBUG | TRUE or FALSE | FALSE | - +------------+-------------------------------------+--------------------+ - | DIM | 1 or 2 or 3 | 3 | - +------------+-------------------------------------+--------------------+ - | PRECISION | DOUBLE or FLOAT | DOUBLE | - +------------+-------------------------------------+--------------------+ - | USE_MPI | TRUE or FALSE | FALSE | - +------------+-------------------------------------+--------------------+ - | USE_OMP | TRUE or FALSE | FALSE | - +------------+-------------------------------------+--------------------+ - | USE_CUDA | TRUE or FALSE | FALSE | - +------------+-------------------------------------+--------------------+ - | USE_HIP | TRUE or FALSE | FALSE | - +------------+-------------------------------------+--------------------+ - | USE_DPC++ | TRUE or FALSE | FALSE | - +------------+-------------------------------------+--------------------+ - | USE_RPATH | TRUE or FALSE | FALSE | - +------------+-------------------------------------+--------------------+ + +-----------------+-------------------------------------+--------------------+ + | Variable | Value | Default | + +=================+=====================================+====================+ + | AMREX_HOME | Path to amrex | environment | + +-----------------+-------------------------------------+--------------------+ + | COMP | gnu, cray, ibm, intel, llvm, or pgi | none | + +-----------------+-------------------------------------+--------------------+ + | CXXSTD | C++ standard (``c++11``, ``c++14``, | compiler default, | + | | ``c++17``, ``c++20``) | at least ``c++11`` | + +-----------------+-------------------------------------+--------------------+ + | DEBUG | TRUE or FALSE | FALSE | + +-----------------+-------------------------------------+--------------------+ + | DIM | 1 or 2 or 3 | 3 | + +-----------------+-------------------------------------+--------------------+ + | PRECISION | DOUBLE or FLOAT | DOUBLE | + +-----------------+-------------------------------------+--------------------+ + | USE_MPI | TRUE or FALSE | FALSE | + +-----------------+-------------------------------------+--------------------+ + | USE_OMP | TRUE or FALSE | FALSE | + +-----------------+-------------------------------------+--------------------+ + | USE_CUDA | TRUE or FALSE | FALSE | + +-----------------+-------------------------------------+--------------------+ + | USE_HIP | TRUE or FALSE | FALSE | + +-----------------+-------------------------------------+--------------------+ + | USE_DPCPP | TRUE or FALSE | FALSE | + +-----------------+-------------------------------------+--------------------+ + | USE_RPATH | TRUE or FALSE | FALSE | + +-----------------+-------------------------------------+--------------------+ + | WARN_ALL | TRUE or FALSE | TRUE for DEBUG | + | | | FALSE otherwise | + 
+-----------------+-------------------------------------+--------------------+ + | AMREX_CUDA_ARCH | CUDA arch such as 70 | 70 if not set | + | or CUDA_ARCH | | or detected | +-----------------+-------------------------------------+--------------------+ .. raw:: latex \end{center} @@ -92,12 +98,15 @@ One could set the ``DIM`` variable to either 1, 2, or 3, depending on the dimensionality of the problem. The default dimensionality is 3. AMReX uses double precision by default. One can change to single precision by setting ``PRECISION=FLOAT``. +(Particles have an equivalent flag ``USE_SINGLE_PRECISION_PARTICLES=TRUE/FALSE``.) Variables ``DEBUG``, ``USE_MPI`` and ``USE_OMP`` are optional with default set to FALSE. The meaning of these variables should be obvious. When ``DEBUG=TRUE``, aggressive compiler optimization flags are turned off and assertions in source code are turned on. For production runs, ``DEBUG`` should be set to FALSE. +An advanced variable, ``MPI_THREAD_MULTIPLE``, can be set to TRUE to initialize +MPI with support for concurrent MPI calls from multiple threads. Variables ``USE_CUDA``, ``USE_HIP`` and ``USE_DPCPP`` are used for targeting Nvidia, AMD and Intel GPUs, respectively. At most one of @@ -109,6 +118,16 @@ If enabled, the library path at link time will be saved as a When disabled, dynamic library paths could be provided via ``export LD_LIBRARY_PATH`` hints at runtime. +For GCC and Clang, the variable ``WARN_ALL`` controls the compiler's warning options. There is +also a make variable ``WARN_ERROR`` (with default of ``FALSE``) to turn warnings into errors. + +When ``USE_CUDA`` is ``TRUE``, the make system will try to detect what CUDA +arch should be used by running +``$(CUDA_HOME)/extras/demo_suite/deviceQuery`` if it does not recognize your machine. +If it fails to detect the CUDA arch, the default value of 70 will be used. +The user could override it by ``make USE_CUDA=TRUE CUDA_ARCH=80`` or ``make +USE_CUDA=TRUE AMREX_CUDA_ARCH=80``. + After defining these make variables, a number of files, ``Make.defs``, ``Make.package`` and ``Make.rules``, are included in the GNUmakefile. AMReX-based applications do not need to include all directories in AMReX; an application @@ -373,15 +392,14 @@ For example, one can enable OpenMP support as follows: :: - cmake -DENABLE_OMP=YES -DCMAKE_INSTALL_PREFIX=/path/to/installdir /path/to/amrex + cmake -DAMReX_OMP=YES -DCMAKE_INSTALL_PREFIX=/path/to/installdir /path/to/amrex -In the example above ``<option>=ENABLE_OMP`` and ``<value>=YES``. +In the example above ``<option>=AMReX_OMP`` and ``<value>=YES``. Configuration variables requiring a boolean value are evaluated to true if they are assigned a value of ``1``, ``ON``, ``YES``, ``TRUE``, ``Y``. Conversely they are evaluated to false if they are assigned a value of ``0``, ``OFF``, ``NO``, ``FALSE``, ``N``. Boolean configuration variables are case-insensitive. -The list of available options is reported in the table on :ref:`tab:cmakevar` -below. +The list of available options is reported in the :ref:`table <tab:cmakevar>` below. .. raw:: latex @@ -390,95 +408,95 @@ below. -..
table:: AMReX build options - - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | Variable Name | Description | Default | Possible values | - +==============================+=================================================+=============+=================+ - | CMAKE_Fortran_COMPILER | User-defined Fortran compiler | | user-defined | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | CMAKE_CXX_COMPILER | User-defined C++ compiler | | user-defined | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | CMAKE_Fortran_FLAGS | User-defined Fortran flags | | user-defined | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | CMAKE_CXX_FLAGS | User-defined C++ flags | | user-defined | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | CMAKE_CXX_STANDARD | C++ standard | compiler/11 | 11, 14, 17, 20 | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | DIM | Dimension of AMReX build | 3 | 1, 2, 3 | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | USE_XSDK_DEFAULTS | Use XSDK defaults settings | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_FORTRAN | Enable Fortran language | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_DP | Build with double-precision reals | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_PIC | Build Position Independent Code | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_MPI | Build with MPI support | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_OMP | Build with OpenMP support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_CUDA | Build with CUDA support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | CUDA_ARCH | CUDA target architecture | Auto | User-defined | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | CUDA_MAX_THREADS | Max number of CUDA threads per block | 256 | User-defined | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | CUDA_MAXREGCOUNT | Limits the number of CUDA registers available | 255 | User-defined | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_CUDA_FASTMATH | Enable CUDA fastmath library | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_FORTRAN_INTERFACES | Build Fortran API | NO | YES, NO | - 
+------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_LINEAR_SOLVERS | Build AMReX linear solvers | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_AMRDATA | Build data services | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_EB | Build Embedded Boundary support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_PARTICLES | Build particle classes | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_DP_PARTICLES | Use double-precision reals in particle classes | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_BASE_PROFILE | Build with basic profiling support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_TINY_PROFILE | Build with tiny profiling support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_TRACE_PROFILE | Build with trace-profiling support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_COMM_PROFILE | Build with comm-profiling support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_MEM_PROFILE | Build with memory-profiling support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_PROFPARSER | Build with profile parser support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_BACKTRACE | Build with backtrace support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_FPE | Build with Floating Point Exceptions checks | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_ASSERTIONS | Build with assertions turned on | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_SUNDIALS | Enable SUNDIALS 4 interfaces | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_SENSEI_IN_SITU | Enable SENSEI_IN_SITU infrastucture | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_CONDUIT | Enable Conduit support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_ASCENT | Enable Ascent support | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_HYPRE | Enable 
HYPRE interfaces | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_PLOTFILE_TOOLS | Build and install plotfile postprocessing tools| NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | ENABLE_TUTORIALS | Build tutorials | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ +.. table:: AMReX build options (refer to section :ref:`sec:gpu:build` for GPU-related options). + + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | Variable Name | Description | Default | Possible values | + +==============================+=================================================+=========================+=======================+ + | CMAKE_Fortran_COMPILER | User-defined Fortran compiler | | user-defined | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | CMAKE_CXX_COMPILER | User-defined C++ compiler | | user-defined | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | CMAKE_Fortran_FLAGS | User-defined Fortran flags | | user-defined | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | CMAKE_CXX_FLAGS | User-defined C++ flags | | user-defined | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | CMAKE_CXX_STANDARD | C++ standard | compiler/11 | 11, 14, 17, 20 | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_SPACEDIM | Dimension of AMReX build | 3 | 1, 2, 3 | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | USE_XSDK_DEFAULTS | Use XSDK defaults settings | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_FORTRAN | Enable Fortran language | YES | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_PRECISION | Set the precision of reals | DOUBLE | DOUBLE, SINGLE | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_PIC | Build Position Independent Code | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_MPI | Build with MPI support | YES | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_OMP | Build with OpenMP support | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_GPU_BACKEND | Build with on-node, accelerated GPU backend | NONE | NONE, SYCL, HIP, CUDA | + 
+------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_FORTRAN_INTERFACES | Build Fortran API | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_LINEAR_SOLVERS | Build AMReX linear solvers | YES | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_AMRDATA | Build data services | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_EB | Build Embedded Boundary support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_PARTICLES | Build particle classes | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_PARTICLES_PRECISION | Set precision of reals in particle classes | Same as AMReX_PRECISION | DOUBLE, SINGLE | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_BASE_PROFILE | Build with basic profiling support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_TINY_PROFILE | Build with tiny profiling support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_TRACE_PROFILE | Build with trace-profiling support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_COMM_PROFILE | Build with comm-profiling support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_MEM_PROFILE | Build with memory-profiling support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_MPI_THREAD_MULTIPLE | Concurrent MPI calls from multiple threads | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_PROFPARSER | Build with profile parser support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_FPE | Build with Floating Point Exceptions checks | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_ASSERTIONS | Build with assertions turned on | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_SUNDIALS | Enable SUNDIALS 4 interfaces | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_SENSEI | Enable SENSEI_IN_SITU infrastructure |
NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_CONDUIT | Enable Conduit support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_ASCENT | Enable Ascent support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_HYPRE | Enable HYPRE interfaces | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_PETSC | Enable PETSc interfaces | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_HDF5 | Enable HDF5-based I/O | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_PLOTFILE_TOOLS | Build and install plotfile postprocessing tools | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_BUILD_TUTORIALS | Build tutorials | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_ENABLE_TESTS | Enable CTest suite | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_DIFFERENT_COMPILER | Allow an app to use a different compiler | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ .. raw:: latex \end{center} -The option ``CMAKE_BUILD_TYPE=Debug`` implies ``ENABLE_ASSERTION=YES``. In order to turn off -assertions in debug mode, ``ENABLE_ASSERTION=NO`` must be set explicitly while +The option ``CMAKE_BUILD_TYPE=Debug`` implies ``AMReX_ASSERTIONS=YES``. In order to turn off +assertions in debug mode, ``AMReX_ASSERTIONS=NO`` must be set explicitly while invoking CMake. @@ -496,6 +514,36 @@ are defined, AMReX default flags are used. For a detailed explanation of GPU support in AMReX CMake, refer to section :ref:`sec:gpu:build`. +Building Tutorials +------------------ + +In order to build the tutorials provided in ``Tutorials/`` alongside the AMReX library, +follow these steps: + +.. highlight:: console + +:: + + mkdir /path/to/builddir + cd /path/to/builddir + cmake [options] -DAMReX_BUILD_TUTORIALS=YES /path/to/amrex + make + + +Note that only the tutorials compatible with ``[options]`` will be built. +To run one of the tutorials, do: + +.. highlight:: console + +:: + + cd /path/to/builddir/Tutorials/group/name + ./Tutorial_group_name [input_file] + + +``[input_file]`` is any of the input files required by the tutorials and located in +``/path/to/builddir/Tutorials/group/name/``. + CMake and macOS --------------- @@ -562,7 +610,7 @@ In the above snippet, ``<target>`` is any of the targets listed in th The options used to configure the AMReX build may result in certain parts, or ``components``, of the AMReX source code to be excluded from compilation.
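+As a hypothetical sketch (the ``my_app`` target name is made up), a downstream
+``CMakeLists.txt`` can request specific components when locating an installed
+AMReX, using the component names from the table below:
+
+.. highlight:: cmake
+
+::
+
+    # Require a 3D, double-precision AMReX built with MPI support
+    find_package(AMReX REQUIRED COMPONENTS 3D DOUBLE MPI)
+    target_link_libraries(my_app PRIVATE AMReX::amrex)
+
+Which components are available depends on how the AMReX library was configured.
+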
For example, setting ``-DAMReX_LINEAR_SOLVERS=no`` at configure time prevents the compilation of AMReX linear solvers code. Your CMake project can check which component is included in the AMReX library via `find_package`: @@ -590,57 +638,57 @@ A list of AMReX component names and related configure options are shown in the t +------------------------------+-----------------+ | Option | Component | +==============================+=================+ - | DIM | 1D, 2D, 3D | + | AMReX_SPACEDIM | 1D, 2D, 3D | +------------------------------+-----------------+ - | ENABLE_DP | DP | + | AMReX_PRECISION | DOUBLE, SINGLE | +------------------------------+-----------------+ - | ENABLE_PIC | PIC | + | AMReX_FORTRAN | FORTRAN | +------------------------------+-----------------+ - | ENABLE_MPI | MPI | + | AMReX_PIC | PIC | +------------------------------+-----------------+ - | ENABLE_OMP | OMP | + | AMReX_MPI | MPI | +------------------------------+-----------------+ - | ENABLE_CUDA | CUDA | + | AMReX_OMP | OMP | +------------------------------+-----------------+ - | ENABLE_FORTRAN_INTERFACES | FINTERFACES | + | AMReX_CUDA | CUDA | +------------------------------+-----------------+ - | ENABLE_LINEAR_SOLVERS | LSOLVERS | + | AMReX_FORTRAN_INTERFACES | FINTERFACES | +------------------------------+-----------------+ - | ENABLE_AMRDATA | AMRDATA | + | AMReX_LINEAR_SOLVERS | LSOLVERS | +------------------------------+-----------------+ - | ENABLE_EB | EB | + | AMReX_AMRDATA | AMRDATA | +------------------------------+-----------------+ - | ENABLE_PARTICLES | PARTICLES | + | AMReX_EB | EB | +------------------------------+-----------------+ - | ENABLE_DP_PARTICLES | DPARTICLES | + | AMReX_PARTICLES | PARTICLES | +------------------------------+-----------------+ - | ENABLE_BASE_PROFILE | BASEP | + | AMReX_PARTICLES_PRECISION | PDOUBLE, PSINGLE| +------------------------------+-----------------+ - | ENABLE_TINY_PROFILE | TINYP | + | AMReX_BASE_PROFILE | BASEP | +------------------------------+-----------------+ - | ENABLE_TRACE_PROFILE | TRACEP | + | AMReX_TINY_PROFILE | TINYP | +------------------------------+-----------------+ - | ENABLE_COMM_PROFILE | COMMP | + | AMReX_TRACE_PROFILE | TRACEP | +------------------------------+-----------------+ - | ENABLE_MEM_PROFILE | MEMP | + | AMReX_COMM_PROFILE | COMMP | +------------------------------+-----------------+ - | ENABLE_PROFPARSER | PROFPARSER | + | AMReX_MEM_PROFILE | MEMP | +------------------------------+-----------------+ - | ENABLE_BACKTRACE | BACKTRACE | + | AMReX_PROFPARSER | PROFPARSER | +------------------------------+-----------------+ - | ENABLE_FPE | FPE | + | AMReX_FPE | FPE | +------------------------------+-----------------+ - | ENABLE_ASSERTIONS | ASSERTIONS | + | AMReX_ASSERTIONS | ASSERTIONS | +------------------------------+-----------------+ - | ENABLE_SUNDIALS | SUNDIALS | + | AMReX_SUNDIALS | SUNDIALS | +------------------------------+-----------------+ - | ENABLE_SENSEI_IN_SITU | SENSEI | + | AMReX_SENSEI | SENSEI | +------------------------------+-----------------+ - | ENABLE_CONDUIT | CONDUIT | + | AMReX_CONDUIT | CONDUIT | +------------------------------+-----------------+ - | ENABLE_ASCENT | ASCENT | + | AMReX_ASCENT | ASCENT | +------------------------------+-----------------+ - | ENABLE_HYPRE | HYPRE | + | AMReX_HYPRE | HYPRE | +------------------------------+-----------------+ .. 
raw:: latex @@ -681,17 +729,17 @@ More details on ``find_package`` can be found AMReX on Windows ================ -The AMReX team does development on Linux machines, from desktop workstations to supercomputers. Many people also use AMReX on Macs without issues. +The AMReX team does development on Linux machines, from laptops to supercomputers. Many people also use AMReX on Macs without issues. -We do not officially support AMReX on Windows. However, we believe there are no fundamental issues for making it work on Windows. -AMReX mostly uses standard C++11, and there are only a few places that are UNIX/Linux specific. These are: +We do not officially support AMReX on Windows, and many of us do not have access to any Windows +machines. However, we believe there are no fundamental issues preventing it from working on Windows. -(1) File system: We use some of the POSIX standard functions for operations like making a new directory, detecting if a file exists, etc. -C++17 now has a filesystem library that should work on any platform. AMReX does not require C++17, but we are happy to provide a C++17 support for the file system part. +(1) AMReX mostly uses standard C++11, but for Windows C++17 is required. This is because we use + C++17 to support file system operations when POSIX I/O is not available. -(2) Signal handling: We use POSIX handling when floating point exceptions, segmentation faults, etc. happen. -This capability allows us to print a backtrace of what leads to the error and is very useful for debugging but not required for using AMReX. -Some of the POSIX handling is platform-dependent, and Windows does seem to have this capability. If you need it, it should not be hard for you to make it work on Windows. +(2) We use POSIX signal handling when floating point exceptions, segmentation faults, etc. happen. +This capability is not supported on Windows. -(3) Memory profiling: This is an optional feature in AMReX that is not enabled by default. -It reads memory system information from the OS to give us a summary of our memory usage. +(3) Memory profiling is an optional feature in AMReX that is not enabled by default. It reads +memory system information from the OS to give us a summary of our memory usage. This is not +supported on Windows. diff --git a/Docs/sphinx_documentation/source/CVODE_top.rst b/Docs/sphinx_documentation/source/CVODE_top.rst deleted file mode 100644 index 2a30518c32b..00000000000 --- a/Docs/sphinx_documentation/source/CVODE_top.rst +++ /dev/null @@ -1,40 +0,0 @@ -.. _Chap:CVODE: -CVODE -===== -AMReX supports local ODE integration using the CVODE solver, [1]_ which is part -of the SUNDIALS framework. [2]_ CVODE contains solvers for stiff and non-stiff -ODEs, and as such is well suited for solving e.g., the complex chemistry -networks in combustion simulations, or the nuclear reaction networks in -astrophysical simulations. -Most of CVODE is written in C, but many functions also come with two distinct -Fortran interfaces. One interface is FCVODE, which is bundled with the stable -release of CVODE. Its usage is described in the CVODE documentation. [3]_ -However, the use of FCVODE is discouraged in AMReX due to its incompatibility -with being used inside OpenMP parallel regions (which is the primary use case -in AMReX applications). -The alternative, and recommended, Fortran interface to uses the -``iso_c_binding`` feature of the Fortran 2003 standard to implement a direct -interface to the C functions in CVODE. When compiling CVODE, one need not
When compiling CVODE, one need not -build the CVODE library with the FCVODE interface enabled at all. Rather, the -Fortran 2003 interface to CVODE is provided within AMReX itself. The -CVODE tutorials provided in AMReX use this new interface. - -.. toctree:: - :maxdepth: 1 - - CVODE - SUNDIALS3 - - -.. [1] - https://computation.llnl.gov/projects/sundials/cvode - -.. [2] - https://computation.llnl.gov/projects/sundials - -.. [3] - https://computation.llnl.gov/sites/default/files/public/cv_guide.pdf diff --git a/Docs/sphinx_documentation/source/EB.rst b/Docs/sphinx_documentation/source/EB.rst index 2407fbecffd..e6a759dae5e 100644 --- a/Docs/sphinx_documentation/source/EB.rst +++ b/Docs/sphinx_documentation/source/EB.rst @@ -549,531 +549,6 @@ testing cell types and getting neighbor information. For example end do - -.. _sec:EB:LevelSet: - -Level Sets -========== - -In order to speed up direct interactions with embedded boundaries, AMReX also -provides a way to construct level-sets representing the signed distance function -from the closest EB surface. In our implementation, the level-set data is stored -as a 1-component nodal :cpp:`MultiFab` (cf. :ref:`sec:basics:multifab`) where -each node stores its closest distance to the EB. The subroutine -:fortran:`amrex_eb_interp_levelset` (in ``/Scr/EB/AMREX_EB_levelset_F.F90``) -interpolates the level-set :math:`\phi(\mathbf{r})` to any position -:math:`\mathbf{r}` from the pre-computed level-set :cpp:`MultiFab`. Likewise the -subroutine :fortran:`amrex_eb_normal_levelset` interpolated the normal -:math:`\mathbf{\hat{n}}(\mathbf{r})` at any position from the derivative of the -level-set function :math:`\mathbf{\hat{n}}(\mathbf{r}) = \nabla -\phi(\mathrm{r})`. **Note** that since the normal is computed by taking the -derivative of the interpolation function, it is discontinuous at positions -corresponding to the nodal points of the level-set :cpp:`MultiFab` (i.e. -:math:`\mathbf{r} = (i, j, k) \cdot h`). - -At this point, AMReX does not provide a C++ interface for interpolating the -level-set at a point. This is because so far the level-set was only needed while performing calculations in Fortran. The interpolation subroutines contained in :fortran:`amrex_eb_levelset_module` are: - -.. highlight:: fortran - -:: - - pure subroutine amrex_eb_interp_levelset(pos, plo, n_refine, & - phi, phlo, phhi, & - dx, phi_interp ) - -and - -.. highlight:: fortran - -:: - - pure subroutine amrex_eb_normal_levelset(pos, plo, n_refine, & - phi, phlo, phhi, & - dx, normal ) - -which interpolate the level-set value :fortran:`phi_interp` and -:fortran:`normal`, respectively, at the 3-dimensional point :fortran:`pos`. The -nodal values of the level-set are given by the :fortran:`phi` array. -:fortran:`dx/n_refine` is the refined cell-size of the level-set array. For -example - -.. highlight:: fortran - -:: - - use iso_c_binding , only : c_int - use amrex_fort_module, only : c_real => amrex_real - use amrex_eb_levelset_module, only: amrex_eb_interp_levelset - - ! ** level-set data - ! philo, phihi - dimensions of phi array - ! dx - spatial discretization - ! n_refine - refinement of phi array (wrt to dx) - integer(c_int) :: philo(3), phihi(3) - real(c_real) :: phi( phlo(1):phhi(1), phlo(2):phhi(2), phlo(3):phhi(3) ) - real(c_real) :: dx(3) - integer(c_int) :: n_refine - - ! ** interpolated level-set - ! pos - coordinate where to interpolate - ! 
ls_value - interpolated level-set value (output) - real(c_real) :: pos(3), ls_value - - call amrex_eb_interp_levelset(pos, plo, n_refine, phi, phlo, phhi, dx, ls_value); - - -AMReX provides collection of functions and subroutines to fill single and -multi-level level-set data. For convenience, the :cpp:`amrex::LSFactory` helps -manage the level-set data for a single AMR level. And :cpp:`amrex::LSCore` -manages multi-level level-set data. These are described in further detail below. - - -A Note on Filling Level-Sets from :cpp:`EBFArrayBoxFactory` ------------------------------------------------------------ - -The data stored in a :cpp:`EBFArrayBoxFactory`, represents the embedded boundary -as a discrete collection of volume fractions, and area fractions over a grid. -Here this is further simplified by thinking of the EB as a collection of planar -facets. This means that for any given node in a grid, the nearest EB facet might -be in another grid. Hence if the :cpp:`EBFArrayBoxFactory` has :cpp:`n_pad` -ghost cells, then for any given grid, there could be EB facets that are -:cpp:`n_pad + 1` cells away, yet we would *not* "see". In other words, if the -:cpp:`EBFArrayBoxFactory` is defined on a grid with spacing :math:`h`, then, and -we do not have any EB facets in the current grid, then any node within that grid -is *at least* :math:`(n_\mathrm{pad}+1)h` away from the nearest EB surface. - -Hence, when filling a level-set, it will "max-out" at -:math:`\pm(n_\mathrm{pad}+1)h`. Hence it is recommended to think of this kind of -level-set function as the point being "at least" :math:`\phi(\mathbf{r})` from -the EB surface. - -.. _fig::local_levelset: - -.. figure:: ./EB/loc_ls_ex.png - :width: 50.0% - - : Example of a "local" level-set representing a cylinder. The level-set - function is a (linear) signed distance function near the EB-surface, and it - plateaus further away from it. - -Figure :numref:`fig::local_levelset` shows an example of such a local level-set -description for a cylinder. Only cells that are within -:math:`\pm(n_\mathrm{pad}+1)h` of the EB surface are filled with a level-set. -The rest is filled with lower (upper) bound. If the goal is capture interactions -between the EB surface and a point somewhere else, this approach usually -suffices as we only need to know if we are "far enough" from the EB in most -applications. - -Since finding the closest distance between a point and an arbitrary surface is -computationally expensive, we advice that :cpp:`n_pad` is chosen as the smallest -necessary number for the application. - - -.. _ss:ls:nolsf: - -Filling Level-Sets without :cpp:`LSFactory` -------------------------------------------- - -The static function :cpp:`amrex::LSFactory::fill_data` (defined in -``Src/EB/AMReX_EB_levelset.cpp``) fills a :cpp:`MultiFab` with the nodal level-set -values and another :cpp:`iMultiFab` with integer tags that are 1 whenever a node -is near the EB surface. It is then left up to the application to manage the -level-set :cpp:`MultiFab`. - -AMReX defines embedded surfaces using implicit functions (see above). Normally -these implicit functions are usually *not* signed distance functions (i.e. their -value at :math:`\mathbf{r}` is not the minimal distance to the EB surface). -However, in rare cases such as the :cpp:`EB2::PlaneIF`, it is. In this case, the -most straight-forward way to fill a level-set. If an signed-distance implicit -function is know, and stored as a :cpp:`MultiFab mf_impfunc`, then we can use - -.. 
highlight:: c++ - -:: - - static void fill_data (MultiFab & data, iMultiFab & valid, - const MultiFab & mf_impfunc, - int eb_pad, const Geometry & eb_geom); - -so then the function call - -.. highlight:: c++ - -:: - - // Fill implicit function - GShopLSFactory cylinder_lsgs(cylinder_ghsop, geom, ba, dm, 0); - std::unique_ptr cylinder_mf_impfunc = cylinder_lsgs.fill_impfunc(); - - - MultiFab ls_grid(ba, dm, 1, 0); - iMultiFab ls_valid(ba, dm, 1, 0); - amrex::LSFactory::fill_data(ls_grid, ls_valid, mf_impfunc, 2, geom_eb); - -fills a :cpp:`MultiFab ls_grid` with level-set data given the implicit function -stored in the :cpp:`MultiFab mf_impfunc`, and a threshold of -:cpp:`2*geom_eb.CellSize()`. The helper class :cpp:`GShopLSFactory` converts EB2 -implicit functions to :cpp:`MultiFabs` (defined in -``Src/EB/AMReX_EB_levelset.H``). - -The much more interesting application of :cpp:`amrex::LSFactory::fill_data` is -filling a level-set given a :cpp:`EBFArrayBoxFactory`: - -.. highlight:: c++ - -:: - - static void fill_data (MultiFab & data, iMultiFab & valid, - const EBFArrayBoxFactory & eb_factory, - const MultiFab & eb_impfunc, - const IntVect & ebt_size, int ls_ref, int eb_ref, - const Geometry & geom, const Geometry & geom_eb); - -which fills the :cpp:`MultiFab data` with level-set data from the -:cpp:`EBFArrayBoxFactory eb_factory`. Here the user must still supply the EB -implicit function using the :cpp:`MultiFab eb_impfunc`, as this is used to -determine the inside/outside when no EB facets can be found, or in special -edge-cases. The user also needs to specify the tile size (:cpp:`IntVect -ebt_size`), the level-set and EB refinement (i.e. the grid over which -:cpp:`data` is defined is refined by a factor of :cpp:`ls_ref/eb_ref` compared -to the :cpp:`eb_factory` 's grid), and the Geometries :cpp:`geom` and -:cpp:`geom_eb` corresponding to the grids of :cpp:`data` and :cpp:`eb_factory` -respectively. - -When filling :cpp:`data`, a tile-size of :cpp:`ebt_size` is used. Only EB facets -within a tile (plus the :cpp:`eb_factory` ghost cells) are considered. Hence, -chosing an appropriate :cpp:`ebt_size` can significantly increase performance. - -For example, the following fills a level-set with a cylinder EB (like that shown -in Fig. :numref:`fig::local_levelset`). - -.. highlight:: c++ - -:: - - // Define nGrow of level-set and EB - int ls_pad = 1; - int eb_pad = 2; - - // Define EB - EB2::CylinderIF cylinder(radius, centre, true); - EB2::GeometryShop cylinder_gshop(cylinder); - - // Build EB - EB2::Build(cylinder_gshop, geom, max_level, max_level); - const EB2::IndexSpace & cylinder_ebis = EB2::IndexSpace::top(); - const EB2::Level & cylinder_lev = cylinder_ebis.getLevel(geom); - - // Build EB factory - EBFArrayBoxFactory eb_factory(cylinder_lev, geom, ba, dm, {eb_pad, eb_pad, eb_pad}); - - // Fill implicit function - GShopLSFactory cylinder_lsgs(cylinder_ghsop, geom, ba, dm, ls_pad); - std::unique_ptr cylinder_mf_impfunc = cylinder_lsgs.fill_impfunc(); - - // Fill level-set - MultiFab ls_grid(ba, dm, 1, ls_pad); - iMultiFab ls_valid(ba, dm, 1, ls_pad); - LSFactory::fill_data(ls_grid, ls_valid, eb_factory, * cylinder_mf_impfunc, - ebt_size, 1, 1, geom, geom); - -Note that in theory the :cpp:`EBFArrayBoxFactory eb_factory` could be defined on -a different resolution as the the :cpp:`BoxArray ba`. In this case, the -appropriate refinements and geometries must be specified. 
Also note that the -thresholding behaviour (due to :cpp:`eb_pad`) is specified via the -:cpp:`EBFArrayBoxFactory` constructor. The implicit function MultiFab needs to -have the same grids as `data`. - -Since this relies on the interplay of many different parameters, a number of -utility functions and helper classes have been created. These are discussed in -the subsequent sections. - -The common operations of intersections and unions (similar to EB implicit -functions, discussed in :ref:`sec:EB:ebinit:IF`) can also be applied to -level-sets. Without the use of a :cpp:`LSFactory`, the functions: - -.. highlight:: c++ - -:: - - static void intersect_data (MultiFab & data, iMultiFab & valid, - const MultiFab & data_in, const iMultiFab & valid_in, - const Geometry & geom_ls); - -and - -.. highlight:: c++ - -:: - - static void union_data (MultiFab & data, iMultiFab & valid, - const MultiFab & data_in, const iMultiFab & valid_in, - const Geometry & geom_ls); - -These apply the intersection (element-wise minimum) and union (maximum) between -the :cpp:`MultiFab data`, and :cpp:`data_in`. The result overwrites the contents -of :cpp:`data`. The tags stored in the :cpp:`iMultiFab valid_in` determine where -the intersection takes place (i.e. only cells where both :cpp:`valid_in == 1` -are intersected, others are ignored). - - -Using :cpp:`LSFactory` ----------------------- - -In the previous section, we've seen that the level-set and EB grids can exist on -different levels of refinement. The practical reason behind this is that -sometimes we want to capture interactions that are very sensitive close to EBs, -but this can sometimes be difficult to keep track of. Hence the :cpp:`LSFactory` -can be helpful in taking care of all of these parameters. - -The basic principle of the :cpp:`LSFactory` (defined in -``Src/EB/AMReX_EB_levelset.H``) is that it is created relative to some reference -:cpp:`BoxArray ba`, :cpp:`Geometry geom`, and :cpp:`DistributionMapping dm`. The -user then specifies refinement factors :cpp:`ls_ref` of the level-set data and -:cpp:`eb_ref` of the EB grid. Calling the constructor: - -.. highlight:: c++ - -:: - - LSFactory(int lev, int ls_ref, int eb_ref, int ls_pad, int eb_pad, - const BoxArray & ba, const Geometry & geom, const DistributionMapping & dm, - int eb_tile_size = 32); - -Then creates all appropriate grids and geometries. Note that we can also specify -the tile size used internally in the :cpp:`LSFactory::fill_data` function. - -When a :cpp:`LSFacotry` is first created, its level-set values are set to -:fortran:`huge(amrex_real)`. I. e. there are no surfaces, and so the level-set -value is effectively infinite. It can then be filled just like in the previous -section: - -.. 
highlight:: c++ - -:: - - // Define refinement of level-set and EB - int ls_ref = 4; - int eb_ref = 1; - - // Define nGrow of level-set and EB - int ls_pad = 1; - int eb_pad = 2; - - // Define EB - EB2::CylinderIF cylinder(radius, centre, true); - EB2::GeometryShop cylinder_gshop(cylinder); - - // Build level-set factory - LSFactory level_set(0, ls_ref, eb_ref, ls_pad, eb_pad, ba, geom, dm); - - // Build EB - const Geometry & eb_geom = level_set.get_eb_geom() - EB2::Build(cylinder_gshop, eb_geom, max_level, max_level); - - const EB2::IndexSpace & cylinder_ebis = EB2::IndexSpace::top(); - const EB2::Level & cylinder_lev = cylinder_ebis.getLevel(eb_geom); - - // Build EB factory - EBFArrayBoxFactory eb_factory(cylinder_lev, eb_geom, level_set.get_eb_ba(), dm, - {level_set.get_eb_pad(), level_set.get_eb_pad(), - level_set.get_eb_pad()}); - - // Fill level-set (factory) - GShopLSFactory cylinder_lsgs(cylinder_ghsop, level_set); - std::unique_ptr cylinder_mf_impfunc = cylinder_lsgs.fill_impfunc(); - level_set.Fill(eb_factory, * cylinder_mf_impfunc); - -where the level-set data can now be accessed using: - -.. highlight:: c++ - -:: - - const MultiFab * level_set_data = level_set.get_data(); - -or alternatively a copy of the data can be generated using: - -.. highlight:: c++ - -:: - - std::unique_ptr level_set_data = level_set.copy_data(); - -Both of the data above are on grids that have been refined by :cpp:`ls_ref` -(with respect to the :cpp:`BoxArray ba`). In order to get a copy of the -level-set data at the coarseness of the original grids, use: - -.. highlight:: c++ - -:: - - std::unique_ptr level_set_data_crse = level_set.coarsen_data(); - -Note however, that the level-set data is nodal data. Therefore, even though the -:cpp:`MultiFab level_set_data_crse` is defined on a grid with the same -resolution as the :cpp:`BoxArray ba`, it is defined on the nodal version of that -grid. - -The :cpp:`LSFactory` is also there to make operations on the level-set easier. -Intersection and Union operations with EB factories and implicit functions are -available in the :cpp:`LSFactory` class. As well as functions to regrid -(updating the underlying :cpp:`BoxArray` and :cpp:`DistributionMapping`), -copying, and inverting the level-set function. - - -Filling Multi-Level Level-Sets without :cpp:`LSCore` ----------------------------------------------------- - -AMReX also provides code to fill the level-set function on different levels of -refinement. The static function :cpp:`amrex::LSCoreBase::FillLevelSet`, -:cpp:`amrex::LSCoreBase::MakeNewLevelFromCoarse`, and -:cpp:`amrex::LSCoreBase::FillVolfracTags` (or -:cpp:`amrex::LSCoreBase::FillLevelSetTags` for level-set tagging instead of -volume-fraction tagging) fill a finer level from a coarse one. Just like the -section on :ref:`ss:ls:nolsf`, the philosophy here is to enable to user to fill -a :cpp:`MultiFab` with level-set values, and manage this data structure -themselves. Later we will discuss the :cpp:`LSCore` class, which automatically -constructs multi-level level-sets. - -One common problem with level-set function is that they are expensive to -compute. Therefore, a strategy would be to compromise by computing the level-set -function accurately near embedded boundaries (where precision is important), and -at a lower resolution for from walls. The function - -.. 
highlight:: c++ - -:: - - static void FillVolfracTags( int lev, TagBoxArray & tags, - const Vector & grids, - const Vector & dmap, - const EB2::Level & eb_lev, const Vector & geom ); - -fills a :cpp:`TagBoxArray` with tags wherever the volume fraction is between 0 -and 1. This way any cut-cells a buffered of :cpp:`amr.n_error_buf` many -neighbors is tagged for refinement. If we need finer control over the tagging, -the function - -.. highlight:: c++ - -:: - - static void FillLevelSetTags( int lev, TagBoxArray & tags, const Vector & phierr, - const MultiFab & levelset_data, const Vector & geom ); - -takes a list of threshold level-set values (:cpp:`Vector & phierr`) and -tags cells for refinement if the coarse estimate of the levelset -(:cpp:`levelset_data`) from level :cpp:`lev` is less than :cpp:`phierr[lev]`. - -The following code would then fill a multi-level hierarchy of level-sets -contained in :cpp:`Vector level_sets`. - -.. highlight:: c++ - -:: - - //___________________________________________________________________________ - // Start with level zero - - EBFArrayBoxFactory eb_factory(* eb_levels[0], geom[0], grids[0], dmap[0], - {eb_pad, eb_pad, leb_pad}, EBSupport::full); - - // NOTE: reference BoxArray is not nodal - BoxArray nd_ba = amrex::convert(grids[0], IntVect::TheNodeVector()); - - level_sets[0].define(nd_ba, dmap[0], 1, pad); - iMultiFab valid(nd_ba, dmap[0], 1, pad); - - // NOTE: implicit function data might not be on the right grids - MultiFab impfunc = MFUtil::regrid(nd_ba, dmap[0], implicit_functions[0], true); - - LSFactory::fill_data(level_sets[0], valid, ebfactory, impfunc, - 32, 1, 1, geom[0], geom[0]); - - - //___________________________________________________________________________ - // Fill finer levels, using coarser level to estimate level-set - - for (int lev = 1; lev < nlev; lev++) { - // NOTE: reference BoxArray is not nodal - BoxArray ba = amrex::convert(grids[lev], IntVect::TheNodeVector()); - level_sets[lev].reset(new MultiFab); - iMultiFab valid(ba, dmap[lev], 1, pad); - - // Fills level_sets[lev] with coarse data - LSCoreBase::MakeNewLevelFromCoarse( level_sets[lev], level_sets[lev-1], - ba, dmap[lev], geom[lev], geom[lev-1], - bcs_ls, refRatio(lev-1)); - - EBFArrayBoxFactory eb_factory(* eb_levels[lev], geom[lev], grids[lev], dmap[lev], - {eb_pad, eb_pad, eb_pad}, EBSupport::full); - - // NOTE: implicit function data might not be on the right grids - MultiFab impfunc = MFUtil::regrid(ba, dmap[lev], implicit_functions[lev]); - - IntVect ebt_size{AMREX_D_DECL(32, 32, 32)}; // Fudge factors... - LSCoreBase::FillLevelSet(level_sets[lev], level_sets[lev], eb_factory, impfunc, - ebt_size, eb_pad, geom[lev]); - } - -Here the :cpp:`Vector eb_levels` has been filled while -initializing the embedded boundaries. At the same time, the implicit functions -need to be saved to :cpp:`Vector implicit_functions`. The user also -needs to specify the level-set boundary conditions in :cpp:`Vector -bcs_ls`. Note that the function :cpp:`LSCoreBase::FillLevelSet` uses the coarse -level-set as an upper bound to the tile size used for testing EB facets. - - - -Using :cpp:`LSCore` -------------------- - -The process described in the previous section is automated in the :cpp:`LSCore` -class. It is derived from :cpp:`LSCoreBase`, which in turn is derived from -:cpp:`AmrCore` (cf. :ref:`Chap:AmrCore`). :cpp:`LSCore` is a template class -depending on the embedded boundary implicit function. 
This way, it can build new
-:cpp:`EB2::Level` objects for every new level that is needed.
-
-Since :cpp:`LSCore` is a template class, it might lead to problems in
-applications where the template parameter can depend of runtime parameters. This
-is the reason why it derives from the base class :cpp:`LSCoreBase`.
-:cpp:`LSCore` overwrites the virtual function :cpp:`MakeNewLevelFromScratch` in
-:cpp:`LSCoreBase`. The application can then employ the following polymorphism to
-construct the level-set;
-
-.. highlight:: c++
-
-::
-
-    LSCoreBase * ls_core;
-
-    // sets ls_core pointer
-    make_my_eb(ls_core);
-
-    ls_core->InitData();
-
-where the function :cpp:`make_my_eb` defines the actual EB geometry:
-
-.. highlight:: c++
-
-::
-
-    void make_my_eb(LSCoreBase *& ls_core) {
-
-        // MyIF is an EB2 Implicit Fuction
-        GeometryShop gshop;
-
-        // Build an EB geometry shop here
-
-        ls_core = new LSCore(gshop);
-    }
-
-Here the :cpp:`make_my_eb` is only defines the EB geometry. The function call
-:cpp:`ls_core->InitData()` constructs level hierarchy and fills it with
-level-set values.
-
 Linear Solvers
 ==============
diff --git a/Docs/sphinx_documentation/source/EB/loc_ls_ex.png b/Docs/sphinx_documentation/source/EB/loc_ls_ex.png
deleted file mode 100644
index 48f6aa654d3..00000000000
Binary files a/Docs/sphinx_documentation/source/EB/loc_ls_ex.png and /dev/null differ
diff --git a/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst b/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst
index 97bcb96c9c7..3d15c083036 100644
--- a/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst
+++ b/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst
@@ -6,5 +6,5 @@ External Frameworks
 .. toctree::
    :maxdepth: 1
 
-   CVODE_top
+   SUNDIALS_top
    SWFFT
diff --git a/Docs/sphinx_documentation/source/External_Profiling_Tools.rst b/Docs/sphinx_documentation/source/External_Profiling_Tools.rst
index 6a45a3bd938..71222d7588e 100644
--- a/Docs/sphinx_documentation/source/External_Profiling_Tools.rst
+++ b/Docs/sphinx_documentation/source/External_Profiling_Tools.rst
@@ -279,3 +279,202 @@ generated figures. Some examples are shown here.
 
 .. [5]
    https://www.nersc.gov/users/software/performance-and-debugging-tools/ipm/
+
+Nsight Systems
+==============
+
+The Nsight Systems tool provides a high-level overview of your code, displaying the kernel
+launches, API calls, NVTX regions and more in a timeline for a clear, visual picture of the
+overall runtime patterns. It analyzes CPU codes or CUDA-based GPU codes and is available
+on Summit and Cori in a system module.
+
+Nsight Systems provides a variety of profiling options. This documentation will cover the
+most commonly used options for AMReX users, collecting useful flags and analysis
+patterns. For the complete details of using Nsight Systems, refer to the `Nsight Systems
+official documentation <https://docs.nvidia.com/nsight-systems/>`_.
+
+Profile Analysis
+----------------
+
+The most common use case of Nsight Systems for AMReX users is the creation of a qdrep file
+that is viewed in the Nsight Systems GUI, typically on a local workstation or machine.
+
+To generate a qdrep file, run nsys with the ``-o`` option:
+
+.. highlight:: console
+
+::
+
+    nsys profile -o <report-name> ${EXE} ${INPUTS}
+
+AMReX's lambda-based launch system often makes these timelines difficult to parse, as the kernel
+names are mangled and difficult to decipher. AMReX's Tiny Profiler includes NVTX region markers,
+which can be used to mark the respective section of the Nsight Systems timeline. To include AMReX's
+built-in Tiny Profiler NVTX regions in Nsight Systems outputs, compile AMReX with ``TINY_PROFILE=TRUE``.
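+
+For example, a function timed with TinyProfiler's ``BL_PROFILE`` macro also shows up as
+an NVTX range of the same name when built with ``TINY_PROFILE=TRUE``. A minimal sketch
+(the function and region name here are illustrative):
+
+.. highlight:: c++
+
+::
+
+    #include <AMReX_BLProfiler.H>
+
+    void do_hydro ()
+    {
+        // With TINY_PROFILE=TRUE, this timer also emits an NVTX range named
+        // "do_hydro" that appears on the Nsight Systems timeline and can be
+        // targeted with, e.g., -p "do_hydro@*".
+        BL_PROFILE("do_hydro");
+
+        // ... work to be profiled ...
+    }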
+
+Nsight Systems timelines only profile a single, contiguous block of time. There are a variety of
+methods to specify the specific region you would like to analyze. The most common options that AMReX users
+may find helpful are:
+
+1. **Specify an NVTX region as the starting point of the analysis.**
+
+This is done using ``-c nvtx -p "region_name@*" -e NSYS_NVTX_PROFILER_REGISTER_ONLY=0``, where ``region_name``
+is the identification string of the NVTX region. The additional environment variable,
+``-e ...`` is needed because AMReX's NVTX region names currently do not use a registered string.
+TinyProfiler's built-in NVTX regions use the same identification string as the timer itself. For
+example, to start an analysis at the ``do_hydro`` NVTX region, run:
+
+.. highlight:: console
+
+::
+
+    nsys profile -o <report-name> -c nvtx -p "do_hydro@*" -e NSYS_NVTX_PROFILER_REGISTER_ONLY=0 ${EXE} ${INPUTS}
+
+This will profile from the first instance of the specified NVTX region until the end of the
+application. In AMReX applications, this can be helpful to skip initialization and analyze the
+remainder of the code. To only analyze the specified NVTX region, add the flag ``-x true``, which
+will end the analysis at the end of the region:
+
+.. highlight:: console
+
+::
+
+    nsys profile -o <report-name> -c nvtx -p "do_hydro@*" -x true -e NSYS_NVTX_PROFILER_REGISTER_ONLY=0 ${EXE} ${INPUTS}
+
+Again, it's important to remember that Nsight Systems only analyzes a single contiguous block of
+time. So, this will only give you a profile for the first instance of the named region. Plan your
+Nsight Systems analyses accordingly.
+
+2. **Specify a region with CUDA profiler API calls.**
+
+This requires manually altering your source code, but can provide better specificity in what you analyze.
+Directly insert ``cudaProfilerStart()`` and ``cudaProfilerStop()`` (declared in ``cuda_profiler_api.h``)
+around the region of code you want to analyze:
+
+.. highlight:: c++
+
+::
+
+    cudaProfilerStart();
+
+    // CODE TO PROFILE
+
+    cudaProfilerStop();
+
+
+Then, run with ``-c cudaProfilerApi``:
+
+.. highlight:: console
+
+::
+
+    nsys profile -o <report-name> -c cudaProfilerApi ${EXE} ${INPUTS}
+
+As with NVTX regions, Nsight Systems will only profile from the first call to ``cudaProfilerStart()``
+to the first call to ``cudaProfilerStop()``, so be sure to add these markers appropriately.
+
+
+Nsight Systems GUI Tips
+-----------------------
+
+* When analyzing an AMReX application in the Nsight Systems GUI using NVTX regions or ``TINY_PROFILE=TRUE``,
+  AMReX users may find it useful to turn on the feature "Rename CUDA Kernels by NVTX". This will change the
+  CUDA kernel names to match the inner-most NVTX region in which they were launched instead of the typical
+  mangled compiler name. This will make identifying AMReX CUDA kernels in Nsight Systems reports considerably easier.
+
+  This feature can be found in the GUI's drop-down menu, under:
+
+.. highlight:: console
+
+::
+
+    Tools -> Options -> Environment -> Rename CUDA Kernels by NVTX.
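+
+* Reports are typically generated on the compute system and inspected on a local
+  workstation. Assuming a local Nsight Systems installation, a report can be opened
+  either through ``File -> Open`` in the GUI or from the command line (the report
+  name here is illustrative):
+
+.. highlight:: console
+
+::
+
+    nsight-sys report1.qdrep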
+
+
+Nsight Compute
+==============
+
+The Nsight Compute tool provides a detailed, fine-grained analysis of your CUDA kernels,
+giving details about the kernel launch, occupancy, and limitations while suggesting possible
+improvements to maximize the use of the GPU. It analyzes CUDA-based GPU codes and is available
+on Summit and Cori in system modules.
+
+Nsight Compute provides a variety of profiling options. This documentation will focus on the
+most commonly used options for AMReX users, primarily to keep track of useful flags and analysis
+patterns. For the complete details of using Nsight Compute, refer to the `Nsight Compute
+official documentation <https://docs.nvidia.com/nsight-compute/>`_.
+
+
+Kernel Analysis
+---------------
+
+The standard way to run Nsight Compute on an AMReX application is to specify an output file
+that will be transferred to a local workstation or machine for viewing in the Nsight Compute GUI.
+Nsight Compute can be told to return a report file using the ``-o`` flag. In addition, when
+running with Nsight Compute on an AMReX application, it is important to turn off the floating
+point exception trap, as it causes a runtime error. So, an entire AMReX application can be
+analyzed with Nsight Compute by running:
+
+.. highlight:: console
+
+::
+
+    ncu -o <report-name> ${EXE} ${INPUTS} amrex.fpe_trap_invalid=0
+
+However, this implementation should almost never be used by AMReX applications, as the analysis of
+every kernel would be extremely lengthy and unnecessary. To analyze a desired subset of CUDA
+kernels, AMReX users can use the Tiny Profiler's built-in NVTX regions to narrow the scope of
+the analysis. Nsight Compute allows users to specify which NVTX regions to include and exclude
+through the ``--nvtx``, ``--nvtx-include`` and ``--nvtx-exclude`` flags. For example:
+
+.. highlight:: console
+
+::
+
+    ncu --nvtx --nvtx-include "Hydro()" --nvtx-exclude "StencilA(),StencilC()" -o kernels ${EXE} ${INPUTS} amrex.fpe_trap_invalid=0
+
+will return a file named ``kernels`` which contains an analysis of the CUDA kernels launched inside
+the ``Hydro()`` region, ignoring any kernels launched inside ``StencilA()`` and ``StencilC()``.
+When using the NVTX regions built into AMReX's TinyProfiler, be aware that the application must be built
+with ``TINY_PROFILE=TRUE`` and the NVTX region names are identical to the TinyProfiler timer names.
+
+Another helpful flag for selecting a reasonable subset of kernels for analysis is the ``-c`` option. This
+flag specifies the total number of kernels to be analyzed. For example:
+
+.. highlight:: console
+
+::
+
+    ncu --nvtx --nvtx-include "GravitySolve()" -c 10 -o kernels ${EXE} ${INPUTS} amrex.fpe_trap_invalid=0
+
+will only analyze the first ten kernels inside of the ``GravitySolve()`` NVTX region.
+
+For further details on how to choose a subset of CUDA kernels to analyze, or to run a more detailed
+analysis, including CUDA hardware counters, refer to the Nsight Compute official documentation on
+`NVTX Filtering <https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html>`_.
+
+
+Roofline
+--------
+
+As of version 2020.1.0, Nsight Compute has added the capability to perform roofline analyses on CUDA
+kernels to describe how well a given kernel is running on a given NVIDIA architecture. For details
+on the roofline capabilities in Nsight Compute, refer to the `NVIDIA Kernel Profiling Guide
+<https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html>`_.
+
+To run a roofline analysis on an AMReX application, run ``ncu`` with the flag
+``--section SpeedOfLight_RooflineChart``. Again, using appropriate NVTX flags to limit the scope of the
+analysis will be critical to achieve results within a reasonable time. For example:
+
+.. highlight:: console
+
+::
+
+    ncu --section SpeedOfLight_RooflineChart --nvtx --nvtx-include "MLMG()" -c 10 -o roofline ${EXE} ${INPUTS} amrex.fpe_trap_invalid=0
+
+will perform a roofline analysis of the first ten kernels inside of the region ``MLMG()``, and report
+their relative performance in the file ``roofline``, which can be read by the Nsight Compute GUI.
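+
+As with Nsight Systems, the report is usually copied to a local workstation for
+inspection. Assuming a local Nsight Compute installation, it can be opened with the
+GUI launcher (the file name is illustrative, and the on-disk report extension may
+vary between Nsight Compute versions):
+
+.. highlight:: console
+
+::
+
+    ncu-ui roofline.ncu-rep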
+
+For further information on the roofline model, refer to the scientific literature, `Wikipedia
+overview <https://en.wikipedia.org/wiki/Roofline_model>`_, NERSC
+`documentation `_ and
+`tutorials `_.
diff --git a/Docs/sphinx_documentation/source/External_Profiling_Tools_Chapter.rst b/Docs/sphinx_documentation/source/External_Profiling_Tools_Chapter.rst
index 3ec6c6c3666..dc62454e8f8 100644
--- a/Docs/sphinx_documentation/source/External_Profiling_Tools_Chapter.rst
+++ b/Docs/sphinx_documentation/source/External_Profiling_Tools_Chapter.rst
@@ -3,6 +3,10 @@
 External Profiling Tools
 ===========================
 
+AMReX is compatible with most commonly used profiling tools. This chapter provides
+selected documentation on using a few of these tools with AMReX. For additional
+details on running these tools, please refer to each tool's official documentation.
+
 .. toctree::
    :maxdepth: 1
 
diff --git a/Docs/sphinx_documentation/source/Fortran.rst b/Docs/sphinx_documentation/source/Fortran.rst
index 46286073e00..9631374e53b 100644
--- a/Docs/sphinx_documentation/source/Fortran.rst
+++ b/Docs/sphinx_documentation/source/Fortran.rst
@@ -276,7 +276,7 @@ example,
     ! mf1 is still the owner of the data.
     call amrex_multifab_destroy(mf1)
     ! mf2 no longer contains a valid pointer because mf1 has been destroyed.
-    call amrex_multifab_destroyed(mf2)  ! But we still need to destroy it.
+    call amrex_multifab_destroy(mf2)  ! But we still need to destroy it.
 
 If we need to transfer the ownership, :fortran:`amrex_multifab`,
 :fortran:`amrex_boxarray` and :fortran:`amrex_distromap` provide type-bound
@@ -290,7 +290,7 @@ If we need to transfer the ownership, :fortran:`amrex_multifab`,
     call amrex_multifab_build(mf1, ...)
     call mf2%move(mf1)  ! mf2 is now the data owner and mf1 is not.
     call amrex_multifab_destroy(mf1)
-    call amrex_multifab_destroyed(mf2)
+    call amrex_multifab_destroy(mf2)
 
 :fortran:`amrex_multifab` also has a type-bound :fortran:`swap` procedure for
 exchanging the data.
diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
index 9e507e42fdc..c26fade1e2b 100644
--- a/Docs/sphinx_documentation/source/GPU.rst
+++ b/Docs/sphinx_documentation/source/GPU.rst
@@ -182,44 +182,201 @@ can run it and that will generate results like:
 Building with CMake
 -------------------
 
-To build AMReX with GPU support in CMake, add ``-DENABLE_CUDA=YES`` to the
-``cmake`` invocation. By default, CMake will try to determine which GPU
-architecture is supported by the system. If more than one is found, CMake
-will build for all of them. This will generally results in a larger library and longer build times.
-If autodetection fails, a set of "common" architectures is assumed.
-You can specify the target architecture to build for via the configuration option
-``-DCUDA_ARCH=``, where ```` can be either
-the name of the NVIDIA GPU, i.e. ``Turing``, ``Volta``, ``Pascal``, ``...`` , or its
-version number, i.e. ``10.0``, ``9.0``, ``8.0``, ``...`` .
+
+Enabling CUDA support
+^^^^^^^^^^^^^^^^^^^^^
+
+To build AMReX with CUDA support in CMake, add ``-DAMReX_GPU_BACKEND=CUDA`` to the
+``cmake`` invocation. For a full list of CUDA-specific configuration options,
+check the :ref:`table <tab:cmakecudavar>` below.
+
+.. raw:: latex
+
+   \begin{center}
+
+.. _tab:cmakecudavar:
+.. table:: AMReX CUDA-specific build options
+
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | Variable Name                | Description                                     | Default     | Possible values |
+   +==============================+=================================================+=============+=================+
+   | AMReX_CUDA_ARCH              | CUDA target architecture                        | Auto        | User-defined    |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_FASTMATH          | Enable CUDA fastmath library                    | YES         | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_BACKTRACE         | Host function symbol names (e.g. cuda-memcheck) | Auto        | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_COMPILATION_TIMER | CSV table with time for each compilation phase  | NO          | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_DEBUG             | Device debug information (optimizations: off)   | NO          | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_ERROR_CAPTURE_THIS| Error if a CUDA lambda captures a class' this   | NO          | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_KEEP_FILES        | Keep intermediate files (folder: nvcc_tmp)      | NO          | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_LTO               | Enable CUDA link-time-optimization              | NO          | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_MAX_THREADS       | Max number of CUDA threads per block            | 256         | User-defined    |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_MAXREGCOUNT       | Limits the number of CUDA registers available   | 255         | User-defined    |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_PTX_VERBOSE       | Verbose code generation statistics in ptxas     | NO          | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_SHOW_CODELINES    | Source information in PTX (optimizations: on)   | Auto        | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_SHOW_LINENUMBERS  | Line-number information (optimizations: on)     | Auto        | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_CUDA_WARN_CAPTURE_THIS | Warn if a CUDA lambda captures a class' this    | YES         | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+.. raw:: latex
+
+   \end{center}
+
+
+The target architecture to build for can be specified via the configuration option
+``-DAMReX_CUDA_ARCH=<target-architecture>``, where ``<target-architecture>`` can be either
+the name of the NVIDIA GPU generation, i.e. ``Turing``, ``Volta``, ``Ampere``, ``...`` , or its
+`compute capability <https://developer.nvidia.com/cuda-gpus>`_, i.e.
+``8.0``, ``7.5``, ``7.0``, ``...`` .
 For example, on Cori GPUs you can specify the architecture as follows:
 
 .. highlight:: console
 
 ::
 
-    cmake [options] -DENABLE_CUDA=yes -DCUDA_ARCH=Volta /path/to/amrex/source
+    cmake [options] -DAMReX_GPU_BACKEND=CUDA -DAMReX_CUDA_ARCH=Volta /path/to/amrex/source
 
-Note that AMReX only supports GPU architectures with version number ``6.0`` or higher.
+If no architecture is specified, CMake will default to the architecture defined in the
+*environment variable* ``AMREX_CUDA_ARCH`` (note: all caps).
+If the latter is not defined, CMake will try to determine which GPU
+architecture is supported by the system. If more than one is found, CMake will build for all of them.
+This will generally result in a larger library and longer build times.
+If autodetection fails, a set of "common" architectures is assumed.
+**Note that AMReX supports NVIDIA GPU architectures with compute capability 6.0 or higher and
+CUDA Toolkit version 9.0 or higher**.
 
-In order to import CUDA-enabled AMReX into your CMake project, you need to include
+In order to import the CUDA-enabled AMReX library into your CMake project, you need to include
 the following code into the appropriate CMakeLists.txt file:
 
 .. highlight:: console
-
+
 ::
 
     # Find CUDA-enabled AMReX installation
     find_package(AMReX REQUIRED CUDA)
-
-    # Add custom CUDA flags
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ")
+
+If instead of using an external installation of AMReX you prefer to include AMReX as a subproject
+in your CMake setup, we strongly encourage you to use the ``AMReX_SetupCUDA`` module as shown below:
+
+.. highlight:: console
+
+::
+
+    # Enable CUDA in your CMake project
+    enable_language(CUDA)
+
+    # Include the AMReX-provided CUDA setup module
+    include(AMReX_SetupCUDA)
+
+    # Include AMReX source directory ONLY AFTER the two steps above
+    add_subdirectory(/path/to/amrex/source/dir)
+
 
-The snippet of code above will find a CUDA-enabled installation of AMReX and setup
-the CUDA support in the host project CMake via the AMReX-provided macro ``setup_cuda()``.
-The host project should **not call directly** ``enable_language(CUDA)``.
+To ensure consistency between CUDA-enabled AMReX and any CMake target that links against it,
+we provide the helper function ``setup_target_for_cuda_compilation()``:
+
+
+.. highlight:: console
+
+::
+
+    # Set all sources for my_target
+    target_sources(my_target PRIVATE source1 source2 source3 ...)
+
+    # Setup my_target to be compiled with CUDA and be linked against CUDA-enabled AMReX
+    # MUST be done AFTER all sources have been assigned to my_target
+    setup_target_for_cuda_compilation(my_target)
+
+    # Link against amrex
+    target_link_libraries(my_target AMReX::amrex)
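+
+Putting the pieces above together, a minimal ``CMakeLists.txt`` for a host project
+linking against an installed, CUDA-enabled AMReX might look as follows (a sketch;
+the project name, target name, and source file are placeholders):
+
+.. highlight:: console
+
+::
+
+    cmake_minimum_required(VERSION 3.14)
+    project(my_app CXX CUDA)
+
+    # Find a CUDA-enabled AMReX installation
+    find_package(AMReX REQUIRED CUDA)
+
+    add_executable(my_app)
+    target_sources(my_app PRIVATE main.cpp)
+
+    # Must come AFTER all sources have been assigned to the target
+    setup_target_for_cuda_compilation(my_app)
+
+    target_link_libraries(my_app AMReX::amrex)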
+
+
+Enabling HIP support (experimental)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To build AMReX with HIP support in CMake, add
+``-DAMReX_GPU_BACKEND=HIP -DAMReX_AMD_ARCH=<amd-gpu-architecture> -DCMAKE_CXX_COMPILER=<hip-compiler>``
+to the ``cmake`` invocation.
+
+In AMReX CMake, the HIP compiler is treated as a special C++ compiler and therefore
+the standard CMake variables used to customize the compilation process for C++,
+for example ``CMAKE_CXX_FLAGS``, can be used for HIP as well.
+
+
+Since CMake does not support autodetection of HIP compilers/target architectures
+yet, ``CMAKE_CXX_COMPILER`` must be set to a valid HIP compiler, i.e. ``hipcc`` or ``nvcc``,
+and ``AMReX_AMD_ARCH`` to the target architecture you are building for.
+Thus **AMReX_AMD_ARCH and CMAKE_CXX_COMPILER are required user-inputs when AMReX_GPU_BACKEND=HIP**.
+
+Below is an example configuration for HIP on Tulip:
+
+.. highlight:: console
+
+::
+
+    cmake -DAMReX_GPU_BACKEND=HIP -DCMAKE_CXX_COMPILER=$(which hipcc) -DAMReX_AMD_ARCH="gfx906,gfx908" [other options] /path/to/amrex/source
+
+
+Enabling SYCL support (experimental)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To build AMReX with SYCL support in CMake, add
+``-DAMReX_GPU_BACKEND=SYCL -DCMAKE_CXX_COMPILER=<sycl-compiler>``
+to the ``cmake`` invocation.
+For a full list of SYCL-specific configuration options,
+check the :ref:`table <tab:cmakesyclvar>` below.
+
+
+In AMReX CMake, the SYCL compiler is treated as a special C++ compiler and therefore
+the standard CMake variables used to customize the compilation process for C++,
+for example ``CMAKE_CXX_FLAGS``, can be used for SYCL as well.
+
+
+Since CMake does not support autodetection of SYCL compilers yet,
+``CMAKE_CXX_COMPILER`` must be set to a valid SYCL compiler, i.e. ``dpcpp``.
+Thus **CMAKE_CXX_COMPILER is a required user-input when AMReX_GPU_BACKEND=SYCL**.
+At this time, **the only supported SYCL compiler is dpcpp**.
+Below is an example configuration for SYCL:
+
+.. highlight:: console
+
+::
+
+    cmake -DAMReX_GPU_BACKEND=SYCL -DCMAKE_CXX_COMPILER=$(which dpcpp) [other options] /path/to/amrex/source
+
+
+.. raw:: latex
+
+   \begin{center}
+
+.. _tab:cmakesyclvar:
+
+.. table:: AMReX SYCL-specific build options
+
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | Variable Name                | Description                                     | Default     | Possible values |
+   +==============================+=================================================+=============+=================+
+   | AMReX_DPCPP_AOT              | Enable DPCPP ahead-of-time compilation          | NO          | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+   | AMReX_DPCPP_SPLIT_KERNEL     | Enable DPCPP kernel splitting                   | YES         | YES, NO         |
+   +------------------------------+-------------------------------------------------+-------------+-----------------+
+.. raw:: latex
+
+   \end{center}
@@ -288,17 +445,17 @@ specific type of GPU memory:
 
 .. table:: Memory Arenas
 
-   +---------------------+------------------+
-   | Arena               | Memory Type      |
-   +=====================+==================+
-   | The_Arena()         | unified memory   |
-   +---------------------+------------------+
-   | The_Device_Arena()  | device memory    |
-   +---------------------+------------------+
-   | The_Managed_Arena() | unified memory   |
-   +---------------------+------------------+
-   | The_Pinned_Arena()  | pinned memory    |
-   +---------------------+------------------+
+   +---------------------+----------------------------+
+   | Arena               | Memory Type                |
+   +=====================+============================+
+   | The_Arena()         | managed or device memory   |
+   +---------------------+----------------------------+
+   | The_Device_Arena()  | device memory              |
+   +---------------------+----------------------------+
+   | The_Managed_Arena() | managed memory             |
+   +---------------------+----------------------------+
+   | The_Pinned_Arena()  | pinned memory              |
+   +---------------------+----------------------------+
 
 .. raw:: latex
 
@@ -315,11 +472,13 @@ to two functions:
     void free (void* p);
 
 :cpp:`The_Arena()` is used for memory allocation of data in
-:cpp:`BaseFab`. Therefore the data in a :cpp:`MultiFab` is placed in
-unified memory and is accessible from both CPU host and GPU device.
+:cpp:`BaseFab`. By default, it allocates managed memory. This can be changed with
+a boolean runtime parameter ``amrex.the_arena_is_managed``.
+Therefore the data in a :cpp:`MultiFab` is placed in
+managed memory by default and is accessible from both CPU host and GPU device.
 This allows application codes to develop their GPU capability
 gradually. :cpp:`The_Managed_Arena()` is a separate pool of
-unified memory, that is distinguished from :cpp:`The_Arena()` for
+managed memory, that is distinguished from :cpp:`The_Arena()` for
 performance reasons. If you want to print out the current memory usage
 of the Arenas, you can call :cpp:`amrex::Arena::PrintUsage()`.
@@ -511,7 +670,7 @@ implementation is reproduced here:
 .. highlight:: c++
 
 ::
-
+
     Real MultiFab::Dot (const MultiFab& x, int xcomp, const MultiFab& y, int ycomp,
                         int numcomp, int nghost, bool local)
     {
@@ -864,9 +1023,9 @@ prepares the device launch based on a :cpp:`Box`, launches with an appropriate s
 GPU kernel and constructs a thread :cpp:`Box` that defines the work for each
 thread. On the CPU, the thread :cpp:`Box` is set equal to the total launch
 :cpp:`Box`, so tiling works as expected. On the GPU, the thread :cpp:`Box` usually
-contains a single cell to allow all GPU threads to be utilized effectively. 
+contains a single cell to allow all GPU threads to be utilized effectively.
 
-An example of a generic function launch is shown here: 
+An example of a generic function launch is shown here:
 
 .. highlight:: c++
 
@@ -1069,24 +1228,17 @@ as:
        and "b; }" as another.
 
        Real a;    <---- OK
-       Real b; 
+       Real b;
    });
 
 Users that choose to implement the macro launches should be aware of the
 limitations of C++ preprocessing macros to ensure GPU offloading is done
 properly.
 
-Finally, AMReX's expected OpenMP strategy for GPUs is to utilize OpenMP
-in CPU regions to maintain multi-threaded parallelism on work that cannot be
-offloaded efficiently, while using CUDA independently in GPU regions.
-This means OpenMP pragmas need to be maintained when ``USE_CUDA=FALSE``
-and turned off in locations CUDA is implemented when ``USE_CUDA=TRUE``.
-
-This can currently be implemented in preparation for an OpenMP strategy and
-users are highly encouraged to do so now. This prevents having to track
-down and label the appropriate OpenMP regions in the future and
-clearly labels for readers that OpenMP and GPUs are not being used at the
-same time. OpenMP pragmas can be turned off using the conditional pragma
-and :cpp:`Gpu::notInLaunchRegion()`, as shown below:
+Finally, AMReX's most common CPU threading strategy for GPU/CPU systems is to utilize
+OpenMP threads to maintain multi-threaded parallelism on work chosen to run on the host.
+This means OpenMP pragmas should be maintained where CPU work is performed and usually
+turned off where work is offloaded onto the GPU. OpenMP pragmas can be turned
+off using the conditional pragma and :cpp:`Gpu::notInLaunchRegion()`, as shown below:
 
 .. highlight:: c++
 
@@ -1096,11 +1248,13 @@ and :cpp:`Gpu::notInLaunchRegion()`, as shown below:
     #pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
 
-This should be added only to MFIter loops that contain GPU work.
+It is generally expected that launching GPU work from multiple OpenMP threads
+will show little improvement and may even perform worse. So, this conditional statement
+should be added to MFIter loops that contain GPU work, unless users specifically test
+the performance or are designing more complex workflows that require OpenMP.
 
 .. _sec:gpu:example:
 
-
 An Example of Migrating to GPU
 ==============================
 
@@ -1202,20 +1356,20 @@ To help debugging, we often use :cpp:`amrex::Assert` and
However, implementing these functions requires additional GPU registers, which will reduce overall performance. Therefore, it is preferred to implement such calls in debug mode only by wrapping the -calls using ``#ifdef AMREX_DEBUG``. +calls using ``#ifdef AMREX_DEBUG``. In CPU code, :cpp:`AMREX_GPU_ERROR_CHECK()` can be called to check the health of previous GPU launches. This call looks up the return message from the most recently completed GPU launch and aborts if it was not successful. Many kernel -launch macros as well as the :cpp:`MFIter` destructor include a call +launch macros as well as the :cpp:`MFIter` destructor include a call to :cpp:`AMREX_GPU_ERROR_CHECK()`. This prevents additional launches from being called if a previous launch caused an error and ensures all GPU launches within an :cpp:`MFIter` loop completed successfully before continuing work. -However, due to asynchronicity, determining the source of the error -can be difficult. Even if GPU kernels launched earlier in the code +However, due to asynchronicity, determining the source of the error +can be difficult. Even if GPU kernels launched earlier in the code result in a CUDA error, the error may not be output at a nearby call to :cpp:`AMREX_GPU_ERROR_CHECK()` by the CPU. When tracking down a CUDA launch error, :cpp:`Gpu::synchronize()` and @@ -1274,7 +1428,7 @@ to GPUs using Cuda, OpenACC, and OpenMP, please see :cpp:`Tutorials/Particles/El GPU-aware implementations of many common particle operations are provided with AMReX, including neighbor list construction and traversal, particle-mesh deposition and interpolation, parallel reductions of particle data, and a set of transformation and filtering operations that are useful when operating on sets of particles. For -examples of these features in use, please see :cpp:`Tests/Particles/`. +examples of these features in use, please see :cpp:`Tests/Particles/`. Finally, the parallel communication of particle data has been ported and optimized for performance on GPU platforms. This includes :cpp:`Redistribute()`, which moves particles back to the proper grids after their positions @@ -1350,6 +1504,32 @@ AMReX for GPUs: ... } +* Pay attention to what GPUs your job scheduler is assigning to each MPI + rank. In most cases you'll achieve the best performance when a single + MPI rank is assigned to each GPU, and has boxes large enough to saturate + that GPU's compute capacity. While there are some cases where multiple + MPI ranks per GPU can make sense (typically this would be when you have + some portion of your code that is not GPU accelerated and want to have + many MPI ranks to make that part faster), this is probably the minority + of cases. For example, on OLCF Summit you would want to ensure that your + resource sets contain one MPI rank and GPU each, using `jsrun -n N -a 1 -c 7 -g 1`, + where `N` is the total number of MPI ranks/GPUs you want to use. (See the OLCF + [job step viewer](https://jobstepviewer.olcf.ornl.gov/) for more information.) + + Conversely, if you choose to have multiple GPUs visible to each MPI rank, + AMReX will attempt to do the best job it can assigning MPI ranks to GPUs by + doing round robin assignment. This may be suboptimal because this assignment + scheme would not be aware of locality benefits that come from having an MPI + rank be on the same socket as the GPU it is managing. 
+  If you know the hardware
+  layout of the system you're running on, specifically the number of GPUs per
+  socket (``M``) and number of GPUs per node (``N``), you can set the preprocessor
+  defines ``-DAMREX_GPUS_PER_SOCKET=M`` and ``-DAMREX_GPUS_PER_NODE=N``, which are
+  exposed in the GNU Make system through the variables ``GPUS_PER_SOCKET`` and
+  ``GPUS_PER_NODE`` respectively (see an example in ``Tools/GNUMake/sites/Make.olcf``).
+  Then AMReX can ensure that each MPI rank selects a GPU on the same socket as
+  that rank (assuming your MPI implementation supports MPI 3).
+
+
 
 .. ===================================================================
 
 Inputs Parameters
@@ -1401,7 +1581,7 @@ Cuda-specific tests
 
 - Run under ``nvprof -o profile%p.nvvp ./main3d.xxxx`` for a small
   problem and examine page faults using nvvp
-  
+
 - Run under ``cuda-memcheck``
 
 - Run under ``cuda-gdb``
@@ -1409,8 +1589,8 @@ Cuda-specific tests
 
 - Run with ``CUDA_LAUNCH_BLOCKING=1``. This means that only one kernel
   will run at a time. This can help identify if there are race conditions.
-  
-  
+
+
 Limitations
 ===========
diff --git a/Docs/sphinx_documentation/source/GettingStarted.rst b/Docs/sphinx_documentation/source/GettingStarted.rst
index 0b4808849cd..534f798dc27 100644
--- a/Docs/sphinx_documentation/source/GettingStarted.rst
+++ b/Docs/sphinx_documentation/source/GettingStarted.rst
@@ -7,9 +7,8 @@ Downloading the Code
 
 The source code is available at https://github.com/AMReX-Codes/amrex. The
 GitHub repo is our central repo for development. The development branch
-includes the latest state of the code, and it is merged into the master branch
-on a monthly basis. The master branch is considered the release branch. The
-releases are tagged with version number YY.MM (e.g., 17.04). The MM part of the
+includes the latest state of the code, and it is tagged as a release
+on a monthly basis with version number YY.MM (e.g., 17.04). The MM part of the
 version is incremented every month, and the YY part every year. Bug fix
 releases are tagged with YY.MM.patch (e.g., 17.04.1).
diff --git a/Docs/sphinx_documentation/source/GridCreation.rst b/Docs/sphinx_documentation/source/GridCreation.rst
index 0b9a37f41ba..c718e7edacc 100644
--- a/Docs/sphinx_documentation/source/GridCreation.rst
+++ b/Docs/sphinx_documentation/source/GridCreation.rst
@@ -42,21 +42,21 @@ Note that :cpp:`n_cell` must be given as three separate integers, one for each c
 However, :cpp:`max_grid_size` and :cpp:`blocking_factor` can be specified as a single value
 applying to all coordinate directions, or as separate values for each direction.
 
- - if :cpp:`max_grid_size` (or :cpp:`blocking_factor`) is specified as multiple integers then the first
+ - If :cpp:`max_grid_size` (or :cpp:`blocking_factor`) is specified as multiple integers then the first
    integer applies to level 0, the second to level 1, etc. If you don't specify as many integers
   as there are levels, the final value will be used for the remaining levels.
 
- - if different values of :cpp:`max_grid_size` (or :cpp:`blocking_factor`) are wanted for each coordinate direction,
+ - If different values of :cpp:`max_grid_size` (or :cpp:`blocking_factor`) are wanted for each coordinate direction,
    then :cpp:`max_grid_size_x`, :cpp:`max_grid_size_y` and :cpp:`max_grid_size_z`
    (or :cpp:`blocking_factor_x`, :cpp:`blocking_factor_y` and :cpp:`blocking_factor_z`) must be used.
    If you don't specify as many integers as there are levels, the final value will be used for the remaining levels.
 
 Additional notes:
 
- - to create identical grids of a specific size, e.g. of length *m* in each direction,
+ - To create identical grids of a specific size, e.g. of length *m* in each direction,
    then set :cpp:`max_grid_size` = *m* and :cpp:`blocking_factor` = *m*.
 
- - note that :cpp:`max_grid_size` is just an upper bound; with :cpp:`n_cell = 48`
+ - Note that :cpp:`max_grid_size` is just an upper bound; with :cpp:`n_cell = 48`
    and :cpp:`max_grid_size = 32`, we will typically have one grid of length 32 and one of length 16.
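+
+For example, the following inputs (a sketch using the ``amr`` inputs prefix common to
+AmrCore-based applications; the prefix may differ by application) create level-0 grids
+no longer than 32 cells on a side, each a multiple of 16 cells:
+
+.. highlight:: console
+
+::
+
+    amr.n_cell          = 48 48 48   # number of cells at level 0
+    amr.max_grid_size   = 32         # upper bound on grid side length
+    amr.blocking_factor = 16         # each grid side divisible by 16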
 
 The grid creation process at level 0 proceeds as follows (if not using the KD-tree approach):
diff --git a/Docs/sphinx_documentation/source/IO.rst b/Docs/sphinx_documentation/source/IO.rst
index 4bd2b8c6741..8bce21ceb37 100644
--- a/Docs/sphinx_documentation/source/IO.rst
+++ b/Docs/sphinx_documentation/source/IO.rst
@@ -55,7 +55,7 @@ making such strings.
     const std::string& pfname2 = amrex::Concatenate("plt",istep,4);  // plt0258
 
     istep = 1234567;  // Having more than 5 digits is OK.
-    const std::string& pfname3 = amrex::Concatenate("plt",istep);    // plt12344567
+    const std::string& pfname3 = amrex::Concatenate("plt",istep);    // plt1234567
 
 The argument :cpp:`mf` above (:cpp:`MultiFab` for single level and
 :cpp:`Vector` for multi-level) is the data to be written
diff --git a/Docs/sphinx_documentation/source/LinearSolvers.rst b/Docs/sphinx_documentation/source/LinearSolvers.rst
index 9759b7f1904..d205c732cc0 100644
--- a/Docs/sphinx_documentation/source/LinearSolvers.rst
+++ b/Docs/sphinx_documentation/source/LinearSolvers.rst
@@ -81,7 +81,7 @@ one needs to call the member function
 The :cpp:`int amrlev` parameter should be zero for single-level
 solves. For multi-level solves, each level needs to be provided with
-``alpha`` and ``beta``, or ``Sigma``. For composite solves, :cpp:`amrlev` 0 will
+``alpha`` and ``beta``, or ``sigma``. For composite solves, :cpp:`amrlev` 0 will
 mean the lowest level for the solver, which is not necessarily the lowest
 level in the AMR hierarchy. This is so solves can be done on different
 sections of the AMR hierarchy, e.g. on AMR levels 3 to 5.
@@ -238,7 +238,7 @@ There are many parameters that can be set. Here we discuss some commonly
 used ones.
 
 :cpp:`MLLinOp::setVerbose(int)`, :cpp:`MLMG::setVerbose(int)` and
-:cpp:`MLMG:setBottomVerbose(int)` can be control the verbosity of the
+:cpp:`MLMG::setBottomVerbose(int)` control the verbosity of the
 linear operator, multigrid solver and the bottom solver, respectively.
 
 The multigrid solver is an iterative solver. The maximal number of
@@ -266,8 +266,9 @@ operators for the multigrid.
     // out = L(in)
     mlmg.apply(out, in);  // here both in and out are const Vector&
 
-At the bottom of the multigrid cycles, we use the biconjugate gradient
-stabilized method as the bottom solver. :cpp:`MLMG` member method
+At the bottom of the multigrid cycles, we use a ``bottom solver`` which may be
+different from the relaxation used at the other levels. The default bottom solver is the
+biconjugate gradient stabilized method, but it can easily be changed with the :cpp:`MLMG` member method
 
 .. highlight:: c++
 
@@ -275,7 +276,7 @@ stabilized method as the bottom solver. :cpp:`MLMG` member method
 
     void setBottomSolver (BottomSolver s);
 
-can be used to change the bottom solver. Available choices are
+Available choices are
 
 - :cpp:`MLMG::BottomSolver::bicgstab`: The default.
 
@@ -290,10 +291,32 @@ can be used to change the bottom solver. Available choices are
 - :cpp:`MLMG::BottomSolver::cgbicg`: Start with cg. Switch to bicgstab if cg fails.
   The matrix must be symmetric.
 
-- :cpp:`MLMG::BottomSolver::hypre`: BoomerAMG in hypre.
+- :cpp:`MLMG::BottomSolver::hypre`: One of the solvers available through hypre; see
+  the section below on External Solvers.
 
 - :cpp:`MLMG::BottomSolver::petsc`: Currently for cell-centered only.
 
+Boundary Stencils for Cell-Centered Solvers
+===========================================
+
+We have the option of using the :cpp:`MLMG` member method
+
+.. highlight:: c++
+
+::
+
+    void setMaxOrder (int maxorder);
+
+to set the order of the cell-centered linear operator stencil at physical boundaries
+with Dirichlet boundary conditions and at coarse-fine boundaries. In both of these
+cases, the boundary value is not defined at the center of the ghost cell.
+The order determines the number of interior cells that are used in the extrapolation
+of the boundary value from the cell face to the center of the ghost cell, where
+the extrapolated value is then used in the regular stencil. For example,
+:cpp:`maxorder = 2` uses the boundary value and the first interior value to extrapolate
+to the ghost cell center; :cpp:`maxorder = 3` uses the boundary value and the first two interior values.
+
+
 Curvilinear Coordinates
 =======================
@@ -382,23 +405,106 @@ as living at face centroids, modify the setBCoeffs command to be
 
 External Solvers
 ================
 
-AMReX can use the `hypre `_ algebraic multigrid solver, BoomerAMG,
-as a bottom solver for both cell-centered and node-based problems.
-For challenging problems, our geometric multigrid solver may have difficulty solving,
-whereas an algebraic multigrid method might be more robust.
-We note that by default our solver always tries to geometrically coarsen the
+AMReX provides interfaces to the `hypre <https://github.com/hypre-space/hypre>`_ preconditioners and solvers, including BoomerAMG, GMRES (all variants), PCG, and BICGStab as
+solvers, and BoomerAMG and Euclid as preconditioners. These can be called
+as bottom solvers for both cell-centered and node-based problems.
+
+By default the AMReX linear solver code always tries to geometrically coarsen the
 problem as much as possible. However, as we have mentioned, we
 can call :cpp:`setMaxCoarseningLevel(0)` on the :cpp:`LPInfo`
 object passed to the constructor of a linear operator to disable
 the coarsening completely. In that case the bottom solver is solving the
-residual correction form of the original problem.
+residual correction form of the original problem. To build hypre, follow these steps:
+
+.. highlight:: console
+
+::
+
+    1.- git clone https://github.com/hypre-space/hypre.git
+    2.- cd hypre/src
+    3.- ./configure
+        (if you want to build hypre with long long int, do ./configure --enable-bigint )
+    4.- make install
+    5.- Create an environment variable with the HYPRE directory --
+        HYPRE_DIR=/hypre_path/hypre/src/hypre
 
 To use hypre, one must include ``amrex/Src/Extern/HYPRE`` in the build system.
-For an example of using hypre, we refer the reader to
-``Tutorials/LinearSolvers/ABecLaplacian_C``.
+For examples of using hypre, we refer the reader to
+``Tutorials/LinearSolvers/ABecLaplacian_C`` or ``Tutorials/LinearSolvers/NodalProjection_EB``.
+
+Caveat: to use hypre for the nodal solver, you must either build with USE_EB = TRUE,
+or explicitly set the coarsening strategy in the calling routine to be ``RAP`` rather than ``Sigma``
+by adding
+
+.. highlight:: c++
+
+::
+
+    nodal_projector.getLinOp().setCoarseningStrategy(MLNodeLaplacian::CoarseningStrategy::RAP);
+
+where
+:cpp:`nodal_projector` is the :cpp:`NodalProjector` object we have built.
+
+The following parameter should be set to True if the problem to be solved has a singular matrix.
+In this case, the solution is only defined to within a constant. Setting this parameter to True
+replaces one row in the matrix sent to hypre from AMReX by a row that sets the value at one cell to 0.
+
+- :cpp:`hypre.adjust_singular_matrix`: Default is False.
+
+
+The following parameters can be set in the inputs file to control the choice of preconditioner and smoother:
+
+- :cpp:`hypre.hypre_solver`: Default is BoomerAMG.
+
+- :cpp:`hypre.hypre_preconditioner`: Default is none; otherwise the type must be specified.
+
+- :cpp:`hypre.recompute_preconditioner`: Default true. Option to recompute the preconditioner.
+
+- :cpp:`hypre.write_matrix_files`: Default false. Option to write out matrix into text files.
+
+- :cpp:`hypre.overwrite_existing_matrix_files`: Default false. Option to over-write existing matrix files.
+
+
+The following parameters can be set in the inputs file to control the BoomerAMG solver specifically:
+
+- :cpp:`hypre.bamg_verbose`: Verbosity of the BoomerAMG preconditioner. Default 0. See `HYPRE_BoomerAMGSetPrintLevel`
+
+- :cpp:`hypre.bamg_logging`: Default 0. See `HYPRE_BoomerAMGSetLogging`
+
+- :cpp:`hypre.bamg_coarsen_type`: Default 6. See `HYPRE_BoomerAMGSetCoarsenType`
+
+- :cpp:`hypre.bamg_cycle_type`: Default 1. See `HYPRE_BoomerAMGSetCycleType`
+
+- :cpp:`hypre.bamg_relax_type`: Default 6. See `HYPRE_BoomerAMGSetRelaxType`
+
+- :cpp:`hypre.bamg_relax_order`: Default 1. See `HYPRE_BoomerAMGSetRelaxOrder`
+
+- :cpp:`hypre.bamg_num_sweeps`: Default 2. See `HYPRE_BoomerAMGSetNumSweeps`
+
+- :cpp:`hypre.bamg_max_levels`: Default 20. See `HYPRE_BoomerAMGSetMaxLevels`
+
+- :cpp:`hypre.bamg_strong_threshold`: Default 0.25 for 2D, 0.57 for 3D. See `HYPRE_BoomerAMGSetStrongThreshold`
+
+- :cpp:`hypre.bamg_interp_type`: Default 0. See `HYPRE_BoomerAMGSetInterpType`
+
+The user is referred to the
+`hypre Reference Manual <https://hypre.readthedocs.io/>`_ for full details on the usage of the parameters described briefly above.
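+
+For instance, a run using GMRES preconditioned with BoomerAMG as the hypre bottom
+solver might add the following to its inputs file (a sketch; the accepted solver and
+preconditioner names depend on the hypre interface being used):
+
+.. highlight:: console
+
+::
+
+    hypre.hypre_solver         = GMRES
+    hypre.hypre_preconditioner = BoomerAMG
+    hypre.bamg_verbose         = 1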
 
 AMReX can also use `PETSc `_ as a bottom solver for cell-centered
-problems. To use PETSc, one must include ``amrex/Src/Extern/PETSc``
+problems. To build PETSc, follow these steps:
+
+.. highlight:: console
+
+::
+
+    1.- git clone https://github.com/petsc/petsc.git
+    2.- cd petsc
+    3.- ./configure --download-hypre=yes --prefix=build_dir
+    4.- Follow the steps given by PETSc
+    5.- Create an environment variable with the PETSC directory --
+        PETSC_DIR=/petsc_path/petsc/build_dir
+
+To use PETSc, one must include ``amrex/Src/Extern/PETSc``
 in the build system. For an example of using PETSc, we refer the reader to
 ``Tutorials/LinearSolvers/ABecLaplacian_C``.
@@ -508,7 +614,7 @@ the MACProjector object and use it to perform a MAC projection.
                                           LinOpBCType::Periodic)});
 
     macproj.setVerbose(mg_verbose);
-    macproj.setCGVerbose(cg_verbose);
+    macproj.setBottomVerbose(bottom_verbose);
 
     // Define the relative tolerance
     Real reltol = 1.e-8;
@@ -665,10 +771,10 @@ gradient term to make the vector field result satisfy the divergence
 constraint.
 
     // We can specify the maximum number of iterations
     nodal_solver.setMaxIter(mg_maxiter);
-    nodal_solver.setCGMaxIter(mg_cg_maxiter);
+    nodal_solver.setBottomMaxIter(mg_bottom_maxiter);
 
     nodal_solver.setVerbose(mg_verbose);
-    nodal_solver.setCGVerbose(mg_cg_verbose);
+    nodal_solver.setBottomVerbose(mg_bottom_verbose);
 
     // Set bottom-solver to use hypre instead of native BiCGStab
     // ( we could also have set this to cg, bicgcg, cgbicg)
diff --git a/Docs/sphinx_documentation/source/LoadBalancing.rst b/Docs/sphinx_documentation/source/LoadBalancing.rst
index 85c6a17cec8..fa384ea1659 100644
--- a/Docs/sphinx_documentation/source/LoadBalancing.rst
+++ b/Docs/sphinx_documentation/source/LoadBalancing.rst
@@ -16,16 +16,41 @@ grid creation process is governed by trying to balance the work in each grid.)
 
 Single-level load balancing algorithms are sequentially applied to each AMR level independently,
 and the resulting distributions are mapped onto the ranks taking into account the weights
-already assigned to them (assign heaviest set of grids to the least loaded rank)
+already assigned to them (assign heaviest set of grids to the least loaded rank). Note that the
+load of each process is measured by how much memory has already been allocated, not how much memory
+will be allocated. Therefore the following code is not recommended because it tends to generate
+non-optimal distributions.
 
-Options supported by AMReX include the following; the default is SFC:
+.. highlight:: c++
+
+::
+
+    for (int lev = 0; lev < nlevels; ++lev) {
+        // build DistributionMapping for Level lev
+    }
+    for (int lev = 0; lev < nlevels; ++lev) {
+        // build MultiFabs for Level lev
+    }
+
+Instead, one should do:
+
+.. highlight:: c++
+
+::
+
+    for (int lev = 0; lev < nlevels; ++lev) {
+        // build DistributionMapping for Level lev
+        // build MultiFabs for Level lev
+    }
+
+Distribution options supported by AMReX include the following; the default is SFC:
 
 - Knapsack: the default weight of a grid in the knapsack algorithm is the number of grid cells,
   but AMReX supports the option to pass an array of weights -- one per grid -- or alternatively
-  to pass in a MultiFab of weights per cell which is used to compute the weight per grid
+  to pass in a MultiFab of weights per cell which is used to compute the weight per grid.
 
 - SFC: enumerate grids with a space-filling Z-morton curve, then partition the
-  resulting ordering across ranks in a way that balances the load
+  resulting ordering across ranks in a way that balances the load.
 
 - Round-robin: sort grids and assign them to ranks in round-robin fashion -- specifically
   FAB i is owned by CPU i%N where N is the total number of MPI ranks.
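+
+The distribution strategy can also be selected at run time via the inputs file
+(a sketch using the standard ``DistributionMapping`` inputs names):
+
+.. highlight:: console
+
+::
+
+    DistributionMapping.strategy = KNAPSACK   # or SFC, ROUNDROBIN
+    DistributionMapping.verbose  = 1          # print distribution statistics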
diff --git a/Docs/sphinx_documentation/source/Particle.rst b/Docs/sphinx_documentation/source/Particle.rst
index 25c82cf223d..4d9746242b2 100644
--- a/Docs/sphinx_documentation/source/Particle.rst
+++ b/Docs/sphinx_documentation/source/Particle.rst
@@ -371,8 +371,8 @@ skipped. You can also access the SoA data using the :math:`ParIter` as follows:
     using MyParIter = ParIter<0, 0, 2, 2>;
     for (MyParIter pti(pc, lev); pti.isValid(); ++pti) {
         auto& particle_attributes = pti.GetStructOfArrays();
-        Vector<Real>& real_comp0 = particle_attributes.GetRealData(0);
-        Vector<int>&  int_comp1  = particle_attributes.GetIntData(1);
+        RealVector& real_comp0 = particle_attributes.GetRealData(0);
+        IntVector&  int_comp1  = particle_attributes.GetIntData(1);
         for (int i = 0; i < pti.numParticles; ++i) {
             // do stuff with your SoA data...
         }
@@ -434,7 +434,7 @@ both these sorts of operations.
 
     Ey.FillBoundary(gm.periodicity());
     Ez.FillBoundary(gm.periodicity());
     for (MyParIter pti(MyPC, lev); pti.isValid(); ++pti) {
-        const Box& box = Ex[pti].validBox();
+        const Box& box = pti.validbox();
 
         const auto& particles = pti.GetArrayOfStructs();
         int nstride = particles.dataShape().first;
@@ -474,7 +474,7 @@ is quite similar:
 
     rho.setVal(0.0, ng);
     for (MyParIter pti(*this, lev); pti.isValid(); ++pti) {
-        const Box& box = rho[pti].validbox();
+        const Box& box = pti.validbox();
 
         const auto& particles = pti.GetArrayOfStructs();
         int nstride = particles.dataShape().first;
diff --git a/Docs/sphinx_documentation/source/SUNDIALS.rst b/Docs/sphinx_documentation/source/SUNDIALS.rst
new file mode 100644
index 00000000000..420002ec178
--- /dev/null
+++ b/Docs/sphinx_documentation/source/SUNDIALS.rst
@@ -0,0 +1,78 @@
+.. role:: cpp(code)
+   :language: c++
+
+.. role:: fortran(code)
+   :language: fortran
+
+
+Compiling AMReX with SUNDIALS 5
+===============================
+
+The following steps describe how to compile an AMReX application with
+SUNDIALS 5 support.
+
+In order to use SUNDIALS:
+
+#. AMReX suggests using the Github mirror:
+   https://github.com/LLNL/sundials
+
+   ::
+
+      #!/bin/bash
+      set -e
+      git clone https://github.com/LLNL/sundials
+      cd sundials
+      mkdir builddir instdir
+      INSTALL_PREFIX=$(pwd)/instdir
+      cd builddir
+      cmake \
+          -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+          -DCMAKE_INSTALL_LIBDIR=lib \
+          -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+          -DCMAKE_C_COMPILER=$(which gcc) \
+          -DCMAKE_CXX_COMPILER=$(which g++) \
+          -DCMAKE_CUDA_HOST_COMPILER=$(which g++) \
+          -DEXAMPLES_INSTALL_PATH=${INSTALL_PREFIX}/examples \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_C_FLAGS_RELEASE="-O3 -DNDEBUG" \
+          -DCMAKE_CXX_FLAGS_RELEASE="-O3 -DNDEBUG" \
+          -DCUDA_ENABLE=ON \
+          -DMPI_ENABLE=OFF \
+          -DOPENMP_ENABLE=ON \
+          -DF2003_INTERFACE_ENABLE=ON \
+          -DCUDA_ARCH=sm_70 ../
+      make -j8
+      make install -j8
+
+#. Note that ``CMAKE_C_COMPILER`` and ``CMAKE_CXX_COMPILER`` need to be consistent with the AMReX
+   make variable ``COMP`` to ensure matching OMP runtime libraries for use with the OpenMP NVector.
+
+#. ``CUDA_ARCH`` must be set to the appropriate value for the GPU being targeted.
+
+#. For more detailed instructions for installing SUNDIALS with different flags and versions see
+   the `SUNDIALS documentation <https://computing.llnl.gov/projects/sundials>`_.
+
+#. In the ``GNUmakefile`` for the application which uses the interface to SUNDIALS, add
+   ``USE_SUNDIALS = TRUE`` and ``SUNDIALS_ROOT=${INSTALL_PREFIX}``. Note that one must define the
+   ``SUNDIALS_LIB_DIR`` make variable to point to the location where the libraries are installed
+   if they are not installed in the default location which is ``${INSTALL_PREFIX}/lib64``.
+
+#. If the application uses the SUNDIALS CVODE time integrator package, then the variable
+   ``USE_CVODE_LIBS = TRUE`` should also be added in the ``GNUmakefile`` for the application.
+   If the application uses the SUNDIALS ARKode time integrator package, then the variable
+   ``USE_ARKODE_LIBS = TRUE`` should be added.
+
+#. Fortran 2003 interfaces for the pgi compilers are currently not supported.
+
+
+Note that SUNDIALS can also be installed via Spack:
+
+   ::
+
+      spack install sundials+cuda+f2003+openmp
+
+
+SUNDIALS 5 Tutorials
+--------------------------
+
+AMReX provides SUNDIALS tutorials in the ``amrex/Tutorials/SUNDIALS`` directory.
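+
+For reference, a minimal ``GNUmakefile`` fragment for one of these tutorials, using the
+CVODE interface, might contain (a sketch; the install path is a placeholder):
+
+   ::
+
+      USE_SUNDIALS     = TRUE
+      USE_CVODE_LIBS   = TRUE
+      SUNDIALS_ROOT    = /path/to/sundials/instdir
+      # Needed only if the libraries are not in the default ${SUNDIALS_ROOT}/lib64
+      SUNDIALS_LIB_DIR = ${SUNDIALS_ROOT}/lib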
diff --git a/Docs/sphinx_documentation/source/SUNDIALS3.rst b/Docs/sphinx_documentation/source/SUNDIALS3.rst deleted file mode 100644 index 073e6bc3427..00000000000 --- a/Docs/sphinx_documentation/source/SUNDIALS3.rst +++ /dev/null @@ -1,98 +0,0 @@ -.. role:: cpp(code) - :language: c++ - -.. role:: fortran(code) - :language: fortran - - -Compiling AMReX with Sundials version 3.X or later ---------------------------------------------------- - -The following steps describe how to compile an AMReX application with -SUNDIALS_3.X support. On Cray systems (e.g., Cori or Edison at NERSC), Cray provides -a system module called ``cray-tpsl`` ("Cray Third-Party Scientific Libraries") -which as of this writing contains the 2.7 version of the SUNDIALS solver suite (including -CVODE). - -In order to use the Sundials 3.X version: - -#. Obtain the CVODE source code, which is hosted here: - https://computation.llnl.gov/projects/sundials/sundials-software. - One can download either the complete SUNDIALS package, or just the CVODE components. - -#. Unpack the CVODE / SUNDIALS tarball, and create a new "build" directory (it - can be anywhere). - -#. Navigate to the new, empty build directory, and type - - :: - - cmake \ - -DCMAKE_INSTALL_PREFIX:PATH=/path/to/install/dir \ - /path/to/cvode/or/sundials/top/level/source/dir - - - The ``CMAKE_INSTALL_DIR`` option tells CMake where to install the libraries. - Note that CMake will attempt to deduce the compilers automatically, but - respects certain environment variables if they are defined, such as ``CC`` - (for the C compiler), ``CXX`` (for the C++ compiler), and ``FC`` (for the - Fortran compiler). So one may modify the above CMake invocation to be - something like the following: - - :: - - CC=/path/to/gcc \ - CXX=/path/to/g++ \ - FC=/path/to/gfortran \ - cmake \ - -DCMAKE_INSTALL_PREFIX:PATH=/path/to/install/dir \ - /path/to/cvode/or/sundials/top/level/source/dir - - - One can supply additional flags to CMake or to the compiler to customize the - compilation process. Flags of interest may include ``CMAKE_C_FLAGS``, which - add the specified flags to the compile statement, e.g., - ``-DCMAKE_C_FLAGS="-h list=a"`` will append the ``-h list=a`` flag to the - ``cc`` statement when compiling the source code. Here one may wish to add - something like ``"-O2 -g"`` to provide an optimized library that still - contains debugging symbols; if one neglects debugging symbols in the CVODE - library, and if a code that uses CVODE encounters a segmentation fault in - the solve, then the backtrace has no information about where in the solver - the error occurred. Also, if one wishes to compile only the solver library - itself and not the examples that come with the source (compiling the - examples is enabled by default), one can add ``"-DEXAMPLES_ENABLE=OFF"``. - Users should be aware that the CVODE examples are linked dynamically, so - when compiling the solver library on Cray system using the Cray compiler - wrappers ``cc``, ``CC``, and ``ftn``, one should explicitly disable - compiling the examples via the ``"-DEXAMPLES_ENABLE=OFF"`` flag. - -#. In the ``GNUmakefile`` for the application which uses the Fortran 2003 - interface to CVODE or ARKODE, add ``SUNDIALS_3x4x = TRUE``, which will compile the Fortran 2003 - interfaces and link the libraries. Note that one must define the - ``CVODE_LIB_DIR`` environment variable to point to the location where the - libraries are installed. - -#. 
In the ``GNUmakefile`` for the application which uses the Fortran 2003
-   interface to ARKODE, also add ``USE_ARKODE_LIBS = TRUE``. It is assumed that the
-   ``CVODE_LIB_DIR`` environment variable points to the location where the ARKODE
-   libraries are installed as well.
-
-#. Fortran 2003 interfaces for the pgi compilers and for developmental versions of SUNDIALS
-   are currently not supported.
-
-SUNDIALS 3.X Tutorials
--------------------------
-
-AMReX provides six tutorials in the ``amrex/Tutorials/CVODE/SUNDIALS3_finterface`` directory.
-``EX1`` is modeled after the CVODE Tutorial ``EX1`` showing use with AMReX.
-The four ``EX_cv_*`` tutorials are based on examples provided with the interface, which
-are more closely modeled after CVODE examples. The ``EX_ark_analytic_fp`` tutorial is based
-on the ``EX_cv_analytic_fp`` tutorial, but uses ARKODE instead of CVODE.
-
-AMReX provides three tutorials in the ``amrex/Tutorials/CVODE/SUNDIALS3_cppversion`` directory.
-These are versions of ``EX1`` which operate on a packed version of the data. ``EX1_SERIAL_NVEC``
-packs a box worth of equations into a serial NVector, uses CVODE to solve, and then unpacks
-the solution back into the box it came from. ``EX1_CUDA_NVEC`` uses the cuda NVector implementation instead.
-``EX1_GPU_PRAGMA`` uses the cuda NVector, and the gpu pragma functionality.
-
-.. _SUNDIALS3:
diff --git a/Docs/sphinx_documentation/source/CVODE.rst b/Docs/sphinx_documentation/source/SUNDIALS_CVODE.rst
similarity index 95%
rename from Docs/sphinx_documentation/source/CVODE.rst
rename to Docs/sphinx_documentation/source/SUNDIALS_CVODE.rst
index 3f90028401e..eb38fe608b1 100644
--- a/Docs/sphinx_documentation/source/CVODE.rst
+++ b/Docs/sphinx_documentation/source/SUNDIALS_CVODE.rst
@@ -5,8 +5,8 @@
    :language: fortran
 
 
-Compiling AMReX with CVODE (Cray or Sundials version 2.7)
------------------------------------------------------------
+Compiling AMReX with CVODE 2.7
+==============================
 
 The following steps describe how to compile an AMReX application with
 CVODE support. On Cray systems (e.g., Cori or Edison at NERSC), Cray provides
@@ -73,8 +73,9 @@ On systems which are not Cray:
    ``CVODE_LIB_DIR`` environment variable to point to the location where the
    libraries are installed.
 
-CVODE Tutorials
-------------------
+
+CVODE 2.7 Tutorials
+-------------------
 
 AMReX provides two CVODE tutorials in the ``amrex/Tutorials/CVODE`` directory,
 called ``EX1`` and ``EX2``. See the Tutorials CVODE_ documentation for more detail.
diff --git a/Docs/sphinx_documentation/source/SUNDIALS_top.rst b/Docs/sphinx_documentation/source/SUNDIALS_top.rst
new file mode 100644
index 00000000000..9eaa3b596d3
--- /dev/null
+++ b/Docs/sphinx_documentation/source/SUNDIALS_top.rst
@@ -0,0 +1,36 @@
+.. _Chap:SUNDIALS:
+
+SUNDIALS
+========
+
+AMReX supports local ODE integration using the ARKode [1]_ and CVODE [2]_
+time integrators which are part of the SUNDIALS framework [3]_. ARKode
+and CVODE contain solvers for stiff and non-stiff ODEs, and as such they
+are well suited for solving, e.g., the complex chemistry networks in combustion
+simulations, or the nuclear reaction networks in astrophysical simulations.
+
+Most of SUNDIALS is written in C, but it is distributed with Fortran
+interfaces that use the ``iso_c_binding`` feature of the Fortran 2003 standard.
+AMReX supports these Fortran 2003 interfaces and they are used in the AMReX
+SUNDIALS 5 tutorials.
+
+AMReX currently supports SUNDIALS version 5 and, for CVODE only, a legacy
+interface to SUNDIALS 2.7, the version provided by the ``cray-tpsl``
+system module on Cray systems.
+
+
+.. toctree::
+   :maxdepth: 2
+
+   SUNDIALS
+   SUNDIALS_CVODE
+
+.. [1]
+   https://computation.llnl.gov/projects/sundials/arkode
+
+.. [2]
+   https://computation.llnl.gov/projects/sundials/cvode
+
+.. [3]
+   https://computation.llnl.gov/projects/sundials
+
diff --git a/Docs/sphinx_documentation/source/Testing.rst b/Docs/sphinx_documentation/source/Testing.rst
index e2994931010..58ee2420a7d 100644
--- a/Docs/sphinx_documentation/source/Testing.rst
+++ b/Docs/sphinx_documentation/source/Testing.rst
@@ -119,6 +119,6 @@ do is start from an existing test and modify it. For example, this entry:
 defines a test called :cpp:`MLMG_FI_PoisCom` by specifying the apppropriate
 build directory, inputs file, and a set of configuration options.
 The above options are the most commonly changed; for a full list
-of options, see the example configuration file at https://github.com/AMReX-Codes/regression_testing/blob/master/example-tests.ini.
+of options, see the example configuration file at https://github.com/AMReX-Codes/regression_testing/blob/main/example-tests.ini.

diff --git a/Docs/sphinx_documentation/source/Visualization.rst b/Docs/sphinx_documentation/source/Visualization.rst
index c87d4a8e848..56c9982d8e3 100644
--- a/Docs/sphinx_documentation/source/Visualization.rst
+++ b/Docs/sphinx_documentation/source/Visualization.rst
@@ -755,7 +755,7 @@ and point to the CMake configuration installed with SENSEI.

 .. code-block:: bash

-    cmake -DENABLE_SENSEI=ON -DSENSEI_DIR=/lib/cmake ..
+    cmake -DAMReX_SENSEI=ON -DSENSEI_DIR=/lib/cmake ..

 When CMake generates the make files proceed as usual.
diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst
index a7247a8ab23..3ec3e895474 100644
--- a/Docs/sphinx_documentation/source/index.rst
+++ b/Docs/sphinx_documentation/source/index.rst
@@ -12,8 +12,8 @@ AMReX is developed at LBNL, NREL, and ANL as part of the Block-Structured AMR
 Co-Design Center in DOE's Exascale Computing Project.

 All of AMReX's development is done in the github repository under the
-development branch; anyone can see the latest updates. Changes are merged into
-the master branch at the beginning of each month.
+development branch; anyone can see the latest updates. A release is tagged at
+the beginning of each month.

 We are always happy to have users contribute to the AMReX source code. To
 contribute, issue a pull request against the development branch (details `here
@@ -42,7 +42,6 @@ Documentation on migration from BoxLib is available in the AMReX repository at D
    ManagingGridHierarchy_Chapter
    AmrCore_Chapter
    AmrLevel_Chapter
-   AsyncIter_Chapter
    ForkJoin
    IO_Chapter
    LinearSolvers_Chapter
diff --git a/Docs/sphinx_tutorials/source/AMR_Tutorial.rst b/Docs/sphinx_tutorials/source/AMR_Tutorial.rst
index aa50376ca25..dfdecaec29b 100644
--- a/Docs/sphinx_tutorials/source/AMR_Tutorial.rst
+++ b/Docs/sphinx_tutorials/source/AMR_Tutorial.rst
@@ -17,8 +17,11 @@ or with VisIt.

 Advection_AmrCore: This tutorial contains an AMR advection code that advects a
 single scalar field with a velocity field that is specified on faces.
+It is written entirely in C++, can be built in 2D or 3D, and run with the same
+inputs file.

-It is an AMReX based code designed to run in parallel using MPI/OMP.
+It is an AMReX-based code designed to run in parallel using MPI+X, where X +may be OMP for multicore machines and CUDA for hybrid CPU/GPU architectures. This example uses source code from the amrex/Src/Base, Boundary, and AmrCore directories. @@ -26,7 +29,7 @@ directories. Notably, this example does not use source code from amrex/Src/Amr (see the tutorial Advection_AmrLevel). -The directory Exec/SingleVortex includes a makefile and a sample inputs file. +The directory Exec includes a makefile and a sample inputs file. **Advection_AmrLevel** ---------------------- diff --git a/Docs/sphinx_tutorials/source/CVODE_Tutorial.rst b/Docs/sphinx_tutorials/source/CVODE_Tutorial.rst index 0599d937e35..4d511f8a1bc 100644 --- a/Docs/sphinx_tutorials/source/CVODE_Tutorial.rst +++ b/Docs/sphinx_tutorials/source/CVODE_Tutorial.rst @@ -8,15 +8,15 @@ Tutorials/CVODE ========================== There are two CVODE tutorials in the ``amrex/Tutorials/CVODE`` directory, called -``EX1`` and ``EX2``. ``EX1`` consists of a single ODE that is integrated with +``EX1_F`` and ``EX2_F``. ``EX1_F`` consists of a single ODE that is integrated with CVODE within each cell of a 3-D grid. It demonstrates how to initialize the CVODE solver, how to call the ODE right-hand-side (RHS), and, more importantly, how to *re-*\ initialize the solver between cells, which avoids allocating and freeing solver memory between each cell (see the call to ``FCVReInit()`` in the -``integrate_ode.f90`` file in the ``EX1`` directory.) +``integrate_ode.f90`` file in the ``EX1_F`` directory.) -The ``EX2`` example demonstrates the slightly more complicated case of -integrating a system of coupled ODEs within each cell. Similarly to ``EX1``, +The ``EX2_F`` example demonstrates the slightly more complicated case of +integrating a system of coupled ODEs within each cell. Similarly to ``EX1_F``, it provides an RHS and some solver initialization. However, it also demonstrates the performance effect of providing an analytic Jacobian matrix for the system of ODEs, rather than requiring the solver to compute the diff --git a/Docs/sphinx_tutorials/source/SENSEI_Tutorial.rst b/Docs/sphinx_tutorials/source/SENSEI_Tutorial.rst deleted file mode 100644 index e74ce4c87d1..00000000000 --- a/Docs/sphinx_tutorials/source/SENSEI_Tutorial.rst +++ /dev/null @@ -1,215 +0,0 @@ -.. role:: cpp(code) - :language: c++ - -.. role:: fortran(code) - :language: fortran - -Tutorials/SENSEI -========================== - -SENSEI is a middleware that allows one to send data to various visualization and -analysis back ends through a uniform interface. It's data model and API enable -one to chose the desired visualization and analysis back end for a given task -with out limitting ones options, as the back ends can be inter-changed at run -time via a text based config file. - -Configuring the environment at NERSC ------------------------------------- - -First select the desired SENSEI install. Each install will support different set of -backends. This is necessary because not all of the back ends are compatible with -each other. - -For instance to use SENSEI with ParaView Catalyst: - -.. highlight:: shell - -:: - - module load sensei/2.1.0-catalyst - - -To use SENSEI with VisIt Libsim: - -.. highlight:: shell - -:: - - - module load sensei/2.1.0-libsim - - -SENSEI features in AMReX are conditionally compiled when the Make file variable -``USE_SENSEI_INSITU`` is set. 
When this variable is set, the Make file will query -environment variables to determine the list of include directories and link -libraries needed to compile with SENSEI. - -The ``sensei_config`` tool that is installed with SENSEI will set the environment -variables that are used in the make files. - -With a SENSEI module loaded, in a bash shell: - -.. highlight:: shell - -:: - - - source sensei_config - - -There are two SENSEI tutorials included with AMReX, for for use with ``AmrCore``, and one -for ``AmrLevel``. - - -Compiling and Running the ``Advection_AmrCore`` tutorial --------------------------------------------------------- - -This example uses source code from the amrex/Src/Base, Boundary, and AmrCore directories. -Notably, this example does not use source code from amrex/Src/Amr -(see the tutorial Advection_AmrLevel). - -The directory Exec/SingleVortex in Tutorials/SENSEI/Advection_AmrCore -includes a makefile and a sample inputs file. -Plotfiles are generated that can be viewed with amrvis2d / amrvis3d -(CCSE's native vis / spreadsheet tool, downloadable separately from ccse.lbl.gov) -or with VisIt. - - -Edit the file ``Exec/SingleVortex/GNUmakefile``, set - -.. highlight:: shell - -:: - - - USE_SENSEI_INSITU = TRUE - - -Build the tutorial - -.. highlight:: shell - -:: - - - make -j4 - - -To use SENSEI in AMReX one needs to enable it via ParmParse input file. -Additionally one needs to provide a SENSEI XML configuration that selects -and configures the desired SENSEI backend. - -Example XML configs are included in ``Exec/SingleVortex/SENSEI``. - -Edit the file ``Exec/SingleVortex/inputs`` - -Running with ParaView Catalyst: - -.. highlight:: shell - -:: - - - sensei.enabled = 1 # turn SENSEI in situ on/off - sensei.config = SENSEI/render_catalyst.xml # render simulation data with ParaView Catalyst - sensei.frequency = 1 # number of level 0 steps between in situ processing - - -Running with VisIt Libsim: - -.. highlight:: shell - -:: - - - sensei.enabled = 1 # turn SENSEI in situ on/off - sensei.config = SENSEI/render_libsim.xml # render simulation data with VisIt Libsim - sensei.frequency = 1 # number of level 0 steps between in situ processing - - -Once the inputs files has been edited, run the execcutable as usual - -.. highlight:: shell - -:: - - - mpiexec -np 4 ./main2d.gnu.MPI.ex inputs - - - -Compiling and Running the ``Advection_AmrLevel`` tutorial ---------------------------------------------------------- - -This example uses source code from the amrex/Src/Base, Boundary, Amrlevel, and -Amr directories. - -The directories Exec/SingleVortex and Exec/UniformVelocity in Tutorials/SENSEI/Advection_AmrLevel -each include a makefile and a sample inputs file. -Plotfiles are generated that can be viewed with amrvis2d / amrvis3d -(CCSE's native vis / spreadsheet tool, downloadable separately from ccse.lbl.gov) -or with VisIt. - -Edit the file ``Exec/SingleVortex/GNUmakefile``, set - -.. highlight:: shell - -:: - - - USE_SENSEI_INSITU = TRUE - - -Finally, make the tutorial - -.. highlight:: shell - -:: - - - make -j4 - - -## Running ## -To use SENSEI in AMReX one needs to enable it via ParmParse input file. -Additionally one needs to provide a SENSEI XML configuration that selects -and configures the desired SENSEI backend. - -Example XML configs are included in ``Exec/SingleVortex/SENSEI``. - -Edit the file ``Exec/SingleVortex/inputs`` - - -Running with ParaView Catalyst: - -.. 
highlight:: shell
-
-::
-
-
-    sensei.enabled = 1                          # turn SENSEI in situ on/off
-    sensei.config = SENSEI/render_catalyst.xml  # render simulation data with ParaView Catalyst
-    sensei.frequency = 1                        # number of level 0 steps between in situ processing
-
-
-Running with VisIt Libsim:
-
-.. highlight:: shell
-
-::
-
-
-    sensei.enabled = 1                        # turn SENSEI in situ on/off
-    sensei.config = SENSEI/render_libsim.xml  # render simulation data with VisIt Libsim
-    sensei.frequency = 1                      # number of level 0 steps between in situ processing
-
-
-Once the inputs files has been edited, run the execcutable as usual
-
-.. highlight:: shell
-
-::
-
-
-    mpiexec -np 4 ./main2d.gnu.MPI.ex inputs
-
diff --git a/Docs/sphinx_tutorials/source/SUNDIALS_Tutorial.rst b/Docs/sphinx_tutorials/source/SUNDIALS_Tutorial.rst
new file mode 100644
index 00000000000..f97e376801a
--- /dev/null
+++ b/Docs/sphinx_tutorials/source/SUNDIALS_Tutorial.rst
@@ -0,0 +1,21 @@
+.. role:: cpp(code)
+   :language: c++
+
+.. role:: fortran(code)
+   :language: fortran
+
+Tutorials/SUNDIALS
+==========================
+
+AMReX provides five tutorials in the ``amrex/Tutorials/SUNDIALS`` directory.
+There are three versions of ``EX1`` which parallelize differently. ``EX1_C``
+packs a box's worth of equations into a serial NVector, uses CVODE to solve, and then unpacks
+the solution back into the box it came from. ``EX1_CUDA`` uses the CUDA NVector implementation
+instead. ``EX1_F`` parallelizes over the cells individually. ``EX2_F`` is based on
+the ``fcvRoberts_dns.f`` example code in CVODE. ``EX-CUSOLVER`` uses a Castro-style driver and
+tests different ODE solver configurations.
+
+See the SUNDIALS_ section of the AMReX documentation for general instructions
+on how to include SUNDIALS in an AMReX application.
+
+.. _SUNDIALS: https://amrex-codes.github.io/amrex/docs_html/SUNDIALS.html
diff --git a/Docs/sphinx_tutorials/source/index.rst b/Docs/sphinx_tutorials/source/index.rst
index a9acb82c044..7368b45b5ef 100644
--- a/Docs/sphinx_tutorials/source/index.rst
+++ b/Docs/sphinx_tutorials/source/index.rst
@@ -35,7 +35,6 @@ The amrex/Tutorials directory is broken into the following categories:
    MUI_Tutorial
    Particles_Tutorial
    SDC_Tutorial
-   SENSEI_Tutorial
    SWFFT_Tutorial

 Indices and tables
diff --git a/Src/Amr/AMReX_Amr.H b/Src/Amr/AMReX_Amr.H
index 6a7432c3ec7..90e54a53fd3 100644
--- a/Src/Amr/AMReX_Amr.H
+++ b/Src/Amr/AMReX_Amr.H
@@ -15,19 +15,11 @@
 #include

-#ifdef USE_PERILLA
-#include
-#include
-#endif
-
 namespace amrex {

 class AmrLevel;
 class LevelBld;
 class BoxDomain;
-template
-class MFGraph;
-class AmrTask;

 #if defined(BL_USE_SENSEI_INSITU)
 class AmrInSituBridge;
 #endif
@@ -43,9 +35,6 @@ class AmrInSituBridge;
 class Amr
     : public AmrCore
 {
-    template
-    friend class MFGraph;
-    friend class AmrTask;
     typedef std::multimap< std::pair, double > BoundaryPointList;

 public:
@@ -189,6 +178,8 @@ public:
     static void fillDerivePlotVarList ();
     static void fillDeriveSmallPlotVarList ();

+    static void setComputeNewDtOnRegrid (int flag) { compute_new_dt_on_regrid = flag; }
+
     static void Initialize ();
     static void Finalize ();

     //! AmrLevel lev.
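For reference, the new static setter in the hunk above gives applications
programmatic control over whether a new dt is computed after a regrid. A
minimal, hypothetical usage sketch (not part of this patch):

.. code-block:: cpp

   // In application setup code, before entering the time loop:
   amrex::Amr::setComputeNewDtOnRegrid(1); // recompute dt whenever regrid() runs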
@@ -274,7 +265,7 @@ public: intersect_hix = IntersectHiX; intersect_loy = IntersectLoY; intersect_hiy = IntersectHiY; - }; + } /** * \brief More general version: @@ -296,32 +287,32 @@ public: intersect_hiy = IntersectHiY; intersect_loz = IntersectLoZ; intersect_hiz = IntersectHiZ; - }; + } BoundaryPointList& getIntersectLoX() noexcept { return intersect_lox; - }; + } BoundaryPointList& getIntersectHiX() noexcept { return intersect_hix; - }; + } BoundaryPointList& getIntersectLoY() noexcept { return intersect_loy; - }; + } BoundaryPointList& getIntersectHiY() noexcept { return intersect_hiy; - }; + } BoundaryPointList& getIntersectLoZ() noexcept { return intersect_loz; - }; + } BoundaryPointList& getIntersectHiZ() noexcept { return intersect_hiz; - }; + } #ifdef AMREX_PARTICLES //! Redistribute particles @@ -374,14 +365,14 @@ protected: Real stop_time); // pure virtural function in AmrCore - virtual void MakeNewLevelFromScratch (int lev, Real time, const BoxArray& ba, const DistributionMapping& dm) override - { amrex::Abort("How did we get her!"); } - virtual void MakeNewLevelFromCoarse (int lev, Real time, const BoxArray& ba, const DistributionMapping& dm) override - { amrex::Abort("How did we get her!"); } - virtual void RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMapping& dm) override - { amrex::Abort("How did we get her!"); } - virtual void ClearLevel (int lev) override - { amrex::Abort("How did we get her!"); } + virtual void MakeNewLevelFromScratch (int /*lev*/, Real /*time*/, const BoxArray& /*ba*/, const DistributionMapping& /*dm*/) override + { amrex::Abort("How did we get here!"); } + virtual void MakeNewLevelFromCoarse (int /*lev*/, Real /*time*/, const BoxArray& /*ba*/, const DistributionMapping& /*dm*/) override + { amrex::Abort("How did we get here!"); } + virtual void RemakeLevel (int /*lev*/, Real /*time*/, const BoxArray& /*ba*/, const DistributionMapping& /*dm*/) override + { amrex::Abort("How did we get here!"); } + virtual void ClearLevel (int /*lev*/) override + { amrex::Abort("How did we get here!"); } //! Whether to write a plotfile now bool writePlotNow () noexcept; @@ -475,18 +466,13 @@ protected: static Vector initial_ba; //! 
Array of BoxArrays read in to externally define grid hierarchy at each regrid static Vector regrid_ba; + static int compute_new_dt_on_regrid; #if defined(BL_USE_SENSEI_INSITU) static AmrInSituBridge *insitu_bridge; #endif public: -#ifdef USE_PERILLA - std::vector > graphArray; - std::vector amrGraphArray; - std::vector &get_graphArray(int level){return graphArray[level];} -#endif - BoundaryPointList intersect_lox; BoundaryPointList intersect_loy; BoundaryPointList intersect_loz; diff --git a/Src/Amr/AMReX_Amr.cpp b/Src/Amr/AMReX_Amr.cpp index bf6f20eda89..db8e7942511 100644 --- a/Src/Amr/AMReX_Amr.cpp +++ b/Src/Amr/AMReX_Amr.cpp @@ -12,10 +12,6 @@ #include #endif -#include -#include -#include - #include #include #include @@ -48,29 +44,6 @@ #include #endif -#ifdef USE_PERILLA -#include -#include -#ifdef USE_PERILLA -//#ifndef USE_PERILLA_ON_DEMAND - pthread_mutex_t teamFinishLock=PTHREAD_MUTEX_INITIALIZER; -//#endif -#ifdef PERILLA_USE_UPCXX -extern struct rMsgMap_t{ - std::map< int, std::map< int, std::list< Package* > > > map; - volatile int size=0; - pthread_mutex_t lock= PTHREAD_MUTEX_INITIALIZER; -}rMsgMap; -extern struct sMsgMap_t{ - std::map< int, std::map< int, std::list< Package* > > > map; - volatile int size=0; - pthread_mutex_t lock= PTHREAD_MUTEX_INITIALIZER; -}sMsgMap; - -#endif -#endif -#endif - #ifdef BL_USE_SENSEI_INSITU #include #endif @@ -88,6 +61,7 @@ bool Amr::first_plotfile; bool Amr::first_smallplotfile; Vector Amr::initial_ba; Vector Amr::regrid_ba; +int Amr::compute_new_dt_on_regrid; #ifdef BL_USE_SENSEI_INSITU AmrInSituBridge* Amr::insitu_bridge; #endif @@ -97,11 +71,7 @@ namespace const std::string CheckPointVersion("CheckPointVersion_1.0"); bool initialized = false; -} -//Tan Nov 24, 2017 : I removed this anonymous namespace so I could access the inner variables from other source files -//namespace -//{ // // These are all ParmParse'd in. Set defaults in Initialize()!!! // @@ -116,12 +86,11 @@ namespace int insitu_on_restart; int checkpoint_on_restart; bool checkpoint_files_output; - int compute_new_dt_on_regrid; bool precreateDirectories; bool prereadFAHeaders; VisMF::Header::Version plot_headerversion(VisMF::Header::Version_v1); VisMF::Header::Version checkpoint_headerversion(VisMF::Header::Version_v1); -//} +} @@ -1277,7 +1246,7 @@ Amr::initialInit (Real strt_time, void Amr::InitializeInit(Real strt_time, - Real stop_time, + Real /*stop_time*/, const BoxArray* lev0_grids, const Vector* pmap) { @@ -1402,7 +1371,10 @@ Amr::restart (const std::string& filename) // int linit = false; - readProbinFile(linit); + if (!probin_file.empty()) { + readProbinFile(linit); + } + // // Start calculation from given restart file. 
// @@ -1906,11 +1878,6 @@ Amr::timeStep (int level, int niter, Real stop_time) { -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - perilla::syncAllWorkerThreads(); - if(perilla::isMasterThread()) - { -#endif BL_PROFILE("Amr::timeStep()"); BL_COMM_PROFILE_NAMETAG("Amr::timeStep TOP"); @@ -1932,38 +1899,12 @@ Amr::timeStep (int level, else { int lev_top = std::min(finest_level, max_level-1); - -#ifdef USE_PERILLA - int cnt=0; - bool *metadataChanged=new bool[finest_level+1]; - for (int l=0; l <= finest_level; l++) - metadataChanged[l]=false; -#endif - for (int i(level); i <= lev_top; ++i) { const int old_finest = finest_level; if (okToRegrid(i)) { -#ifdef USE_PERILLA -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - //ask the communication thread to stop so that I can update the metadata - Perilla::updateMetadata_request=1; - while(!Perilla::updateMetadata_noticed){ - - } -#endif - //for (int k(i>0?i-1:0); k <= finest_level; ++k) { - for (int k=0; k <= finest_level; ++k) { - if(metadataChanged[k]==false){ - graphArray[k].clear(); - getLevel(k).finalizePerilla(time); - metadataChanged[k]=true; - cnt++; - } - } -#endif regrid(i,time); // @@ -2002,22 +1943,6 @@ Amr::timeStep (int level, lev_top = std::min(finest_level, max_level - 1); } } -#ifdef USE_PERILLA - if(cnt){ - if(ParallelDescriptor::NProcs()>1){ - Perilla::clearTagMap(); - Perilla::clearMyTagMap(); - Perilla::genTags=true; - Perilla::uTags=0; - Perilla::pTagCnt.clear(); - } - for(int i=0; i<= finest_level; i++){ - getLevel(i).initPerilla(cumtime); - } - Perilla::updateMetadata_done++; - } - delete [] metadataChanged; -#endif if (max_level == 0 && loadbalance_level0_int > 0 && loadbalance_with_workestimates) { @@ -2045,21 +1970,9 @@ Amr::timeStep (int level, << "ADVANCE with dt = " << dt_level[level] << "\n"; } -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - } - perilla::syncAllWorkerThreads(); -#endif - - BL_PROFILE_REGION_START("amr_level.advance"); Real dt_new = amr_level[level]->advance(time,dt_level[level],iteration,niter); BL_PROFILE_REGION_STOP("amr_level.advance"); -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - perilla::syncAllWorkerThreads(); - if(perilla::isMasterThread()) - { -#endif - dt_min[level] = iteration == 1 ? dt_new : std::min(dt_min[level],dt_new); level_steps[level]++; @@ -2089,17 +2002,8 @@ Amr::timeStep (int level, dt_level[k] = dt_level[k-1] / n_cycle[k]; } } -#ifdef USE_PERILLA -// getLevel(level).finalizePerilla(cumtime); -// getLevel(level).initPerilla(cumtime); -#endif } -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - } - perilla::syncAllWorkerThreads(); -#endif - // // Advance grids at higher level. 
// @@ -2122,23 +2026,10 @@ Amr::timeStep (int level, } } -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - perilla::syncAllWorkerThreads(); -#endif - amr_level[level]->post_timestep(iteration); -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - perilla::syncAllWorkerThreads(); - if(perilla::isMasterThread()) - { -#endif // Set this back to negative so we know whether we are in fact in this routine which_level_being_advanced = -1; -#if defined(USE_PERILLA_PTHREADS) || defined(USE_PERILLA_OMP) - } - perilla::syncAllWorkerThreads(); -#endif } Real @@ -2153,13 +2044,6 @@ Amr::coarseTimeStep (Real stop_time) { Real run_stop; Real run_strt; -#ifdef USE_PERILLA_PTHREADS - //mpi+pthreads (default) or upcxx+pthreads - std::vector flattenedGraphArray; - perilla::syncAllThreads(); - if(perilla::isMasterThread()) - { -#endif BL_PROFILE_REGION_START("Amr::coarseTimeStep()"); BL_PROFILE("Amr::coarseTimeStep()"); std::stringstream stepName; @@ -2193,189 +2077,7 @@ Amr::coarseTimeStep (Real stop_time) } BL_PROFILE_REGION_START(stepName.str()); - -#ifdef USE_PERILLA -#ifdef USE_PERILLA_PTHREADS - //mpi+pthreads (default) or upcxx+pthreads - } - perilla::syncAllThreads(); - - if(perilla::isMasterThread()){ - Perilla::updateMetadata_request = 0; - Perilla::updateMetadata_noticed = 0; - Perilla::updateMetadata_done = 0; - Perilla::numTeamsFinished = 0; - RegionGraph::graphCnt = 0; - if(levelSteps(0)==0){ - graphArray.resize(finest_level+1); - for(int i=0; i<= finest_level; i++) - getLevel(i).initPerilla(cumtime); - if(ParallelDescriptor::NProcs()>1){ - Perilla::syncProcesses(); - Perilla::communicateTags(); - Perilla::syncProcesses(); - } - } - } - perilla::syncAllThreads(); - - if(perilla::isCommunicationThread()) - { - Perilla::flattenGraphHierarchy(graphArray, flattenedGraphArray); - bool doublechecked=false; - while(true){ - if(!Perilla::updateMetadata_request){ - Perilla::serviceMultipleGraphCommDynamic(flattenedGraphArray,true,perilla::tid()); - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - Perilla::syncProcesses(); - flattenedGraphArray.clear(); - Perilla::syncProcesses(); - break; - } - }else{ - Perilla::syncProcesses(); - for(int g=0; ggraphTeardown(); - } -#ifdef PERILLA_USE_UPCXX - pthread_mutex_lock(&(rMsgMap.lock)); - for(int i=0; i0){ - rMsgMap.map[i][j].pop_front(); - rMsgMap.size--; - } - } - } - pthread_mutex_unlock(&(rMsgMap.lock)); - while(sMsgMap.size>0){ - } -#endif - Perilla::syncProcesses(); - Perilla::updateMetadata_noticed=1; - while(Perilla::updateMetadata_done==0){//!= (max_level+1)){ - - } - Perilla::updateMetadata_request=0; - Perilla::updateMetadata_noticed=0; - Perilla::updateMetadata_done=0; - if(ParallelDescriptor::NProcs()>1){ - Perilla::syncProcesses(); - Perilla::communicateTags(); - Perilla::syncProcesses(); - } - flattenedGraphArray.clear(); - Perilla::flattenGraphHierarchy(graphArray, flattenedGraphArray); - Perilla::serviceMultipleGraphCommDynamic(flattenedGraphArray,true,perilla::tid()); - - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - Perilla::syncProcesses(); - flattenedGraphArray.clear(); - Perilla::syncProcesses(); - break; - } - } - } - }else{ - timeStep(0,cumtime,1,1,stop_time); - if(perilla::isMasterWorkerThread()){ - pthread_mutex_lock(&teamFinishLock); - Perilla::numTeamsFinished++; - pthread_mutex_unlock(&teamFinishLock); - } - } - - perilla::syncAllThreads(); - if(perilla::isMasterThread()){ - if(!okToContinue() || (level_steps[0] == Perilla::max_step) || (stop_time -(dt_level[0] + 
cumTime())<=0)){ - for(int i=0; i<= finest_level; i++){ - getLevel(i).finalizePerilla(cumtime); - } - } - } -#else - Perilla::numTeamsFinished = 0; - RegionGraph::graphCnt = 0; - if(levelSteps(0)==0){ - graphArray.resize(finest_level+1); - for(int i=0; i<= finest_level; i++) - getLevel(i).initPerilla(cumtime); - if(ParallelDescriptor::NProcs()>1){ - Perilla::communicateTags(); - } - } - Perilla::syncProcesses(); - -#ifdef USE_PERILLA_OMP -// int nThreads= perilla::NUM_THREAD_TEAMS * perilla::NUM_THREADS_PER_TEAM; -// num_threads(nThreads) -#pragma omp parallel default(shared) - { - if(perilla::isCommunicationThread()) - { - std::vector flattenedGraphArray; - while(true){ - Perilla::flattenGraphHierarchy(graphArray, flattenedGraphArray); - Perilla::serviceMultipleGraphCommDynamic(flattenedGraphArray,true,perilla::tid()); - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - //perilla::syncWorkers(); - //if(perilla::wid()==0){ - //Perilla::syncProcesses(); - /*for(int g=0; ggraphTeardown(); - }*/ - //} - flattenedGraphArray.clear(); - //perilla::syncWorkers(); - if(perilla::wid()==0) Perilla::syncProcesses(); - break; - } - } - } - else{ - timeStep(0,cumtime,1,1,stop_time); - if(perilla::isMasterWorkerThread()){ - #pragma omp atomic - Perilla::numTeamsFinished++; - } - } - } -#elif defined(USE_PERILLA_ON_DEMAND) - //RTS on-demand - timeStep(0,cumtime,1,1,stop_time); -#else - cout<<"Undefined Async Mode"<* pmap) { + amrex::ignore_unused(pmap); + BL_PROFILE("Amr::defBaseLevel()"); // Just initialize this here for the heck of it which_level_being_advanced = -1; @@ -2847,9 +2545,12 @@ Amr::regrid (int lbase, // // Reclaim old-time grid space for all remain levels > lbase. + // But skip this if we're in the middle of a post-timestep regrid. // for(int lev = start; lev <= finest_level; ++lev) { - amr_level[lev]->removeOldData(); + if (!amr_level[lev]->postStepRegrid()) { + amr_level[lev]->removeOldData(); + } } // // Reclaim all remaining storage for levels > new_finest. @@ -3107,7 +2808,7 @@ Amr::printGridInfo (std::ostream& os, int numgrid = bs.size(); Long ncells = amr_level[lev]->countCells(); double ntot = Geom(lev).Domain().d_numPts(); - Real frac = 100.0_rt*(Real(ncells) / ntot); + Real frac = Real(100.0)*(Real(ncells) / ntot); const DistributionMapping& map = amr_level[lev]->get_new_data(0).DistributionMap(); os << " Level " diff --git a/Src/Amr/AMReX_AmrLevel.H b/Src/Amr/AMReX_AmrLevel.H index c2b6741068b..16f142b1daa 100644 --- a/Src/Amr/AMReX_AmrLevel.H +++ b/Src/Amr/AMReX_AmrLevel.H @@ -27,9 +27,6 @@ namespace amrex { class TagBox; class TagBoxArray; -template -class MFGraph; -class RGIter; /** * \brief Virtual base class for managing individual levels. @@ -42,9 +39,6 @@ class AmrLevel friend class Amr; friend class FillPatchIterator; friend class FillPatchIteratorHelper; - template friend class MFGraph; - friend class RGIter; - friend class AsyncFillPatchIterator; public: //! What time are we at? @@ -102,9 +96,9 @@ public: * Unlike writePlotFile, this is NOT a pure virtual function * so implementation by derived classes is optional. */ - virtual void writeSmallPlotFile (const std::string& dir, - std::ostream& os, - VisMF::How how = VisMF::NFiles) {}; + virtual void writeSmallPlotFile (const std::string& /*dir*/, + std::ostream& /*os*/, + VisMF::How /*how*/ = VisMF::NFiles) {} //! Write current state to checkpoint file. 
virtual void checkPoint (const std::string& dir, std::ostream& os, @@ -162,13 +156,6 @@ public: int iteration, int ncycle) = 0; -#ifdef USE_PERILLA - // For Perilla initialization - virtual void initPerilla (Real time)=0; - virtual void finalizePerilla (Real time)=0; -#endif - - /** * \brief Contains operations to be done after a timestep. This is a * pure virtual function and hence MUST be implemented by derived @@ -183,7 +170,7 @@ public: /** * \brief Operations to be done after restart. */ - virtual void post_restart () {}; + virtual void post_restart () {} /** * \brief Operations to be done after regridding * This is a pure virtual function and hence MUST be @@ -380,7 +367,7 @@ public: #ifdef AMREX_PARTICLES //! This function can be called from the parent - virtual void particle_redistribute (int lbase = 0, bool a_init = false) {;} + virtual void particle_redistribute (int /*lbase*/ = 0, bool /*a_init*/ = false) {;} #endif static void FillPatch (AmrLevel& amrlevel, @@ -498,111 +485,7 @@ class FillPatchIterator MultiFab& get_mf() noexcept { return m_fabs; } -#ifdef USE_PERILLA - FillPatchIterator (AmrLevel& amrlevel, - MultiFab& leveldata, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f); - - void initFillPatch(int boxGrow, int time, int index, int scomp, int ncomp, int iter); - - void InitializePush (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f); - - void InitializePull (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f); - - void FillPatchPush (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - unsigned char pushLevel, - bool singleT=false); - - void FillPatchPull (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - bool singleT=false); - - void finalizeGraphs() - { - //std::cout << "Completing RGs "; - - if(destGraph != NULL) - { - //std::cout << destGraph->graphID << " "; - destGraph->finalizeGraph(); - } - if(csrcGraph != NULL) - { - //std::cout << csrcGraph->graphID << " "; - csrcGraph->finalizeGraph(); - } - if(fsrcGraph != NULL) - { - //std::cout << fsrcGraph->graphID << " "; - fsrcGraph->finalizeGraph(); - } - if(m_rg_crse_patch != NULL) - { - //std::cout << m_rg_crse_patch->graphID << " "; - m_rg_crse_patch->finalizeGraph(); - } - - //std::cout <<" by tg " << tg << std::endl; - } - - void Reset() - { - int tg= perilla::wid(); - //std::cout << "Resetting RGs "; - if(destGraph != NULL) - { - //std::cout << destGraph->graphID << " "; - destGraph->Reset(); - } - if(csrcGraph != NULL) - { - //std::cout << csrcGraph->graphID << " "; - csrcGraph->Reset(); - } - if(fsrcGraph != NULL) - { - //std::cout << fsrcGraph->graphID << " "; - fsrcGraph->Reset(); - } - if(m_rg_crse_patch != NULL) - { - //std::cout << m_rg_crse_patch->graphID << " "; - m_rg_crse_patch->Reset(); - } - //std::cout <<" by tg " << tg << std::endl; - } - - RegionGraph* get_destGraph(){return destGraph;} - RegionGraph* get_crscGraph(){return csrcGraph;} - RegionGraph* get_fsrcGraph(){return fsrcGraph;} -#endif - - - private: +private: // // Disallowed. 
// @@ -621,80 +504,6 @@ class FillPatchIterator std::vector< std::pair > m_range; MultiFab m_fabs; int m_ncomp; - -public: -#ifdef USE_PERILLA - RegionGraph* destGraph; - RegionGraph* csrcGraph; - RegionGraph* fsrcGraph; - RegionGraph* m_rg_crse_patch; - std::list regionList; - std::list mfList; - std::list stateDataList; - - - - MultiFab* m_mf_crse_patch; - const FabArrayBase::FPinfo* m_fpc; - MultiFab* dmf; - MultiFab* dmff; - Vector smf; - Geometry* geom; - StateDataPhysBCFunct* physbcf; - bool isProperlyNested; - Vector smf_crse; - Vector stime_crse; - StateDataPhysBCFunct* physbcf_crse; - Geometry* geom_crse; - Vector smf_fine; - Vector stime_fine; - StateDataPhysBCFunct* physbcf_fine; - Geometry* geom_fine; - - Vector stime; - void FillFromLevel0Push (Real time, int index, int scomp, int dcomp, int ncomp, int f); - void FillFromLevel0PushOnly (Real time, int index, int scomp, int dcomp, int ncomp, int f, bool singleT); - void FillFromLevel0Pull (Real time, int index, int scomp, int dcomp, int ncomp, int f, bool singleT); - void FillFromTwoLevelsPushOnly (Real time, int index, int scomp, int dcomp, int ncomp, int f, unsigned char pushLevel, bool singleT); - void FillFromTwoLevelsPush (Real time, int index, int scomp, int dcomp, int ncomp, int f, unsigned char pushLevel, bool singleT); - void FillFromTwoLevelsPull (Real time, int index, int scomp, int dcomp, int ncomp, int f, bool singleT); - void FillPatchTwoLevelsPush (Amr& amr, MultiFab& mf, Real time, - Vector& cmf, Vector& ct, - Vector& fmf, Vector& ft, - RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f, - FillPatchIterator* fpIter, - MultiFab *dmf, - MultiFab *dmff, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector& bcs, unsigned char pushLevel, bool singleT); - - void FillPatchTwoLevelsPull (MultiFab& mf, Real time, - Vector& cmf, Vector& ct, - Vector& fmf, Vector& ft, - RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f, - FillPatchIterator* fpIter, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector& bcs, bool singleT); - - void FillPatchSingleLevelPush (Amr& amr, MultiFab& mf, Real time, - Vector& smf, Vector& stime, - RegionGraph* destGraph, RegionGraph* srcGraph, int f, - MultiFab *dmf, - int scomp, int dcomp, int ncomp, - const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT); - void FillPatchSingleLevelPull (MultiFab& mf, Real time, - Vector& smf, Vector& stime, - RegionGraph* destGraph, RegionGraph* srcGraph, int f, - int scomp, int dcomp, int ncomp, - const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT); - -#endif }; class FillPatchIteratorHelper @@ -754,329 +563,6 @@ private: std::map< int,Vector< Vector< Vector > > > m_fbid; // [grid][level][fillablesubbox][oldnew] }; - - //////////////////////////////////////Perilla///////////////////// -#ifdef USE_PERILLA -class AsyncFillPatchIterator - : - public MFIter -{ - public: - - friend class AmrLevel; - friend class RGIter; - - AsyncFillPatchIterator (AmrLevel& amrlevel, - MultiFab& leveldata, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter); - - void initFillPatch(int boxGrow, - Real time, - int index, - int scomp, - int ncomp, - int iter); - - static void 
initialSend(amrex::Vector afpi, - amrex::Vector upper_afpi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter); - - void PushOnly (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - unsigned char pushLevel, - bool singleT=false); - - void SendIntraLevel (RGIter& rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - bool singleT=false); - - void SendIntraLevel (RGIter* rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - bool singleT=false); - - void SendInterLevel (RGIter& rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - bool singleT=false); - - void SendInterLevel (RGIter* rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - bool singleT=false); - - void Receive (RGIter& rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - bool singleT=false); - - void Receive (RGIter* rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - bool singleT=false); - - void Receive (RGIter& rgi, - MultiFab& dest, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - bool singleT=false); - - void Receive (RGIter* rgi, - MultiFab& dest, - int boxGrow, - const Real time, - int state_indx, - int scomp, - int ncomp, - int f, - bool singleT=false); - - void PullOnly (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - bool singleT=false); - - void PullOnly (MultiFab& dest, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - bool singleT=false); - - void FillPatchTwoLevelsPush (Amr& amr, MultiFab& mf, Real time, - Vector& cmf, Vector& ct, - Vector& fmf, Vector& ft, - RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f, - AsyncFillPatchIterator* fpIter, - MultiFab *dmf, - MultiFab *dmff, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector& bcs, unsigned char pushLevel, bool singleT); - - void FillPatchTwoLevelsPull (MultiFab& mf, Real time, - Vector& cmf, Vector& ct, - Vector& fmf, Vector& ft, - RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f, - AsyncFillPatchIterator* fpIter, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector& bcs, bool singleT); - - void FillPatchSingleLevelPush (Amr& amr, MultiFab& mf, Real time, - Vector& smf, const Vector& stime, - RegionGraph* destGraph, RegionGraph* srcGraph, int f, - MultiFab *dmf, - int scomp, int dcomp, int ncomp, - const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT); - void FillPatchSingleLevelPull (MultiFab& mf, Real time, - Vector& smf, const Vector& stime, - RegionGraph* destGraph, RegionGraph* srcGraph, int f, - int scomp, int dcomp, int ncomp, - const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT); - - void FillFromTwoLevelsPush (Real time, - int index, - int scomp, - int dcomp, - int ncomp, - int f, - unsigned char pushLevel, - bool singleT); - void FillFromTwoLevelsPull (Real time, - int index, - int scomp, - int dcomp, - int ncomp, - int f, - bool singleT); - - void FillFromTwoLevelsPull 
(MultiFab& dest, - Real time, - int index, - int scomp, - int dcomp, - int ncomp, - int f, - bool singleT); - - ~AsyncFillPatchIterator (); - - FArrayBox& operator() () { return m_fabs[MFIter::index()]; } - - Box UngrownBox () const { return MFIter::validbox(); } - - MultiFab& get_mf() { return m_fabs; } - - // protected: - // - // Disallowed. - // - AsyncFillPatchIterator (); - AsyncFillPatchIterator (const AsyncFillPatchIterator& rhs); - AsyncFillPatchIterator& operator= (const AsyncFillPatchIterator& rhs); - - // - // The data. - // - AmrLevel& m_amrlevel; - MultiFab& m_leveldata; - std::vector< std::pair > m_range; - MultiFab m_fabs; - int m_ncomp; - -public: - bool isProperlyNested; - - amrex::Vector smf; - amrex::Vector stime; - StateDataPhysBCFunct* physbcf; - Geometry* geom; - - - amrex::Vector smf_crse; - amrex::Vector stime_crse; - StateDataPhysBCFunct* physbcf_crse; - Geometry* geom_crse; - - amrex::Vector smf_fine; - amrex::Vector stime_fine; - StateDataPhysBCFunct* physbcf_fine; - Geometry* geom_fine; - - - RegionGraph* destGraph; - RegionGraph* csrcGraph; - RegionGraph* fsrcGraph; - - MultiFab* m_mf_crse_patch; - RegionGraph* m_rg_crse_patch; - const FabArrayBase::FPinfo* m_fpc; - - //PArray raii; - MultiFab* dmf; - MultiFab* dmff; - std::list regionList; - std::list mfList; - std::list stateDataList; - - - - void completeRegionGraphs() - { - //std::cout << "Completing RGs "; - - if(destGraph != NULL) - { - //std::cout << destGraph->graphID << " "; - destGraph->finalizeRegionGraph(); - } - if(csrcGraph != NULL) - { - //std::cout << csrcGraph->graphID << " "; - csrcGraph->finalizeRegionGraph(); - } - if(fsrcGraph != NULL) - { - //std::cout << fsrcGraph->graphID << " "; - fsrcGraph->finalizeRegionGraph(); - } - if(m_rg_crse_patch != NULL) - { - //std::cout << m_rg_crse_patch->graphID << " "; - m_rg_crse_patch->finalizeRegionGraph(); - } - //std::cout <<" by tg " << tg << std::endl; - } - - void Reset() - { - //std::cout << "Resetting RGs "; - if(destGraph != NULL) - { - //std::cout << destGraph->graphID << " "; - destGraph->Reset(); - } - if(csrcGraph != NULL) - { - //std::cout << csrcGraph->graphID << " "; - csrcGraph->Reset(); - } - if(fsrcGraph != NULL) - { - //std::cout << fsrcGraph->graphID << " "; - fsrcGraph->Reset(); - } - if(m_rg_crse_patch != NULL) - { - //std::cout << m_rg_crse_patch->graphID << " "; - m_rg_crse_patch->Reset(); - } - //std::cout <<" by tg " << tg << std::endl; - } - - // Variables for optimization calls of two level push/pulll - -}; -#endif - - } #endif /*_AmrLevel_H_*/ diff --git a/Src/Amr/AMReX_AmrLevel.cpp b/Src/Amr/AMReX_AmrLevel.cpp index 20c43e7b27a..40c30dedd07 100644 --- a/Src/Amr/AMReX_AmrLevel.cpp +++ b/Src/Amr/AMReX_AmrLevel.cpp @@ -1,7 +1,6 @@ #include -#include #include #include @@ -21,14 +20,7 @@ #include #endif -#ifdef USE_PERILLA -#include -#endif - namespace amrex { -#ifdef USE_PERILLA -using namespace perilla; -#endif #ifdef AMREX_USE_EB int AmrLevel::m_eb_basic_grow_cells = 5; @@ -54,12 +46,12 @@ AmrLevel::postCoarseTimeStep (Real time) } void -AmrLevel::set_preferred_boundary_values (MultiFab& S, - int state_index, - int scomp, - int dcomp, - int ncomp, - Real time) const +AmrLevel::set_preferred_boundary_values (MultiFab& /*S*/, + int /*state_index*/, + int /*scomp*/, + int /*dcomp*/, + int /*ncomp*/, + Real /*time*/) const {} DeriveList& @@ -69,8 +61,8 @@ AmrLevel::get_derive_lst () noexcept } void -AmrLevel::manual_tags_placement (TagBoxArray& tags, - const Vector& bf_lev) +AmrLevel::manual_tags_placement (TagBoxArray& 
/*tags*/, + const Vector& /*bf_lev*/) {} AmrLevel::AmrLevel () noexcept @@ -113,14 +105,17 @@ AmrLevel::AmrLevel (Amr& papa, state.resize(desc_lst.size()); #ifdef AMREX_USE_EB - m_factory = makeEBFabFactory(geom, ba, dm, - {m_eb_basic_grow_cells, - m_eb_volume_grow_cells, - m_eb_full_grow_cells}, - m_eb_support_level); -#else - m_factory.reset(new FArrayBoxFactory()); + if (EB2::TopIndexSpaceIfPresent()) { + m_factory = makeEBFabFactory(geom, ba, dm, + {m_eb_basic_grow_cells, + m_eb_volume_grow_cells, + m_eb_full_grow_cells}, + m_eb_support_level); + } else #endif + { + m_factory.reset(new FArrayBoxFactory()); + } // Note that this creates a distribution map associated with grids. for (int i = 0; i < state.size(); i++) @@ -184,7 +179,9 @@ AmrLevel::writePlotFile (const std::string& dir, int n_data_items = plot_var_map.size() + derive_names.size(); #ifdef AMREX_USE_EB - n_data_items += 1; + if (EB2::TopIndexSpaceIfPresent()) { + n_data_items += 1; + } #endif // get the time from the first State_Type @@ -219,7 +216,9 @@ AmrLevel::writePlotFile (const std::string& dir, } #ifdef AMREX_USE_EB - os << "vfrac\n"; + if (EB2::TopIndexSpaceIfPresent()) { + os << "vfrac\n"; + } #endif os << AMREX_SPACEDIM << '\n'; @@ -304,10 +303,12 @@ AmrLevel::writePlotFile (const std::string& dir, } #ifdef AMREX_USE_EB - // volfrac threshhold for amrvis - if (level == parent->finestLevel()) { - for (int lev = 0; lev <= parent->finestLevel(); ++lev) { - os << "1.0e-6\n"; + if (EB2::TopIndexSpaceIfPresent()) { + // volfrac threshhold for amrvis + if (level == parent->finestLevel()) { + for (int lev = 0; lev <= parent->finestLevel(); ++lev) { + os << "1.0e-6\n"; + } } } #endif @@ -342,9 +343,11 @@ AmrLevel::writePlotFile (const std::string& dir, } #ifdef AMREX_USE_EB - plotMF.setVal(0.0, cnt, 1, nGrow); - auto factory = static_cast(m_factory.get()); - MultiFab::Copy(plotMF,factory->getVolFrac(),0,cnt,1,nGrow); + if (EB2::TopIndexSpaceIfPresent()) { + plotMF.setVal(0.0, cnt, 1, nGrow); + auto factory = static_cast(m_factory.get()); + MultiFab::Copy(plotMF,factory->getVolFrac(),0,cnt,1,nGrow); + } #endif // @@ -363,15 +366,15 @@ AmrLevel::writePlotFile (const std::string& dir, void -AmrLevel::writePlotFilePre (const std::string& dir, - std::ostream& os) +AmrLevel::writePlotFilePre (const std::string& /*dir*/, + std::ostream& /*os*/) { } void -AmrLevel::writePlotFilePost (const std::string& dir, - std::ostream& os) +AmrLevel::writePlotFilePost (const std::string& /*dir*/, + std::ostream& /*os*/) { } @@ -425,12 +428,17 @@ AmrLevel::restart (Amr& papa, parent->SetDistributionMap(level, dmap); #ifdef AMREX_USE_EB - m_factory = makeEBFabFactory(geom, grids, dmap, - {m_eb_basic_grow_cells, m_eb_volume_grow_cells, m_eb_full_grow_cells}, - m_eb_support_level); -#else - m_factory.reset(new FArrayBoxFactory()); + if (EB2::TopIndexSpaceIfPresent()) { + m_factory = makeEBFabFactory(geom, grids, dmap, + {m_eb_basic_grow_cells, + m_eb_volume_grow_cells, + m_eb_full_grow_cells}, + m_eb_support_level); + } else #endif + { + m_factory.reset(new FArrayBoxFactory()); + } state.resize(ndesc); for (int i = 0; i < ndesc; ++i) @@ -449,7 +457,7 @@ AmrLevel::restart (Amr& papa, } void -AmrLevel::set_state_in_checkpoint (Vector& state_in_checkpoint) +AmrLevel::set_state_in_checkpoint (Vector& /*state_in_checkpoint*/) { amrex::Error("Class derived AmrLevel has to handle this!"); } @@ -538,16 +546,16 @@ AmrLevel::checkPoint (const std::string& dir, void -AmrLevel::checkPointPre (const std::string& dir, - std::ostream& os) +AmrLevel::checkPointPre 
(const std::string& /*dir*/, + std::ostream& /*os*/) { BL_PROFILE("AmrLevel::checkPointPre()"); } void -AmrLevel::checkPointPost (const std::string& dir, - std::ostream& os) +AmrLevel::checkPointPost (const std::string& /*dir*/, + std::ostream& /*os*/) { BL_PROFILE("AmrLevel::checkPointPost()"); } @@ -660,7 +668,9 @@ FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, m_amrlevel(amrlevel), m_leveldata(leveldata), m_ncomp(0) -{} +{ + MFIter::depth = 0; +} FillPatchIteratorHelper::FillPatchIteratorHelper (AmrLevel& amrlevel, MultiFab& leveldata, @@ -701,6 +711,7 @@ FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, BL_ASSERT(AmrLevel::desc_lst[idx].inRange(scomp,ncomp)); BL_ASSERT(0 <= idx && idx < AmrLevel::desc_lst.size()); + MFIter::depth = 0; Initialize(boxGrow,time,idx,scomp,ncomp); #ifdef BL_USE_TEAM @@ -1012,7 +1023,9 @@ FillPatchIterator::Initialize (int boxGrow, } else { #ifdef AMREX_USE_EB - amrex::Abort("Grids must be properly nested for EB"); + if (EB2::TopIndexSpaceIfPresent()) { + amrex::Abort("Grids must be properly nested for EB"); + } #endif static bool first = true; @@ -1491,27 +1504,7 @@ FillPatchIteratorHelper::fill (FArrayBox& fab, FillPatchIteratorHelper::~FillPatchIteratorHelper () {} -FillPatchIterator::~FillPatchIterator () { -#ifdef USE_PERILLA - while(regionList.size()){ - RegionGraph* tmp= regionList.front(); - delete tmp; - regionList.pop_front(); - } - - while(mfList.size()){ - MultiFab *tmp= mfList.front(); - delete tmp; - mfList.pop_front(); - } - - while(stateDataList.size()){ - StateDataPhysBCFunct *tmp= stateDataList.front(); - delete tmp; - stateDataList.pop_front(); - } -#endif - } +FillPatchIterator::~FillPatchIterator () {} void AmrLevel::FillCoarsePatch (MultiFab& mf, @@ -1566,12 +1559,16 @@ AmrLevel::FillCoarsePatch (MultiFab& mf, crseBA.set(j,mapper->CoarseBox(bx, crse_ratio)); } + MultiFab crseMF; #ifdef AMREX_USE_EB - auto cfactory = makeEBFabFactory(cgeom, crseBA, mf_DM, {0,0,0}, EBSupport::basic); - MultiFab crseMF(crseBA,mf_DM,NComp,0,MFInfo(),*cfactory); -#else - MultiFab crseMF(crseBA,mf_DM,NComp,0); + if (EB2::TopIndexSpaceIfPresent()) { + auto cfactory = makeEBFabFactory(cgeom, crseBA, mf_DM, {0,0,0}, EBSupport::basic); + crseMF.define(crseBA,mf_DM,NComp,0,MFInfo(),*cfactory); + } else #endif + { + crseMF.define(crseBA,mf_DM,NComp,0); + } if ( level == 1 || amrex::ProperlyNested(crse_ratio, parent->blockingFactor(level), @@ -1830,7 +1827,8 @@ AmrLevel::derive (const std::string& name, Real time, MultiFab& mf, int dcomp) const Box& bx = mfi.growntilebox(); FArrayBox& derfab = mf[mfi]; FArrayBox const& datafab = srcMF[mfi]; - rec->derFuncFab()(bx, derfab, dcomp, ncomp, datafab, geom, time, rec->getBC(), level); + const int dncomp = rec->numDerive(); + rec->derFuncFab()(bx, derfab, dcomp, dncomp, datafab, geom, time, rec->getBC(), level); } } else @@ -2266,1104 +2264,5 @@ AmrLevel::CreateLevelDirectory (const std::string &dir) levelDirectoryCreated = true; } - -#ifdef USE_PERILLA - void FillPatchIterator::FillPatchSingleLevelPush (Amr& amr, MultiFab& mf, Real time, - Vector& smf, Vector& stime, - RegionGraph* destGraph, RegionGraph* srcGraph, int f, - MultiFab *dmf, - int scomp, int dcomp, int ncomp, - const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT) - { - - BL_PROFILE("FillPatchSingleLevel"); - - BL_ASSERT(scomp+ncomp <= smf[0]->nComp()); - BL_ASSERT(dcomp+ncomp <= mf.nComp()); - BL_ASSERT(smf.size() == stime.size()); - BL_ASSERT(smf.size() != 0); - - int tg = WorkerThread::perilla_wid(); - int nt = 
WorkerThread::perilla_wtid(); - - if (smf.size() == 1) - { - //mf.copy(smf[0], scomp, dcomp, ncomp, 0, mf.nGrow(), geom.periodicity()); - Perilla::multifabCopyPushAsync( destGraph, srcGraph, &mf, smf[0], f, dcomp, scomp, ncomp, mf.nGrow(), 0, singleT); - } - else if (smf.size() == 2) - { - BL_ASSERT(smf[0]->boxArray() == smf[1]->boxArray()); - //PArray raii(PArrayManage); - //MultiFab * dmf; - int destcomp; - bool sameba; - if (mf.boxArray() == smf[0]->boxArray()) - { - //dmf = &mf; - destcomp = dcomp; - sameba = true; - - int fis = smf[0]->IndexArray()[f]; - int fid = mf.IndexArray()[f]; - const Box& bx = mf[fid].box(); - mf[fid].linInterp - ((*smf[0])[fis], - scomp, - (*smf[1])[fis], - scomp, - stime[0], - stime[1], - time, - bx, - destcomp, - ncomp); - Perilla::fillBoundaryPush(destGraph, &mf, f); - } - else - { - - //dmf = raii.push_back(new MultiFab(smf[0].boxArray(), ncomp, 0)); - //MultiFab dmf(smf[0].boxArray(), ncomp, 0); - destcomp = 0; - sameba = false; - - assert(smf[0]); - assert(smf[0]->IndexArray().size()>f); - assert(dmf); - assert(dmf->IndexArray().size()>f); - int fis = smf[0]->IndexArray()[f]; - int fid = dmf->IndexArray()[f]; - - for(int t=0; tfabTiles[f]->numTiles; t++) - if( singleT || t % (perilla::NUM_THREADS_PER_TEAM-1) == nt) - { - const Box& bx = *(srcGraph->fabTiles[f]->tileBx[t]); - - //const Box& bx = (*dmf)[fid].box(); - (*dmf)[fid].linInterp - ((*smf[0])[fis], - scomp, - (*smf[1])[fis], - scomp, - stime[0], - stime[1], - time, - bx, - destcomp, - ncomp); - } - if(!singleT) - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - int src_ngrow = 0; - int dst_ngrow = mf.nGrow(); - Perilla::multifabCopyPushAsync( destGraph, srcGraph, &mf, dmf, f, dcomp, 0, ncomp, mf.nGrow(), 0, singleT); - } - } - else - { - amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet"); - } - } - - - void FillPatchIterator::FillPatchTwoLevelsPush (Amr& amr, MultiFab& mf, Real time, - Vector& cmf, Vector& ct, - Vector& fmf, Vector& ft, - RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f, - FillPatchIterator* fpIter, - MultiFab *dmf, - MultiFab *dmff, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector& bcs, unsigned char pushLevel, bool singleT) - { - BL_PROFILE("FillPatchTwoLevels"); - - int ngrow = mf.nGrow(); - - if(f>=0){//fill only this fab - if(pushLevel & 0x01 ) - { - if (ngrow > 0 || mf.getBDKey() != fmf[0]->getBDKey()) - { - - if (!fpIter->m_fpc->ba_crse_patch.empty()) - { - FillPatchSingleLevelPush(amr, *(fpIter->m_mf_crse_patch), time, cmf, ct, fpIter->m_rg_crse_patch, csrcGraph, f, dmf, scomp, 0, ncomp, cgeom, cbc, singleT); - } - } - } - if((pushLevel & 0x02) && (pushLevel != 0x03)) - { - FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, f, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT); - } - }else{ //fill the whole multifab - if(pushLevel & 0x01 && pushLevel & 0x02) - { - int tg = perilla::wid(); - for(int fi=0; fi < fmf[0]->IndexArray().size(); fi++) - { - if(WorkerThread::isMyRegion(tg,fi)) - { - FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, fi, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT); - } - } - } - if(pushLevel & 0x04) - { - int tg = perilla::wid(); - for(int fi=0; fi < fmf[0]->IndexArray().size(); fi++) - { - if(WorkerThread::isMyRegion(tg,fi)) - { - 
FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, fi, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT); - } - } - } - } - } - - void FillPatchIterator::FillPatchSingleLevelPull (MultiFab& mf, Real time, - Vector& smf, Vector& stime, - RegionGraph* destGraph, RegionGraph* srcGraph, int f, - int scomp, int dcomp, int ncomp, - const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT) - { - - BL_PROFILE("FillPatchSingleLevel"); - - BL_ASSERT(scomp+ncomp <= smf[0]->nComp()); - BL_ASSERT(dcomp+ncomp <= mf.nComp()); - BL_ASSERT(smf.size() == stime.size()); - BL_ASSERT(smf.size() != 0); - - if (smf.size() == 1) - { - //mf.copy(smf[0], scomp, dcomp, ncomp, 0, mf.nGrow(), geom.periodicity()); - Perilla::multifabCopyPull( destGraph, srcGraph, &mf, smf[0], f, dcomp, scomp, ncomp, mf.nGrow(), 0, singleT); - } - else if (smf.size() == 2) - { - BL_ASSERT(smf[0]->boxArray() == smf[1]->boxArray()); - //Vector raii(PArrayManage); - MultiFab * dmf; - int destcomp; - bool sameba; - if (mf.boxArray() == smf[0]->boxArray()) { - dmf = &mf; - destcomp = dcomp; - sameba = true; - } else { - //dmf = srcGraph->assocMF; - destcomp = 0; - sameba = false; - } - if (sameba) - { - // Note that when sameba is true mf's BoxArray is nonoverlapping. - // So FillBoundary is safe. - //mf.FillBoundary(dcomp,ncomp,geom.periodicity()); - Perilla::fillBoundaryPull(destGraph, dmf, f, singleT); - } - else - { - int src_ngrow = 0; - int dst_ngrow = mf.nGrow(); - MultiFab* dummyMF; - //mf.copy(*dmf, 0, dcomp, ncomp, src_ngrow, dst_ngrow, geom.periodicity()); - Perilla::multifabCopyPull( destGraph, srcGraph, &mf, dummyMF, f, dcomp, 0, ncomp, mf.nGrow(), 0, singleT); - } - } - else { - amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet"); - } -#if 0 - physbcf.doit_fab(mf, f, dcomp, ncomp, time); -#endif - } - - void FillPatchIterator::FillPatchTwoLevelsPull (MultiFab& mf, Real time, - Vector& cmf, Vector& ct, - Vector& fmf, Vector& ft, - RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f, - FillPatchIterator* fpIter, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector& bcs, bool singleT) - { - BL_PROFILE("FillPatchTwoLevels"); - - int ngrow = mf.nGrow(); - - int tg = WorkerThread::perilla_wid(); - int nt = WorkerThread::perilla_wtid(); - - if (ngrow > 0 || mf.getBDKey() != fmf[0]->getBDKey()) - { - - if ( ! 
fpIter->m_fpc->ba_crse_patch.empty()) - { - - int idummy1=0, idummy2=0; - bool cc = fpIter->m_fpc->ba_crse_patch.ixType().cellCentered(); - { - int gi = mf.IndexArray()[f]; - for(int i=0; itask[f]->depTaskIDs.size();i++) - { - int li = destGraph->task[f]->depTaskIDs[i]; - int mfi = fpIter->m_mf_crse_patch[0].IndexArray()[li]; - FillPatchSingleLevelPull(*(fpIter->m_mf_crse_patch), time, cmf, ct, fpIter->m_rg_crse_patch, csrcGraph, li, scomp, 0, ncomp, cgeom, cbc, singleT); - } - if(!singleT) - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - int nt = WorkerThread::perilla_wtid(); - Box fdomain = fgeom.Domain(); - for(int i=0; itask[f]->depTaskIDs.size();i++) - { - int li = destGraph->task[f]->depTaskIDs[i]; - int mfi = fpIter->m_mf_crse_patch[0].IndexArray()[li]; - if(singleT) - { - const Box& dbx = fpIter->m_fpc->dst_boxes[li]; - //Array bcr(ncomp); - Vector bcr(ncomp); - amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr); - - mapper->interp(fpIter->m_mf_crse_patch[0][mfi], - 0, - mf[gi], - dcomp, - ncomp, - dbx, - ratio, - cgeom, - fgeom, - bcr, - idummy1, idummy2, RunOn::Cpu); - } - else - { - if(!cc) - { - if(WorkerThread::perilla_isMasterWorkerThread()) - { - const Box& dbx = fpIter->m_fpc->dst_boxes[li]; - //Box fdomain = fgeom.Domain(); - - Vector bcr(ncomp); - amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr); - - mapper->interp(fpIter->m_mf_crse_patch[0][mfi], - 0, - mf[gi], - dcomp, - ncomp, - dbx, - ratio, - cgeom, - fgeom, - bcr, - idummy1, idummy2, RunOn::Cpu); - - } - } - else - { - if(i % (perilla::NUM_THREADS_PER_TEAM-1) == nt-1) - { - - const Box& dbx = fpIter->m_fpc->dst_boxes[li]; - - Vector bcr(ncomp); - amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr); - - mapper->interp(fpIter->m_mf_crse_patch[0][mfi], - 0, - mf[gi], - dcomp, - ncomp, - dbx, - ratio, - cgeom, - fgeom, - bcr, - idummy1, idummy2, RunOn::Cpu); - - } - } - } - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - if(!singleT) - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } - } - - - FillPatchSingleLevelPull(mf, time, fmf, ft, destGraph, fsrcGraph, f, scomp, dcomp, ncomp, fgeom, fbc, singleT); - } - -void FillPatchIterator::FillFromTwoLevelsPush (Real time, - int index, - int scomp, - int dcomp, - int ncomp, - int f, - unsigned char pushLevel, - bool singleT) -{ - int ilev_fine = m_amrlevel.level; - int ilev_crse = ilev_fine-1; - - BL_ASSERT(ilev_crse >= 0); - - AmrLevel& fine_level = m_amrlevel; - AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse); - - Geometry* tgeom_fine = &fine_level.geom; - Geometry* tgeom_crse = &crse_level.geom; - - Vector tsmf_crse; - Vector tsmf_fine; - Vector tstime_crse; - Vector tstime_fine; - StateData& statedata_crse = crse_level.state[index]; - statedata_crse.getData(tsmf_crse,tstime_crse,time); - StateDataPhysBCFunct* tphysbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse); - - StateData& statedata_fine = fine_level.state[index]; - statedata_fine.getData(tsmf_fine,tstime_fine,time); - StateDataPhysBCFunct* tphysbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine); - - const StateDescriptor& desc = AmrLevel::desc_lst[index]; - - FillPatchTwoLevelsPush(*(m_amrlevel.parent), m_fabs, time, - tsmf_crse, tstime_crse, - tsmf_fine, tstime_fine, - destGraph, csrcGraph, fsrcGraph, f, - this, - dmf, - dmff, - scomp, dcomp, ncomp, - *tgeom_crse, *tgeom_fine, - *tphysbcf_crse, *tphysbcf_fine, - crse_level.fineRatio(), - desc.interp(scomp), desc.getBCs(), pushLevel, singleT); 
-
-void
-FillPatchIterator::FillFromTwoLevelsPushOnly (Real time, int index, int scomp, int dcomp, int ncomp, int f, unsigned char pushLevel, bool singleT)
-{
-    int ilev_fine = m_amrlevel.level;
-    int ilev_crse = ilev_fine-1;
-
-    BL_ASSERT(ilev_crse >= 0);
-
-    AmrLevel& fine_level = m_amrlevel;
-    AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-
-    //if(physbcf_fine == NULL && physbcf_crse == NULL)
-    //{
-
-    Geometry* tgeom_fine = &fine_level.geom;
-    Geometry* tgeom_crse = &crse_level.geom;
-
-    Vector<MultiFab*> tsmf_crse;
-    Vector<MultiFab*> tsmf_fine;
-    Vector<Real> tstime_crse;
-    Vector<Real> tstime_fine;
-    StateData& statedata_crse = crse_level.state[index];
-    statedata_crse.getData(tsmf_crse,tstime_crse,time);
-    StateDataPhysBCFunct* tphysbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse);
-
-    StateData& statedata_fine = fine_level.state[index];
-    statedata_fine.getData(tsmf_fine,tstime_fine,time);
-    StateDataPhysBCFunct* tphysbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine);
-    //}
-
-    const StateDescriptor& desc = AmrLevel::desc_lst[index];
-
-    FillPatchTwoLevelsPush(*(m_amrlevel.parent), m_fabs, time,
-                           tsmf_crse, tstime_crse, tsmf_fine, tstime_fine,
-                           destGraph, csrcGraph, fsrcGraph, f, this, dmf, dmff,
-                           scomp, dcomp, ncomp, *tgeom_crse, *tgeom_fine,
-                           *tphysbcf_crse, *tphysbcf_fine, crse_level.fineRatio(),
-                           desc.interp(scomp), desc.getBCs(), pushLevel, singleT);
-}
-
-void FillPatchIterator::FillFromTwoLevelsPull (Real time, int index, int scomp, int dcomp, int ncomp, int f, bool singleT)
-{
-    int ilev_fine = m_amrlevel.level;
-    int ilev_crse = ilev_fine-1;
-
-    BL_ASSERT(ilev_crse >= 0);
-
-    AmrLevel& fine_level = m_amrlevel;
-    AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-
-    Geometry* tgeom_fine = &fine_level.geom;
-    Geometry* tgeom_crse = &crse_level.geom;
-
-    Vector<MultiFab*> tsmf_crse;
-    Vector<MultiFab*> tsmf_fine;
-    Vector<Real> tstime_crse;
-    Vector<Real> tstime_fine;
-    StateData& statedata_crse = crse_level.state[index];
-    statedata_crse.getData(tsmf_crse,tstime_crse,time);
-    StateDataPhysBCFunct* tphysbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse);
-
-    StateData& statedata_fine = fine_level.state[index];
-    statedata_fine.getData(tsmf_fine,tstime_fine,time);
-    StateDataPhysBCFunct* tphysbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine);
-
-    const StateDescriptor& desc = AmrLevel::desc_lst[index];
-
-    FillPatchTwoLevelsPull(m_fabs, time,
-                           tsmf_crse, tstime_crse, tsmf_fine, tstime_fine,
-                           destGraph, csrcGraph, fsrcGraph, f, this,
-                           scomp, dcomp, ncomp, *tgeom_crse, *tgeom_fine,
-                           *tphysbcf_crse, *tphysbcf_fine, crse_level.fineRatio(),
-                           desc.interp(scomp), desc.getBCs(), singleT);
-}
-
-void
-FillPatchIterator::FillPatchPush (int boxGrow, Real time, int index, int scomp, int ncomp,
-                                  int f, unsigned char pushLevel, bool singleT)
-{
-    BL_PROFILE("FillPatchIterator::InitializePush");
-
-    BL_ASSERT(scomp >= 0);
-    BL_ASSERT(ncomp >= 1);
-    BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-    //const IndexType& boxType = m_leveldata.boxArray().ixType();
-    const int level = m_amrlevel.level;
-
-    for (int i = 0, DComp = 0; i < m_range.size(); i++)
-    {
-        if(i>0)
-            amrex::Abort("**** Error in FillPatchIterator::Initialize: non contigeous components not implemented");
-
-        const int SComp = m_range[i].first;
-        const int NComp = m_range[i].second;
-
-        if (level == 0)
-        {
-            FillPatchSingleLevelPush (*(m_amrlevel.parent), m_fabs, time, smf, stime, destGraph, fsrcGraph, f, dmf, SComp, DComp,
-                                      NComp, *geom, *physbcf, singleT);
-        }
-        else
-        {
-            if (level == 1 || isProperlyNested)
-            {
-                FillFromTwoLevelsPushOnly(time, index, SComp, DComp, NComp, f, pushLevel, singleT);
-            } else {
-                amrex::Abort("**** Error in FillPatchIterator::Initialize: !ProperlyNested not implemented");
-            }
-        }
-        DComp += NComp;
-    }
-}
-
-void
-FillPatchIterator::FillPatchPull (int boxGrow, Real time, int index, int scomp, int ncomp, int f, bool singleT)
-{
-    BL_PROFILE("FillPatchIterator::InitializePull");
-
-    BL_ASSERT(scomp >= 0);
-    BL_ASSERT(ncomp >= 1);
-    BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-    //const IndexType& boxType = m_leveldata.boxArray().ixType();
-    const int level = m_amrlevel.level;
-
-    for (int i = 0, DComp = 0; i < m_range.size(); i++)
-    {
-        if(i>0)
-            amrex::Abort("**** Error in FillPatchIterator::Initialize: non contigeous components not implemented");
-
-        const int SComp = m_range[i].first;
-        const int NComp = m_range[i].second;
-
-        if (level == 0)
-        {
-            FillPatchSingleLevelPull (m_fabs, time, smf, stime, destGraph, fsrcGraph, f, SComp, DComp, NComp, *geom, *physbcf, singleT);
-        }
-        else
-        {
-            if (level == 1 || isProperlyNested)
-            {
-                FillFromTwoLevelsPull(time, index, SComp, DComp, NComp, f, singleT);
-            } else {
-                amrex::Abort("**** Error in FillPatchIterator::Initialize: !ProperlyNested not implemented");
-            }
-        }
-        //if(WorkerThread::isTeamMasterThread(tid))
-        {
-            const MultiFab& mf_fillpatched = m_fabs;
-
-            if(singleT)
-            {
-                for(int t=0; t<destGraph->fabTiles_gtbx[f]->numTiles; t++)
-                {
-                    const Box& bx = *(destGraph->fabTiles_gtbx[f]->tileBx[t]);
-                    MultiFab::Copy(m_leveldata, mf_fillpatched, f, 0, DComp, ncomp, bx);
-                }
-            }
-            else
-            {
-                perilla::syncAllWorkerThreads();
-                int nt = WorkerThread::perilla_wtid();
-                for(int t=0; t<destGraph->fabTiles_gtbx[f]->numTiles; t++)
-                    if(t % (perilla::NUM_THREADS_PER_TEAM-1) == nt-1)
-                    {
-                        const Box& bx = *(destGraph->fabTiles_gtbx[f]->tileBx[t]);
-                        MultiFab::Copy(m_leveldata, mf_fillpatched, f, 0, DComp, ncomp, bx);
-                    }
-                perilla::syncAllWorkerThreads();
-            }
-        }
-        DComp += NComp;
-    }
-    //
-    // Call hack to touch up fillPatched data.
-    //
-    /*m_amrlevel.set_preferred_boundary_values(m_fabs, index, scomp, 0, ncomp, time);*/
-
-}
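FillPatchPush and FillPatchPull are the two halves of one split-phase fill: the push posts the sends, and the pull completes the receives and only then copies the filled FAB tile by tile into m_leveldata. A rough standalone analogy using std::promise/std::future in place of the region-graph channel (illustration only, not the Perilla API):

    #include <future>
    #include <vector>
    #include <numeric>
    #include <cstdio>

    int main()
    {
        std::promise<std::vector<double>> channel;   // stands in for the region graph
        std::future<std::vector<double>> incoming = channel.get_future();

        // "Push" phase: produce and send the patch data.
        std::vector<double> patch(8);
        std::iota(patch.begin(), patch.end(), 0.0);
        channel.set_value(patch);

        // "Pull" phase: wait for the data, then copy it into the destination.
        std::vector<double> leveldata = incoming.get();
        std::printf("pulled %zu values, first = %g\n", leveldata.size(), leveldata[0]);
    }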
-
-void
-FillPatchIterator::initFillPatch(int boxGrow, int time, int index, int scomp, int ncomp, int iter)
-{
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-    BL_ASSERT(scomp >= 0);
-    BL_ASSERT(ncomp >= 1);
-    BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-    const StateDescriptor& desc = AmrLevel::desc_lst[index];
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-    {
-        m_ncomp = ncomp;
-        m_range = desc.sameInterps(scomp,ncomp);
-
-        m_fabs.define(m_leveldata.boxArray(),m_leveldata.DistributionMap(), m_ncomp,boxGrow);
-
-        BL_ASSERT(m_leveldata.DistributionMap() == m_fabs.DistributionMap());
-    }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-
-    const IndexType& boxType = m_leveldata.boxArray().ixType();
-    const int level = m_amrlevel.level;
-
-    for (int i = 0, DComp = 0; i < m_range.size(); i++)
-    {
-        const int SComp = m_range[i].first;
-        const int NComp = m_range[i].second;
-        int dcomp = DComp;
-        if (level == 0)
-        {
-#if 1
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-            {
-                BL_ASSERT(m_amrlevel.level == 0);
-                StateData& statedata = m_amrlevel.state[index];
-                statedata.getData(smf,stime,time);
-                geom = &m_amrlevel.geom;
-                physbcf = new StateDataPhysBCFunct(statedata,scomp,*geom);
-                stateDataList.push_back(physbcf);
-                BL_ASSERT(scomp+ncomp <= smf[0]->nComp());
-                BL_ASSERT(dcomp+ncomp <= m_fabs.nComp());
-                BL_ASSERT(smf.size() == stime.size());
-                BL_ASSERT(smf.size() != 0);
-            }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-            if (smf.size() == 1)
-            {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                {
-                    destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                    fsrcGraph = new RegionGraph(smf[0]->IndexArray().size());
-                    regionList.push_back(destGraph);
-                    regionList.push_back(fsrcGraph);
-                }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-#if 1
-                Perilla::multifabExtractCopyAssoc( destGraph, fsrcGraph, m_fabs, *(smf[0]), (const int) ncomp, m_fabs.nGrow(), 0, geom->periodicity());
-#endif
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                {
-                    m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                    m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-            }
-            else if (smf.size() == 2)
-            {
-                BL_ASSERT(smf[0]->boxArray() == smf[1]->boxArray());
-
-                if (m_fabs.boxArray() == smf[0]->boxArray())
-                {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                    {
-                        dmf = &m_fabs;
-                        destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                        regionList.push_back(destGraph);
-                    }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-                    Perilla::multifabBuildFabCon(destGraph, m_fabs, geom->periodicity());
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                    {
-                        m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                    }
-                }
-                else
-                {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                    {
-                        dmf = new MultiFab(smf[0]->boxArray(), smf[0]->DistributionMap(), ncomp, 0);
-                        //dmf->initVal(); // for Perilla NUMA
-                        destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                        fsrcGraph = new RegionGraph(dmf->IndexArray().size());
-                        fsrcGraph->buildTileArray(*dmf);
-                        regionList.push_back(destGraph);
-                        regionList.push_back(fsrcGraph);
-                        mfList.push_back(dmf);
-                    }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-
-                    Perilla::multifabExtractCopyAssoc(destGraph, fsrcGraph, m_fabs, *dmf, ncomp, m_fabs.nGrow(), 0, geom->periodicity());
-
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                    {
-                        m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                        m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                    }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-                }
-            }
-            else
-            {
-                //BoxLib::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-            }
-#endif
-            //-------------------------------------------------- FillFromLevel0 initialization completed
-        }
-        else
-        {
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-            {
-                isProperlyNested = amrex::ProperlyNested(m_amrlevel.crse_ratio,
-                                                         m_amrlevel.parent->blockingFactor(m_amrlevel.level),
-                                                         boxGrow, boxType, desc.interp(SComp));
-            }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-            if (level == 1 || isProperlyNested)
-            {
-                int ilev_fine = m_amrlevel.level;
-                int ilev_crse = ilev_fine-1;
-                BL_ASSERT(ilev_crse >= 0);
-                AmrLevel& fine_level = m_amrlevel;
-                AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                {
-                    geom_fine = &fine_level.geom;
-                    geom_crse = &crse_level.geom;
-                }
-                StateData& statedata_crse = crse_level.state[index];
-                StateData& statedata_fine = fine_level.state[index];
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                {
-                    statedata_crse.getData(smf_crse,stime_crse,time);
-                    statedata_fine.getData(smf_fine,stime_fine,time);
-                    physbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse);
-                    physbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine);
-                    stateDataList.push_back(physbcf_crse);
-                    stateDataList.push_back(physbcf_fine);
-                }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-                const StateDescriptor& desc = AmrLevel::desc_lst[index];
-                int ngrow = m_fabs.nGrow();
-                if (ngrow > 0 || m_fabs.getBDKey() != smf_fine[0]->getBDKey())
-                {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                    {
-                        InterpolaterBoxCoarsener coarsener = desc.interp(scomp)->BoxCoarsener(crse_level.fineRatio());
-                        Box fdomain = geom_fine->Domain();
-                        fdomain.convert(m_fabs.boxArray().ixType());
-                        Box fdomain_g(fdomain);
-                        for (int i = 0; i < BL_SPACEDIM; ++i) {
-                            if (geom_fine->isPeriodic(i)) {
-                                fdomain_g.grow(i,ngrow);
-                            }
-                        }
-                        Box c_dom = amrex::coarsen(geom_fine->Domain(), m_amrlevel.crse_ratio);
-                        m_fpc = &FabArrayBase::TheFPinfo(*(smf_fine[0]), m_fabs, fdomain_g, IntVect(ngrow), coarsener, c_dom, NULL);
-                    }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-                    if (!m_fpc->ba_crse_patch.empty())
-                    {
-#ifdef USE_PERILLA_PTHREADS
-                        if(perilla::isMasterThread())
-#endif
-                        {
-                            m_mf_crse_patch = new MultiFab(m_fpc->ba_crse_patch, m_fpc->dm_crse_patch, ncomp, 0);
-                            mfList.push_back(m_mf_crse_patch);
-                            //m_mf_crse_patch->initVal(); // for Perilla NUMA
-                            BL_ASSERT(scomp+ncomp <= smf_crse[0]->nComp());
-                            BL_ASSERT(dcomp+ncomp <= m_mf_crse_patch->nComp());
-                            BL_ASSERT(smf_crse.size() == stime_crse.size());
-                            BL_ASSERT(smf_crse.size() != 0);
-                        }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-                        if (iter == 1)
-                        {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                            {
-                                m_rg_crse_patch = new RegionGraph(m_mf_crse_patch->IndexArray().size());
-                                m_rg_crse_patch->isDepGraph = true;
-                                csrcGraph = new RegionGraph(smf_crse[0]->IndexArray().size());
-                                regionList.push_back(m_rg_crse_patch);
-                                regionList.push_back(csrcGraph);
-                            }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-#if 1
-                            Perilla::multifabExtractCopyAssoc( m_rg_crse_patch, csrcGraph, *m_mf_crse_patch, *(smf_crse[0]), ncomp, m_mf_crse_patch->nGrow(), 0, geom_crse->periodicity());
-#endif
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                            {
-                                m_amrlevel.parent->graphArray[level].push_back(m_rg_crse_patch);
-                                m_amrlevel.parent->graphArray[level].push_back(csrcGraph);
-                            }
-                        }
-                        else if (iter > 1)
-                        {
-#if 1
-                            //BL_ASSERT(smf_crse[0].boxArray() == smf_crse[1].boxArray());
-                            //PArray<MultiFab> raii(PArrayManage);
-                            //MultiFab * dmf;
-
-                            if (m_mf_crse_patch->boxArray() == smf_crse[0]->boxArray())
-                            {
-                                //dmf = m_mf_crse_patch;
-                                m_rg_crse_patch = new RegionGraph(m_mf_crse_patch->IndexArray().size());
-
-                                //std::cout<< " level " << level << " rg_crs_ptch ID " << m_rg_crse_patch->graphID << std::endl;
-
-                                Perilla::multifabBuildFabCon(m_rg_crse_patch, *m_mf_crse_patch, geom->periodicity());
-                                m_amrlevel.parent->graphArray[level].push_back(m_rg_crse_patch);
-                                regionList.push_back(m_rg_crse_patch);
-                            }
-                            else
-                            {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                                {
-                                    //dmf = raii.push_back(new MultiFab(smf_crse[0].boxArray(), ncomp, 0));
-                                    dmf = new MultiFab(smf_crse[0]->boxArray(), smf_crse[0]->DistributionMap(), ncomp, 0);
-                                    //dmf->initVal(); // for Perilla NUMA
-                                    m_rg_crse_patch = new RegionGraph(m_mf_crse_patch->IndexArray().size());
-                                    m_rg_crse_patch->isDepGraph = true;
-                                    csrcGraph = new RegionGraph(dmf->IndexArray().size());
-                                    csrcGraph->buildTileArray(*dmf);
-                                    regionList.push_back(m_rg_crse_patch);
-                                    regionList.push_back(csrcGraph);
-                                    mfList.push_back(dmf);
-                                }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-#if 1
-                                Perilla::multifabExtractCopyAssoc( m_rg_crse_patch, csrcGraph, *m_mf_crse_patch, *dmf, ncomp, m_mf_crse_patch->nGrow(), 0, geom_crse->periodicity());
-#endif
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                                {
-                                    m_amrlevel.parent->graphArray[level].push_back(m_rg_crse_patch);
-                                    m_amrlevel.parent->graphArray[level].push_back(csrcGraph);
-                                }
-                            }
-#endif
-                        }
-                        else
-                        {
-                            // BoxLib::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-                        }
-                    }
-                }
-#if 1
-                BL_ASSERT(scomp+ncomp <= smf_fine[0]->nComp());
-                BL_ASSERT(dcomp+ncomp <= m_fabs.nComp());
-                BL_ASSERT(smf_fine.size() == stime_fine.size());
-                BL_ASSERT(smf_fine.size() != 0);
-
-                if(true) // it will always be the case because same level comm and time will be available
-                {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                    {
-                        dmff = new MultiFab(smf_fine[0]->boxArray(), smf_fine[0]->DistributionMap(), ncomp, 0);
-                        destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                        //fsrcGraph = new RegionGraph(smf_fine[0]->IndexArray().size());
-                        fsrcGraph = new RegionGraph(dmff->IndexArray().size());
-                        regionList.push_back(destGraph);
-                        regionList.push_back(fsrcGraph);
-                        mfList.push_back(dmff);
-
-                        if(m_rg_crse_patch != 0)
-                        {
-                            destGraph->srcLinkGraph = m_rg_crse_patch;
-                            //for(int lfi=0; lfi < destGraph->numTasks; lfi++ )
-                            {
-                                for (MFIter mfi(*(m_mf_crse_patch),false); mfi.isValid(); ++mfi)
-                                {
-                                    int li = mfi.LocalIndex();
-                                    int gi = m_fpc->dst_idxs[li];
-                                    //if(gi == m_mf_crse_patch->IndexArray()[li])
-                                    {
-                                        int lfi = m_fabs.localindex(gi);
-                                        destGraph->task[lfi]->depTasksCompleted = false;
-                                        destGraph->task[lfi]->depTaskIDs.push_back(li);
-                                    }
-                                }
-                            }
-                        }
-                    }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-
-                    //if(level == 2)
-                    //std::cout<< "Sending In " << std::endl;
-#if 1
-                    Perilla::multifabExtractCopyAssoc( destGraph, fsrcGraph, m_fabs, *dmff, ncomp, m_fabs.nGrow(), 0, geom_fine->periodicity());
-#endif
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                    {
-                        m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                        m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                    }
-                }
-                else if (smf_fine.size() == 2)
-                {
-                    if (m_fabs.boxArray() == smf_fine[0]->boxArray())
-                    {
-                        //dmf = &m_fabs;
-                        destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                        Perilla::multifabBuildFabCon(destGraph, m_fabs, geom->periodicity());
-                        m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                        regionList.push_back(destGraph);
-                    }
-                    else
-                    {
-#ifdef USE_PERILLA_PTHREADS
-//    if(perilla::isMasterThread())
-#endif
-                        {
-                            //dmf = raii.push_back(new MultiFab(smf_fine[0].boxArray(), ncomp, 0));
-                            dmff = new MultiFab(smf_fine[0]->boxArray(), smf_fine[0]->DistributionMap(), ncomp, 0);
-                            //dmff->initVal(); // for Perilla NUMA
-                            destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                            fsrcGraph = new RegionGraph(dmff->IndexArray().size());
-                            fsrcGraph->buildTileArray(*dmff);
-                            regionList.push_back(destGraph);
-                            regionList.push_back(fsrcGraph);
-                            mfList.push_back(dmff);
-                        }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-#if 1
-                        Perilla::multifabExtractCopyAssoc( destGraph, fsrcGraph, m_fabs, *dmff, ncomp, m_fabs.nGrow(), 0, geom_fine->periodicity());
-#endif
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-                        {
-                            m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                            m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                        }
-                    }
-                }
-                else
-                {
-                    amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-                }
-#endif
-                //-------------------- FillFromTwoLevels initialization completed
-            } // if(level==1 OR ProperlyNested)
-            else
-            {
-                amrex::Abort("initFillPatch: level is not properly nested");
-            }
-        }
-
-        DComp += NComp;
-    }
-#if 1
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-//    if(perilla::isMasterThread())
-#endif
-    {
-        destGraph->buildTileArray(m_fabs);
-        destGraph->buildTileArray_gtbx(m_leveldata,boxGrow);
-    }
-#ifdef USE_PERILLA_PTHREADS
-//    perilla::syncAllThreads();
-#endif
-#endif
-}
-
-FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, MultiFab& leveldata, int boxGrow,
-                                      Real time, int index, int scomp, int ncomp, int f)
-    :
-    MFIter(leveldata),
-    m_amrlevel(amrlevel),
-    m_leveldata(leveldata),
-    m_ncomp(ncomp),
-    physbcf(0),
-    physbcf_crse(0),
-    physbcf_fine(0),
-    destGraph(0),
-    fsrcGraph(0),
-    csrcGraph(0),
-    m_rg_crse_patch(NULL),
-    //raii(PArrayManage)
-    dmf(NULL),
-    dmff(NULL)
-{
-#if 1
-    BL_ASSERT(scomp >= 0);
-    BL_ASSERT(ncomp >= 1);
-    BL_ASSERT(AmrLevel::desc_lst[index].inRange(scomp,ncomp));
-    BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-    //InitializePush(boxGrow,time,index,scomp,ncomp,f,tid);
-
-#ifdef BL_USE_TEAM
-    ParallelDescriptor::MyTeam().MemoryBarrier();
-#endif
-
-#endif
-}
-
-//end USE_PERILLA
-#endif
-}
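In these routines and in the deleted file that follows, pushLevel is a bitmask: 0x01 selects the coarse-patch traffic of a two-level fill, 0x02 the same-level fine traffic (skipped when the mask is exactly 0x03), 0x04 the whole-multifab push that seeds the next finer level, and 0xFF everything. A small sketch of that convention; the flag names are invented for illustration, since the deleted code uses the raw constants:

    #include <cstdio>

    // Hypothetical names for the raw bits used by the deleted code.
    enum : unsigned char {
        PUSH_CRSE       = 0x01,  // coarse patch of a two-level fill
        PUSH_FINE       = 0x02,  // same-level fine data
        PUSH_FINER_INIT = 0x04,  // whole-multifab push for the finer level's init
        PUSH_ALL        = 0xFF
    };

    void push(unsigned char pushLevel)
    {
        if (pushLevel & PUSH_CRSE)       std::printf("push coarse patch\n");
        if ((pushLevel & PUSH_FINE) && pushLevel != (PUSH_CRSE | PUSH_FINE))
            std::printf("push same-level fine data\n");
        if (pushLevel & PUSH_FINER_INIT) std::printf("push for finer-level init\n");
    }

    int main()
    {
        push(PUSH_FINE);   // the 0x02 mask SendIntraLevel passes below
        push(PUSH_ALL);    // the 0xFF mask initialSend passes on level 0
    }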
diff --git a/Src/Amr/AMReX_AsyncFillPatch.cpp b/Src/Amr/AMReX_AsyncFillPatch.cpp
deleted file mode 100644
index 0f99ab43fa2..00000000000
--- a/Src/Amr/AMReX_AsyncFillPatch.cpp
+++ /dev/null
@@ -1,1486 +0,0 @@
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#ifdef USE_PERILLA
-#include
-
-namespace amrex {
-    using namespace perilla;
-
-    AsyncFillPatchIterator::AsyncFillPatchIterator (AmrLevel& amrlevel, MultiFab& leveldata,
-                                                    int boxGrow, Real time, int index,
-                                                    int scomp, int ncomp, int iter)
-        :
-        MFIter(leveldata),
-        m_amrlevel(amrlevel),
-        m_leveldata(leveldata),
-        m_ncomp(ncomp),
-        physbcf(0),
-        physbcf_crse(0),
-        physbcf_fine(0),
-        destGraph(0),
-        fsrcGraph(0),
-        csrcGraph(0),
-        m_rg_crse_patch(0),
-        dmf(0),
-        dmff(0)
-    {
-        BL_ASSERT(scomp >= 0);
-        BL_ASSERT(ncomp >= 1);
-        BL_ASSERT(AmrLevel::desc_lst[index].inRange(scomp,ncomp));
-        BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-        initFillPatch(boxGrow, time, index, scomp, ncomp, iter);
-
-#ifdef BL_USE_TEAM
-        ParallelDescriptor::MyTeam().MemoryBarrier();
-#endif
-    }
-
-    AsyncFillPatchIterator::~AsyncFillPatchIterator () {
-#ifdef USE_PERILLA
-        while(regionList.size()){
-            RegionGraph* tmp= regionList.front();
-            delete tmp;
-            regionList.pop_front();
-        }
-
-        while(mfList.size()){
-            MultiFab *tmp= mfList.front();
-            delete tmp;
-            mfList.pop_front();
-        }
-
-        while(stateDataList.size()){
-            StateDataPhysBCFunct *tmp= stateDataList.front();
-            delete tmp;
-            stateDataList.pop_front();
-        }
-#endif
-    }
-
-    void AsyncFillPatchIterator::FillFromTwoLevelsPush (Real time, int index, int scomp,
-                                                        int dcomp, int ncomp, int f,
-                                                        unsigned char pushLevel, bool singleT)
-    {
-        int ilev_fine = m_amrlevel.level;
-        int ilev_crse = ilev_fine-1;
-
-        BL_ASSERT(ilev_crse >= 0);
-
-        AmrLevel& fine_level = m_amrlevel;
-        AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-
-        Geometry* tgeom_fine = &fine_level.geom;
-        Geometry* tgeom_crse = &crse_level.geom;
-
-        Vector<MultiFab*> tsmf_crse;
-        Vector<MultiFab*> tsmf_fine;
-        Vector<Real> tstime_crse;
-        Vector<Real> tstime_fine;
-        StateData& statedata_crse = crse_level.state[index];
-        statedata_crse.getData(tsmf_crse,tstime_crse,time);
-        StateDataPhysBCFunct* tphysbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse);
-
-        StateData& statedata_fine = fine_level.state[index];
-        statedata_fine.getData(tsmf_fine,tstime_fine,time);
-        StateDataPhysBCFunct* tphysbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine);
-
-        const StateDescriptor& desc = AmrLevel::desc_lst[index];
-
-        FillPatchTwoLevelsPush(*(m_amrlevel.parent), m_fabs, time,
-                               tsmf_crse, tstime_crse, tsmf_fine, tstime_fine,
-                               destGraph, csrcGraph, fsrcGraph, f, this, dmf, dmff,
-                               scomp, dcomp, ncomp, *tgeom_crse, *tgeom_fine,
-                               *tphysbcf_crse, *tphysbcf_fine, crse_level.fineRatio(),
-                               desc.interp(scomp), desc.getBCs(), pushLevel, singleT);
-    }
-
-    void AsyncFillPatchIterator::FillFromTwoLevelsPull (Real time, int index, int scomp,
-                                                        int dcomp, int ncomp, int f, bool singleT)
-    {
-        int ilev_fine = m_amrlevel.level;
-        int ilev_crse = ilev_fine-1;
-
-        BL_ASSERT(ilev_crse >= 0);
-
-        AmrLevel& fine_level = m_amrlevel;
-        AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-
-        Geometry* tgeom_fine = &fine_level.geom;
-        Geometry* tgeom_crse = &crse_level.geom;
-
-        Vector<MultiFab*> tsmf_crse;
-        Vector<MultiFab*> tsmf_fine;
-        Vector<Real> tstime_crse;
-        Vector<Real> tstime_fine;
-        StateData& statedata_crse = crse_level.state[index];
-        statedata_crse.getData(tsmf_crse,tstime_crse,time);
-        StateDataPhysBCFunct* tphysbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse);
-
-        StateData& statedata_fine = fine_level.state[index];
-        statedata_fine.getData(tsmf_fine,tstime_fine,time);
-        StateDataPhysBCFunct* tphysbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine);
-
-        const StateDescriptor& desc = AmrLevel::desc_lst[index];
-
-        FillPatchTwoLevelsPull(m_fabs, time,
-                               tsmf_crse, tstime_crse, tsmf_fine, tstime_fine,
-                               destGraph, csrcGraph, fsrcGraph, f, this,
-                               scomp, dcomp, ncomp, *tgeom_crse, *tgeom_fine,
-                               *tphysbcf_crse, *tphysbcf_fine, crse_level.fineRatio(),
-                               desc.interp(scomp), desc.getBCs(), singleT);
-    }
-
-    void AsyncFillPatchIterator::FillFromTwoLevelsPull (MultiFab& dest, Real time, int index,
-                                                        int scomp, int dcomp, int ncomp,
-                                                        int f, bool singleT)
-    {
-        int ilev_fine = m_amrlevel.level;
-        int ilev_crse = ilev_fine-1;
-
-        BL_ASSERT(ilev_crse >= 0);
-
-        AmrLevel& fine_level = m_amrlevel;
-        AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-
-        Geometry* tgeom_fine = &fine_level.geom;
-        Geometry* tgeom_crse = &crse_level.geom;
-
-        Vector<MultiFab*> tsmf_crse;
-        Vector<MultiFab*> tsmf_fine;
-        Vector<Real> tstime_crse;
-        Vector<Real> tstime_fine;
-        StateData& statedata_crse = crse_level.state[index];
-        statedata_crse.getData(tsmf_crse,tstime_crse,time);
-        StateDataPhysBCFunct* tphysbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse);
-
-        StateData& statedata_fine = fine_level.state[index];
-        statedata_fine.getData(tsmf_fine,tstime_fine,time);
-        StateDataPhysBCFunct* tphysbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine);
-
-        const StateDescriptor& desc = AmrLevel::desc_lst[index];
-
-        FillPatchTwoLevelsPull(dest, time,
-                               tsmf_crse, tstime_crse, tsmf_fine, tstime_fine,
-                               destGraph, csrcGraph, fsrcGraph, f, this,
-                               scomp, dcomp, ncomp, *tgeom_crse, *tgeom_fine,
-                               *tphysbcf_crse, *tphysbcf_fine, crse_level.fineRatio(),
-                               desc.interp(scomp), desc.getBCs(), singleT);
-    }
-
-    void AsyncFillPatchIterator::initFillPatch(int boxGrow, Real time, int index, int scomp, int ncomp, int iter)
-    {
-        BL_ASSERT(scomp >= 0);
-        BL_ASSERT(ncomp >= 1);
-        BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-        int myProc = amrex::ParallelDescriptor::MyProc();
-
-        const StateDescriptor& desc = AmrLevel::desc_lst[index];
-
-        m_ncomp = ncomp;
-        m_range = desc.sameInterps(scomp,ncomp);
-
-        m_fabs.define(m_leveldata.boxArray(),m_leveldata.DistributionMap(),
-                      m_ncomp,boxGrow);
-
-        BL_ASSERT(m_leveldata.DistributionMap() == m_fabs.DistributionMap());
-
-        const IndexType& boxType = m_leveldata.boxArray().ixType();
-        const int level = m_amrlevel.level;
-
-        for (int i = 0, DComp = 0; i < m_range.size(); i++)
-        {
-            const int SComp = m_range[i].first;
-            const int NComp = m_range[i].second;
-            int dcomp = DComp;
-
-            if (level == 0)
-            {
-                BL_ASSERT(m_amrlevel.level == 0);
-                StateData& statedata = m_amrlevel.state[index];
-                statedata.getData(smf,stime,time);
-                geom = &m_amrlevel.geom;
-                physbcf = new StateDataPhysBCFunct(statedata,scomp,*geom);
-                stateDataList.push_back(physbcf);
-                BL_ASSERT(scomp+ncomp <= smf[0]->nComp());
-                BL_ASSERT(dcomp+ncomp <= m_fabs.nComp());
-                BL_ASSERT(smf.size() == stime.size());
-                BL_ASSERT(smf.size() != 0);
-
-                if (smf.size() == 1)
-                {
-                    dmf = new MultiFab(smf[0]->boxArray(), smf[0]->DistributionMap(), ncomp, 0);
-                    destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                    fsrcGraph = new RegionGraph(smf[0]->IndexArray().size());
-                    regionList.push_back(destGraph);
-                    regionList.push_back(fsrcGraph);
-                    Perilla::multifabExtractCopyAssoc( destGraph, fsrcGraph, m_fabs, *smf[0], ncomp, m_fabs.nGrow(), 0, geom->periodicity());
-                    m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                    m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                }
-                else if (smf.size() == 2)
-                {
-                    BL_ASSERT(smf[0]->boxArray() == smf[1]->boxArray());
-                    if (m_fabs.boxArray() == smf[0]->boxArray())
-                    {
-                        dmf = &m_fabs;
-                        destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                        regionList.push_back(destGraph);
-                        Perilla::multifabBuildFabCon(destGraph, m_fabs, geom->periodicity());
-                        m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                    }
-                    else
-                    {
-                        dmf = new MultiFab(smf[0]->boxArray(), smf[0]->DistributionMap(), ncomp, 0);
-                        destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                        fsrcGraph = new RegionGraph(dmf->IndexArray().size());
-                        fsrcGraph->buildTileArray(*dmf);
-                        regionList.push_back(destGraph);
-                        regionList.push_back(fsrcGraph);
-                        mfList.push_back(dmf);
-
-                        Perilla::multifabExtractCopyAssoc( destGraph, fsrcGraph, m_fabs, *dmf, ncomp, m_fabs.nGrow(), 0, geom->periodicity());
-                        m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                        m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                    }
-                }
-                else
-                {
-                    amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-                }
-                //-------------------------------------------------- FillFromLevel0 initialization completed
-            }
-            else
-            {
-                isProperlyNested = amrex::ProperlyNested(m_amrlevel.crse_ratio,
-                                                         m_amrlevel.parent->blockingFactor(m_amrlevel.level),
-                                                         boxGrow, boxType, desc.interp(SComp));
-                if (level == 1 || isProperlyNested)
-                {
-                    int ilev_fine = m_amrlevel.level;
-                    int ilev_crse = ilev_fine-1;
-                    BL_ASSERT(ilev_crse >= 0);
-                    AmrLevel& fine_level = m_amrlevel;
-                    AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-                    geom_fine = &fine_level.geom;
-                    geom_crse = &crse_level.geom;
-                    StateData& statedata_crse = crse_level.state[index];
-                    statedata_crse.getData(smf_crse,stime_crse,time);
-                    physbcf_crse = new StateDataPhysBCFunct(statedata_crse,scomp,*geom_crse);
-                    StateData& statedata_fine = fine_level.state[index];
-                    statedata_fine.getData(smf_fine,stime_fine,time);
-                    physbcf_fine = new StateDataPhysBCFunct(statedata_fine,scomp,*geom_fine);
-
-                    stateDataList.push_back(physbcf_crse);
-                    stateDataList.push_back(physbcf_fine);
-
-                    const StateDescriptor& desc = AmrLevel::desc_lst[index];
-                    int ngrow = m_fabs.nGrow();
-                    if (ngrow > 0 || m_fabs.getBDKey() != smf_fine[0]->getBDKey())
-                    {
-                        InterpolaterBoxCoarsener coarsener = desc.interp(scomp)->BoxCoarsener(crse_level.fineRatio());
-                        Box fdomain = geom_fine->Domain();
-                        fdomain.convert(m_fabs.boxArray().ixType());
-                        Box fdomain_g(fdomain);
-                        for (int i = 0; i < BL_SPACEDIM; ++i) {
-                            if (geom_fine->isPeriodic(i)) {
-                                fdomain_g.grow(i,ngrow);
-                            }
-                        }
-                        // dummytostopcraycompilererror
-                        std::cout << "";
-
-                        Box c_dom = amrex::coarsen(geom_fine->Domain(), m_amrlevel.crse_ratio);
-
-                        m_fpc = &FabArrayBase::TheFPinfo(*smf_fine[0], m_fabs, fdomain_g, IntVect(ngrow), coarsener, c_dom, NULL);
-
-                        if (!m_fpc->ba_crse_patch.empty())
-                        {
-                            m_mf_crse_patch = new MultiFab(m_fpc->ba_crse_patch, m_fpc->dm_crse_patch, ncomp, 0);
-                            mfList.push_back(m_mf_crse_patch);
-                            BL_ASSERT(scomp+ncomp <= smf_crse[0]->nComp());
-                            BL_ASSERT(dcomp+ncomp <= m_mf_crse_patch->nComp());
-                            BL_ASSERT(smf_crse.size() == stime_crse.size());
-                            BL_ASSERT(smf_crse.size() != 0);
-
-                            //if (smf_crse.size() == 1)
-                            if (iter == 1)
-                            {
-                                m_rg_crse_patch = new RegionGraph(m_mf_crse_patch->IndexArray().size());
-
-                                m_rg_crse_patch->isDepGraph = true;
-
-                                csrcGraph = new RegionGraph(smf_crse[0]->IndexArray().size());
-
-                                regionList.push_back(m_rg_crse_patch);
-                                regionList.push_back(csrcGraph);
-
-                                Perilla::multifabExtractCopyAssoc( m_rg_crse_patch, csrcGraph, *m_mf_crse_patch, *smf_crse[0], ncomp, m_mf_crse_patch->nGrow(), 0, geom_crse->periodicity());
-#if 0
-                                MultiFab temp_4_tile(m_fpc->ba_dst_boxes, m_fpc->dm_crse_patch, ncomp, 0);
-                                m_rg_crse_patch->buildTileArray(temp_4_tile);
-#endif
-                                ///m_rg_crse_patch->buildTileArray(*m_mf_crse_patch);
-                                m_amrlevel.parent->graphArray[level].push_back(m_rg_crse_patch);
-                                m_amrlevel.parent->graphArray[level].push_back(csrcGraph);
-                            }
-                            else if (iter > 1)
-                            {
-                                if (m_mf_crse_patch->boxArray() == smf_crse[0]->boxArray())
-                                {
-                                    //dmf = m_mf_crse_patch;
-                                    m_rg_crse_patch = new RegionGraph(m_mf_crse_patch->IndexArray().size());
-
-                                    //std::cout<< " level " << level << " rg_crs_ptch ID " << m_rg_crse_patch->graphID << std::endl;
-
-                                    Perilla::multifabBuildFabCon(m_rg_crse_patch, *m_mf_crse_patch, geom_crse->periodicity());
-                                    m_amrlevel.parent->graphArray[level].push_back(m_rg_crse_patch);
-                                    regionList.push_back(m_rg_crse_patch);
-                                }
-                                else
-                                {
-                                    dmf = new MultiFab(smf_crse[0]->boxArray(), smf_crse[0]->DistributionMap(), ncomp, 0);
-                                    m_rg_crse_patch = new RegionGraph(m_mf_crse_patch->IndexArray().size());
-                                    m_rg_crse_patch->isDepGraph = true;
-                                    csrcGraph = new RegionGraph(dmf->IndexArray().size());
-                                    csrcGraph->buildTileArray(*dmf);
-
-#if 0
-                                    MultiFab temp_4_tile(m_fpc->ba_dst_boxes, m_fpc->dm_crse_patch, ncomp, 0);
-                                    m_rg_crse_patch->buildTileArray(temp_4_tile);
-#endif
-
-                                    regionList.push_back(m_rg_crse_patch);
-                                    regionList.push_back(csrcGraph);
-                                    mfList.push_back(dmf);
-
-                                    Perilla::multifabExtractCopyAssoc( m_rg_crse_patch, csrcGraph, *m_mf_crse_patch, *dmf, ncomp, m_mf_crse_patch->nGrow(), 0, geom_crse->periodicity());
-                                    m_amrlevel.parent->graphArray[level].push_back(m_rg_crse_patch);
-                                    m_amrlevel.parent->graphArray[level].push_back(csrcGraph);
-                                }
-                            }
-                            else
-                            {
-                                amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-                            }
-                        }
-                    }
-
-                    BL_ASSERT(scomp+ncomp <= smf_fine[0]->nComp());
-                    BL_ASSERT(dcomp+ncomp <= m_fabs.nComp());
-                    BL_ASSERT(smf_fine.size() == stime_fine.size());
-                    BL_ASSERT(smf_fine.size() != 0);
-
-                    //if (smf_fine.size() == 1) // probabily it should aways be this because same level
-                    //if (iter == 1)
-                    if(true) // it will always be the case because same level comm and time will be available
-                    {
-                        destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                        fsrcGraph = new RegionGraph(smf_fine[0]->IndexArray().size());
-
-                        regionList.push_back(destGraph);
-                        regionList.push_back(fsrcGraph);
-
-                        if(m_rg_crse_patch != 0)
-                        {
-                            destGraph->srcLinkGraph = m_rg_crse_patch;
-                            //for(int lfi=0; lfi < destGraph->numTasks; lfi++ )
-
-                            //std::cout << " m_mf_crse_patch->IndexArray().size() " << m_mf_crse_patch->IndexArray().size() << " size " << m_mf_crse_patch->size() << " myP " << myProc<< std::endl;
-
-                            {
-                                for (MFIter mfi(*(m_mf_crse_patch),false); mfi.isValid(); ++mfi)
-                                {
-                                    int li = mfi.LocalIndex();
-                                    int gi = m_fpc->dst_idxs[li];
-                                    //if(gi == m_mf_crse_patch->IndexArray()[li])
-                                    {
-                                        int lfi = m_fabs.localindex(gi);
-                                        destGraph->task[lfi]->depTasksCompleted = false;
-                                        destGraph->task[lfi]->depTaskIDs.push_back(li);
-                                    }
-                                }
-                            }
-                        }
-                        //if(level == 2)
-                        //std::cout<< "Sending In " << std::endl;
-                        Perilla::multifabExtractCopyAssoc( destGraph, fsrcGraph, m_fabs, *smf_fine[0], ncomp, m_fabs.nGrow(), 0, geom_fine->periodicity());
-
-                        m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                        m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                    }
-                    else if (smf_fine.size() == 2)
-                    //else if (iter > 1)
-                    {
-                        //BL_ASSERT(smf_fine[0]->boxArray() == smf_fine[1]->boxArray());
-                        //PArray<MultiFab> raii(PArrayManage);
-                        //MultiFab * dmf;
-
-                        if (m_fabs.boxArray() == smf_fine[0]->boxArray())
-                        {
-                            //dmf = &m_fabs;
-                            destGraph = new RegionGraph(m_fabs.IndexArray().size());
-
-                            Perilla::multifabBuildFabCon(destGraph, m_fabs, geom_fine->periodicity());
-                            m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                            regionList.push_back(destGraph);
-                        }
-                        else
-                        {
-                            //dmf = raii.push_back(new MultiFab(smf_fine[0]->boxArray(), m_amrlevel.dmap, ncomp, 0));
-                            dmff = new MultiFab(smf_fine[0]->boxArray(), smf_fine[0]->DistributionMap(), ncomp, 0);
-                            //dmff->initVal(); // for Perilla NUMA
-                            destGraph = new RegionGraph(m_fabs.IndexArray().size());
-                            fsrcGraph = new RegionGraph(dmff->IndexArray().size());
-                            fsrcGraph->buildTileArray(*dmff);
-                            regionList.push_back(destGraph);
-                            regionList.push_back(fsrcGraph);
-                            mfList.push_back(dmff);
-
-                            Perilla::multifabExtractCopyAssoc( destGraph, fsrcGraph, m_fabs, *dmff, ncomp, m_fabs.nGrow(), 0, geom_fine->periodicity());
-                            m_amrlevel.parent->graphArray[level].push_back(destGraph);
-                            m_amrlevel.parent->graphArray[level].push_back(fsrcGraph);
-                        }
-                    }
-                    else
-                    {
-                        amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-                    }
-                    //-------------------- FillFromTwoLevels initialization completed
-
-                } // if(level==1 OR ProperlyNested)
-                else
-                {
-                    amrex::Abort("initFillPatch: level is not properly nested");
-                }
-            }
-            DComp += NComp;
-        }
-
-        destGraph->buildTileArray(m_fabs);
-        destGraph->buildTileArray_gtbx(m_leveldata,boxGrow);
-        //MemOpt
-        //m_fabs.clear();
-    }
-
-    void AsyncFillPatchIterator::SendIntraLevel (RGIter& rgi, int boxGrow, Real time,
-                                                 int index, int scomp, int ncomp,
-                                                 int iteration, int f, bool singleT)
-    {
-        if(rgi.currentItr != rgi.totalItr)
-            return;
-
-        const int level = m_amrlevel.level;
-
-        int ncycle = m_amrlevel.parent->nCycle(level);
-        unsigned char pushLevel = 0x02;
-        PushOnly(boxGrow, time, index, scomp, ncomp, f, pushLevel, singleT);
-    }
-
-    void AsyncFillPatchIterator::SendInterLevel (RGIter* rgi, int boxGrow, Real time,
-                                                 int index, int scomp, int ncomp,
-                                                 int iteration, int f, bool singleT)
-    {
-        if(rgi->currentItr != rgi->totalItr)
-            return;
-
-        if(m_amrlevel.level-1 < m_amrlevel.parent->finestLevel())
-        {
-            unsigned char tuc = 0x01;
-            PushOnly(boxGrow, time+((iteration-1)*m_amrlevel.parent->dtLevel(m_amrlevel.level)), index, scomp, ncomp, f, tuc, singleT);
-        }
-    }
-
-    void AsyncFillPatchIterator::SendInterLevel (RGIter& rgi, int boxGrow, Real time,
-                                                 int index, int scomp, int ncomp,
-                                                 int iteration, int f, bool singleT)
-    {
-        SendInterLevel(&rgi, boxGrow, time, index, scomp, ncomp, iteration, f, singleT);
-    }
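SendIntraLevel and SendInterLevel above are guarded so the push happens only on the last sub-iteration of the region-graph iterator. A minimal sketch of that guard with a stand-in type (the real RGIter is a Perilla class with more state):

    #include <cstdio>

    struct RGIterLike {        // stand-in for the Perilla RGIter
        int currentItr;
        int totalItr;
    };

    void sendIntraLevel(const RGIterLike& rgi)
    {
        if (rgi.currentItr != rgi.totalItr) return;   // only send on the last sub-iteration
        std::printf("pushing same-level data\n");
    }

    int main()
    {
        for (int it = 1; it <= 3; ++it) {
            sendIntraLevel(RGIterLike{it, 3});        // prints once, at it == 3
        }
    }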
-    void AsyncFillPatchIterator::PushOnly (int boxGrow, Real time, int index, int scomp,
-                                           int ncomp, int f, unsigned char pushLevel, bool singleT)
-    {
-        BL_PROFILE("FillPatchIterator::InitializePush");
-        BL_ASSERT(scomp >= 0);
-        BL_ASSERT(ncomp >= 1);
-        BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-        //const IndexType& boxType = m_leveldata.boxArray().ixType();
-        const int level = m_amrlevel.level;
-
-        int myProc = amrex::ParallelDescriptor::MyProc();
-        for (int i = 0, DComp = 0; i < m_range.size(); i++)
-        {
-            if(i>0)
-                amrex::Abort("**** Error in FillPatchIterator::Initialize: non contigeous components not implemented");
-
-            const int SComp = m_range[i].first;
-            const int NComp = m_range[i].second;
-
-            if (level == 0)
-            {
-                Vector<MultiFab*> tsmf;
-                Vector<Real> tstime;
-                StateData& statedata = m_amrlevel.state[index];
-                statedata.getData(tsmf,tstime,time);
-                FillPatchSingleLevelPush (*(m_amrlevel.parent), m_fabs, time, tsmf, tstime, destGraph, fsrcGraph, f, dmf, SComp, DComp, NComp, *geom, *physbcf, singleT);
-            }else{
-                if (level == 1 || isProperlyNested)
-                {
-                    FillFromTwoLevelsPush(time, index, SComp, DComp, NComp, f, pushLevel, singleT);
-                }else {
-                    amrex::Abort("**** Error in FillPatchIterator::Initialize: !ProperlyNested not implemented");
-                }
-            }
-            DComp += NComp;
-        }
-    }
-
-    void AsyncFillPatchIterator::PullOnly (int boxGrow, Real time, int index, int scomp,
-                                           int ncomp, int f, bool singleT)
-    {
-        BL_PROFILE("FillPatchIterator::InitializePull");
-        BL_ASSERT(scomp >= 0);
-        BL_ASSERT(ncomp >= 1);
-        BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-        //const IndexType& boxType = m_leveldata.boxArray().ixType();
-        const int level = m_amrlevel.level;
-
-        for (int i = 0, DComp = 0; i < m_range.size(); i++)
-        {
-            if(i>0)
-                amrex::Abort("**** Error in FillPatchIterator::Initialize: non contigeous components not implemented");
-
-            const int SComp = m_range[i].first;
-            const int NComp = m_range[i].second;
-
-            if (level == 0)
-            {
-                FillPatchSingleLevelPull (m_fabs, time, smf, stime, destGraph, fsrcGraph, f, SComp, DComp, NComp, *geom, *physbcf, singleT);
-            }
-            else
-            {
-                if (level == 1 || isProperlyNested)
-                {
-                    FillFromTwoLevelsPull(time, index, SComp, DComp, NComp, f, singleT);
-                } else {
-                    amrex::Abort("**** Error in FillPatchIterator::Initialize: !ProperlyNested not implemented");
-                }
-            }
-            //if(WorkerThread::isTeamMasterThread(tid))
-            {
-                const MultiFab& mf_fillpatched = m_fabs;
-                if(singleT)
-                {
-                    for(int t=0; t<destGraph->fabTiles_gtbx[f]->numTiles; t++)
-                    {
-                        const Box& bx = *(destGraph->fabTiles_gtbx[f]->tileBx[t]);
-                        MultiFab::Copy(m_leveldata, mf_fillpatched, f, 0, DComp, ncomp, bx);
-                    }
-                }
-                else
-                {
-                    int nt = perilla::wtid();
-                    int totalCompThreads= perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS;
-                    for(int t=nt; t<destGraph->fabTiles_gtbx[f]->numTiles; t+= totalCompThreads)
-                    {
-                        const Box& bx = *(destGraph->fabTiles_gtbx[f]->tileBx[t]);
-                        MultiFab::Copy(m_leveldata, mf_fillpatched, f, 0, DComp, ncomp, bx);
-                    }
-                }
-            }
-            DComp += NComp;
-        }
-    }
-
-    void
-    AsyncFillPatchIterator::PullOnly (MultiFab& dest, int boxGrow, Real time, int index,
-                                      int scomp, int ncomp, int f, bool singleT)
-    {
-        BL_PROFILE("FillPatchIterator::InitializePull");
-        BL_ASSERT(scomp >= 0);
-        BL_ASSERT(ncomp >= 1);
-        BL_ASSERT(0 <= index && index < AmrLevel::desc_lst.size());
-
-        int myProc = amrex::ParallelDescriptor::MyProc();
-
-        //const IndexType& boxType = m_leveldata.boxArray().ixType();
-        const int level = m_amrlevel.level;
-
-        for (int i = 0, DComp = 0; i < m_range.size(); i++)
-        {
-            if(i>0)
-                amrex::Abort("**** Error in FillPatchIterator::Initialize: non contigeous components not implemented");
-
-            const int SComp = m_range[i].first;
-            const int NComp = m_range[i].second;
-
-            if (level == 0)
-            {
-                //double start_time_wtime = omp_get_wtime();
-                try{
-                    //MemOpt
-                    FillPatchSingleLevelPull (dest, time, smf, stime, destGraph, fsrcGraph, f, SComp, DComp, NComp, *geom, *physbcf, singleT);
-                    //amrex::FillPatchSingleLevelPull (m_fabs, time, smf, stime, destGraph, fsrcGraph, f, tid, SComp, DComp, NComp, *geom, *physbcf, singleT);
-                }
-                catch(std::exception& e){
-                    std::cout<< "AFPI_Receive_FPSLPull: Proc " << myProc << std::endl;
-                }
-            }
-            else
-            {
-                if (level == 1 || isProperlyNested)
-                {
-                    FillFromTwoLevelsPull(dest, time, index, SComp, DComp, NComp, f, singleT);
-                } else {
-                    amrex::Abort("**** Error in FillPatchIterator::Initialize: !ProperlyNested not implemented");
-                }
-            }
-            DComp += NComp;
-        }
-    }
-
-    void AsyncFillPatchIterator::FillPatchSingleLevelPush (Amr& amr, MultiFab& mf, Real time,
-                                                           Vector<MultiFab*>& smf, const Vector<Real>& stime,
-                                                           RegionGraph* destGraph, RegionGraph* srcGraph, int f,
-                                                           MultiFab *dmf,
-                                                           int scomp, int dcomp, int ncomp,
-                                                           const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT)
-    {
-        BL_PROFILE("FillPatchSingleLevel");
-        BL_ASSERT(scomp+ncomp <= smf[0]->nComp());
-        BL_ASSERT(dcomp+ncomp <= mf.nComp());
-        BL_ASSERT(smf.size() == stime.size());
-        BL_ASSERT(smf.size() != 0);
-
-        int tg = perilla::wid();
-        int nt = perilla::wtid();
-
-        if (smf.size() == 1)
-        {
-            //mf.copy(smf[0], scomp, dcomp, ncomp, 0, mf.nGrow(), geom.periodicity());
-            Perilla::multifabCopyPush(destGraph, srcGraph, &mf, smf[0], f, dcomp, scomp, ncomp, mf.nGrow(), 0, singleT);
-        }
-        else if (smf.size() == 2)
-        {
-            BL_ASSERT(smf[0]->boxArray() == smf[1]->boxArray());
-            //PArray<MultiFab> raii(PArrayManage);
-            //MultiFab * dmf;
-            int destcomp;
-            bool sameba;
-            //if (false && mf.boxArray() == smf[0]->boxArray())
-            if (mf.boxArray() == smf[0]->boxArray())
-            {
-                std::cout << "FillPatchUtil SLPush Nt Handled" << std::endl;
-
-                //dmf = &mf;
-                destcomp = dcomp;
-                sameba = true;
-
-                int fis = smf[0]->IndexArray()[f];
-                int fid = mf.IndexArray()[f];
-
-                const Box& bx = mf[fid].box();
-                mf[fid].linInterp(smf[0]->get(fis), scomp, smf[1]->get(fis), scomp,
-                                  stime[0], stime[1], time, bx, destcomp, ncomp);
-                Perilla::fillBoundaryPush(destGraph, &mf, f);
-            }
-            else
-            {
-                destcomp = 0;
-                sameba = false;
-
-                int fis = smf[0]->IndexArray()[f];
-                int fid = dmf->IndexArray()[f];
-
-                for(int t=0; t<srcGraph->fabTiles[f]->numTiles; t++)
-                    if( singleT || t % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == nt)
-                    {
-                        const Box& bx = *(srcGraph->fabTiles[f]->tileBx[t]);
-                        if(bx.ok())
-                            (*dmf)[fid].linInterp(smf[0]->get(fis), scomp, smf[1]->get(fis), scomp,
-                                                  stime[0], stime[1], time, bx, destcomp, ncomp);
-                    }
-                if(!singleT)
-                    srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads
-                int src_ngrow = 0;
-                int dst_ngrow = mf.nGrow();
-                Perilla::multifabCopyPush( destGraph, srcGraph, &mf, dmf, f, dcomp, 0, ncomp, mf.nGrow(), 0, singleT);
-            }
-        }
-        else
-        {
-            amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-        }
-    }
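The linInterp calls above blend the two stored state times: for t0 <= t <= t1, u(t) = u(t0) + (t - t0)/(t1 - t0) * (u(t1) - u(t0)), applied cell by cell over each tile box. The same formula on a plain array (a sketch, not the FArrayBox API):

    #include <cstdio>

    int main()
    {
        const double t0 = 0.0, t1 = 1.0, t = 0.25;   // two state times and the target time
        const double u0[3] = {1.0, 2.0, 3.0};        // snapshot at t0
        const double u1[3] = {2.0, 4.0, 6.0};        // snapshot at t1

        double u[3];
        const double w = (t - t0) / (t1 - t0);       // interpolation weight
        for (int i = 0; i < 3; ++i) {
            u[i] = u0[i] + w * (u1[i] - u0[i]);
        }
        std::printf("u = {%g, %g, %g}\n", u[0], u[1], u[2]);   // {1.25, 2.5, 3.75}
    }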
-
-    void AsyncFillPatchIterator::FillPatchSingleLevelPull (MultiFab& mf, Real time,
-                                                           Vector<MultiFab*>& smf, const Vector<Real>& stime,
-                                                           RegionGraph* destGraph, RegionGraph* srcGraph, int f,
-                                                           int scomp, int dcomp, int ncomp,
-                                                           const Geometry& geom, StateDataPhysBCFunct& physbcf, bool singleT)
-    {
-        BL_PROFILE("FillPatchSingleLevel");
-
-        BL_ASSERT(scomp+ncomp <= smf[0]->nComp());
-        BL_ASSERT(dcomp+ncomp <= mf.nComp());
-        BL_ASSERT(smf.size() == stime.size());
-        BL_ASSERT(smf.size() != 0);
-
-        int tg = perilla::wid();
-
-        if (smf.size() == 1)
-        {
-            //mf.copy(smf[0], scomp, dcomp, ncomp, 0, mf.nGrow(), geom.periodicity());
-            Perilla::multifabCopyPull( destGraph, srcGraph, &mf, smf[0], f, dcomp, scomp, ncomp, mf.nGrow(), 0, singleT);
-        }
-        else if (smf.size() == 2)
-        {
-            BL_ASSERT(smf[0]->boxArray() == smf[1]->boxArray());
-            Vector<MultiFab*> raii;
-            MultiFab * dmf;
-            int destcomp;
-            bool sameba;
-            //if (false && mf.boxArray() == smf[0]->boxArray()) {
-            if (mf.boxArray() == smf[0]->boxArray()) {
-                //dmf = &mf;
-                destcomp = dcomp;
-                sameba = true;
-            } else {
-                //dmf = srcGraph->assocMF;
-                destcomp = 0;
-                sameba = false;
-            }
-            if (sameba)
-            {
-                // Note that when sameba is true mf's BoxArray is nonoverlapping.
-                // So FillBoundary is safe.
-                //mf.FillBoundary(dcomp,ncomp,geom.periodicity());
-
-                Perilla::fillBoundaryPull(destGraph, &mf, f, singleT);
-
-                //std::cout << "After sameba fBPull" << std::endl;
-            }
-            else
-            {
-                int src_ngrow = 0;
-                int dst_ngrow = mf.nGrow();
-                MultiFab* dummyMF;
-
-                //mf.copy(*dmf, 0, dcomp, ncomp, src_ngrow, dst_ngrow, geom.periodicity());
-
-                Perilla::multifabCopyPull( destGraph, srcGraph, &mf, dummyMF, f, dcomp, 0, ncomp, mf.nGrow(), 0, singleT);
-            }
-        }
-        else {
-            amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet");
-        }
-
-        if(!singleT)
-            destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-    }
-
-    void AsyncFillPatchIterator::FillPatchTwoLevelsPush (Amr& amr, MultiFab& mf, Real time,
-                                                         Vector<MultiFab*>& cmf, Vector<Real>& ct,
-                                                         Vector<MultiFab*>& fmf, Vector<Real>& ft,
-                                                         RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f,
-                                                         AsyncFillPatchIterator* fpIter,
-                                                         MultiFab *dmf, MultiFab *dmff,
-                                                         int scomp, int dcomp, int ncomp,
-                                                         const Geometry& cgeom, const Geometry& fgeom,
-                                                         StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc,
-                                                         const IntVect& ratio,
-                                                         Interpolater* mapper, const Vector<BCRec>& bcs, unsigned char pushLevel, bool singleT)
-    {
-        BL_PROFILE("FillPatchTwoLevels");
-
-        int ngrow = mf.nGrow();
-
-        if(f>=0){ //fill only this fab
-            if(pushLevel & 0x01 )
-            {
-                if (ngrow > 0 || mf.getBDKey() != fmf[0]->getBDKey())
-                {
-                    if (!fpIter->m_fpc->ba_crse_patch.empty())
-                    {
-                        FillPatchSingleLevelPush(amr, *(fpIter->m_mf_crse_patch), time, cmf, ct, fpIter->m_rg_crse_patch, csrcGraph, f, dmf, scomp, 0, ncomp, cgeom, cbc, singleT);
-                    }
-                }
-            }
-            if((pushLevel & 0x02) && (pushLevel != 0x03))
-            {
-                FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, f, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT);
-            }
-        }else{ //fill the whole multifab
-            if(pushLevel & 0x01 && pushLevel & 0x02)
-            {
-                int tg = perilla::wid();
-                for(int fi=0; fi < fmf[0]->IndexArray().size(); fi++)
-                {
-                    if(WorkerThread::isMyRegion(tg,fi))
-                    {
-                        FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, fi, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT);
-                    }
-                }
-            }
-            if(pushLevel & 0x04)
-            {
-                int tg = perilla::wid();
-                for(int fi=0; fi < fmf[0]->IndexArray().size(); fi++)
-                {
-                    if(WorkerThread::isMyRegion(tg,fi))
-                    {
-                        FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, fi, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT);
-                    }
-                }
-            }
-        }
-
-#if 0
-        BL_PROFILE("FillPatchTwoLevels");
-
-        int ngrow = mf.nGrow();
-
-        if(pushLevel & 0x01 )
-        {
-            if (ngrow > 0 || mf.getBDKey() != fmf[0]->getBDKey())
-            {
-                if ( !
-                     fpIter->m_fpc->ba_crse_patch.empty())
-                {
-                    FillPatchSingleLevelPush(amr, *(fpIter->m_mf_crse_patch), time, cmf, ct, fpIter->m_rg_crse_patch, csrcGraph, f, tid, dmf, scomp, 0, ncomp, cgeom, cbc, singleT);
-
-                }
-            }
-
-            if(tf == 0 && (pushLevel & 0x02) )
-            {
-                int tg = WorkerThread::groupID(tid);
-                for(int fi=0; fi < fmf[0]->IndexArray().size(); fi++)
-                {
-                    if(WorkerThread::isMyRegion(tg,fi))
-                    {
-                        FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, fi, tid, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT);
-                    }
-                }
-            }
-        }
-        if(tf == 0 && (pushLevel & 0x04) )
-        {
-            int tg = WorkerThread::groupID(tid);
-            for(int fi=0; fi < fmf[0]->IndexArray().size(); fi++)
-            {
-                if(WorkerThread::isMyRegion(tg,fi))
-                {
-                    FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, fi, tid, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT);
-                }
-            }
-        }
-
-        if((pushLevel & 0x02) && (pushLevel != 0x03))
-        {
-            FillPatchSingleLevelPush(amr, mf, time, fmf, ft, destGraph, fsrcGraph, f, tid, dmff, scomp, dcomp, ncomp, fgeom, fbc, singleT);
-        }
-#endif
-    }
-
-    void AsyncFillPatchIterator::FillPatchTwoLevelsPull (MultiFab& mf, Real time,
-                                                         Vector<MultiFab*>& cmf, Vector<Real>& ct,
-                                                         Vector<MultiFab*>& fmf, Vector<Real>& ft,
-                                                         RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f,
-                                                         AsyncFillPatchIterator* fpIter,
-                                                         int scomp, int dcomp, int ncomp,
-                                                         const Geometry& cgeom, const Geometry& fgeom,
-                                                         StateDataPhysBCFunct& cbc, StateDataPhysBCFunct& fbc,
-                                                         const IntVect& ratio,
-                                                         Interpolater* mapper, const Vector<BCRec>& bcs, bool singleT)
-    {
-        BL_PROFILE("FillPatchTwoLevels");
-        int ngrow = mf.nGrow();
-        int tg = WorkerThread::perilla_wid();
-        int nt = WorkerThread::perilla_wtid();
-
-        if (ngrow > 0 || mf.getBDKey() != fmf[0]->getBDKey())
-        {
-            if ( ! fpIter->m_fpc->ba_crse_patch.empty())
-            {
-                int idummy1=0, idummy2=0;
-                bool cc = fpIter->m_fpc->ba_crse_patch.ixType().cellCentered();
-                {
-                    int gi = mf.IndexArray()[f];
-                    for(int i=0; i<destGraph->task[f]->depTaskIDs.size(); i++)
-                    {
-                        int li = destGraph->task[f]->depTaskIDs[i];
-                        int mfi = fpIter->m_mf_crse_patch[0].IndexArray()[li];
-                        FillPatchSingleLevelPull(*(fpIter->m_mf_crse_patch), time, cmf, ct, fpIter->m_rg_crse_patch, csrcGraph, li, scomp, 0, ncomp, cgeom, cbc, singleT);
-                    }
-                    if(!singleT)
-                        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-                    int nt = WorkerThread::perilla_wtid();
-                    Box fdomain = fgeom.Domain();
-                    for(int i=0; i<destGraph->task[f]->depTaskIDs.size(); i++)
-                    {
-                        int li = destGraph->task[f]->depTaskIDs[i];
-                        int mfi = fpIter->m_mf_crse_patch[0].IndexArray()[li];
-                        if(singleT)
-                        {
-                            const Box& dbx = fpIter->m_fpc->dst_boxes[li];
-                            //Array<BCRec> bcr(ncomp);
-                            Vector<BCRec> bcr(ncomp);
-                            amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr);
-
-                            mapper->interp(fpIter->m_mf_crse_patch[0][mfi], 0, mf[gi], dcomp, ncomp, dbx,
-                                           ratio, cgeom, fgeom, bcr, idummy1, idummy2, RunOn::Cpu);
-                        }
-                        else
-                        {
-                            if(!cc)
-                            {
-                                if(WorkerThread::perilla_isMasterWorkerThread())
-                                {
-                                    const Box& dbx = fpIter->m_fpc->dst_boxes[li];
-                                    //Box fdomain = fgeom.Domain();
-
-                                    Vector<BCRec> bcr(ncomp);
-                                    amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr);
-
-                                    mapper->interp(fpIter->m_mf_crse_patch[0][mfi], 0, mf[gi], dcomp, ncomp, dbx,
-                                                   ratio, cgeom, fgeom, bcr, idummy1, idummy2, RunOn::Cpu);
-                                }
-                            }
-                            else
-                            {
-                                for(int j=0; j < fpIter->m_rg_crse_patch->fabTiles[li]->numTiles; j++)
-                                    if(j % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == nt)
-                                    {
-                                        const Box& dbx = *(fpIter->m_rg_crse_patch->fabTiles[li]->tileBx[j]);
-                                        if(dbx.ok())
-                                        {
-                                            Vector<BCRec> bcr(ncomp);
-                                            amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr);
-                                            mapper->interp(fpIter->m_mf_crse_patch[0][mfi], 0, mf[gi], dcomp, ncomp, dbx,
-                                                           ratio, cgeom, fgeom, bcr, idummy1, idummy2, RunOn::Cpu);
-                                        }
-                                    }
-                            }
-                        }
-                        //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-                    }
-                    if(!singleT)
-                        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-                }
-            }
-        }
-
-        FillPatchSingleLevelPull(mf, time, fmf, ft, destGraph, fsrcGraph, f, scomp, dcomp, ncomp, fgeom, fbc, singleT);
-    }
-
-#if 0
-    void AsyncFillPatchIterator::FillPatchTwoLevelsPull (MultiFab& mf, Real time,
-                                                         Vector<MultiFab*>& cmf, const Vector<Real>& ct,
-                                                         Vector<MultiFab*>& fmf, const Vector<Real>& ft,
-                                                         RegionGraph* destGraph, RegionGraph* csrcGraph, RegionGraph* fsrcGraph, int f, int tid,
-                                                         AsyncFillPatchIterator* fpIter,
-                                                         int scomp, int dcomp, int ncomp,
-                                                         const Geometry& cgeom, const Geometry& fgeom,
-                                                         PhysBCFunctBase& cbc, PhysBCFunctBase& fbc,
-                                                         const IntVect& ratio,
-                                                         Interpolater* mapper, const Vector<BCRec>& bcs, bool singleT)
-    {
-        BL_PROFILE("FillPatchTwoLevels");
-
-        int ngrow = mf.nGrow();
-
-        int tg = perilla::wid();//WorkerThread::groupID(tid);
-        int nt = perilla::wtid();//WorkerThread::numaTID(tid);
-
-        int myProc = ParallelDescriptor::MyProc();
-        //std::ofstream fout;
-        //fout.open(std::to_string(myProc)+ "_" + std::to_string(tid) + ".txt", std::fstream::app);
-
-        if (ngrow > 0 || mf.getBDKey() != fmf[0]->getBDKey())
-        {
-            //fout << "FPTL fpIter->m_fpc->ba_crse_patch.empty() " << fpIter->m_fpc->ba_crse_patch.empty() << std::endl;
-
-            if ( ! fpIter->m_fpc->ba_crse_patch.empty())
-            {
-                int idummy1=0, idummy2=0;
-                bool cc = fpIter->m_fpc->ba_crse_patch.ixType().cellCentered();
-
-                //std::cout << "Check CC : " << cc << std::endl;
-                //#ifdef _OPENMP
-                //#pragma omp parallel if (cc)
-                //#endif
-
-                //for (MFIter mfi(*(fpIter->m_mf_crse_patch),false,false); mfi.isValid(); ++mfi)
-                {
-                    //int li = mfi.LocalIndex();
-                    //int gi = fpIter->m_fpc->dst_idxs[li];
-                    //if(gi == mf.IndexArray()[f])
-                    int gi = mf.IndexArray()[f];
-                    //if(gi == f)
-
-                    //fout << "FPTL gi " << gi << " f " << f << " destGraph->task[f]->depTaskIDs.size() " << destGraph->task[f]->depTaskIDs.size() << std::endl;
-
-                    //double start_time_wtime = omp_get_wtime();
-
-                    for(int i=0; i<destGraph->task[f]->depTaskIDs.size(); i++)
-                    {
-                        int li = destGraph->task[f]->depTaskIDs[i];
-                        int mfi = fpIter->m_mf_crse_patch[0].IndexArray()[li];
-
-                        //fout << "Calling FPSL for dependent gi "<< gi << " li " << li << std::endl;
-                        FillPatchSingleLevelPull(*(fpIter->m_mf_crse_patch), time, cmf, ct, fpIter->m_rg_crse_patch, csrcGraph, li, tid, scomp, 0, ncomp, cgeom, cbc, singleT);
-                    }
-                    //if(!singleT)
-                    //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-
-                    //double end_time_wtime = omp_get_wtime();
-                    /*if(singleT)
-                        Perilla::getPPPTimeSplit[8] += end_time_wtime - start_time_wtime;
-                    else
-                        if(WorkerThread::isTeamMasterThread(tid))
-                            Perilla::getPPPTimeSplit[8] += end_time_wtime - start_time_wtime;
-                    */
-                    //if(myProc == 0 && nt == 4)
-                    //std::cout << "CC " << cc << " DepTasks " << destGraph->task[f]->depTaskIDs.size() << std::endl;
-
-                    //start_time_wtime = omp_get_wtime();
-                    int nt = perilla::wtid();//WorkerThread::numaTID(tid);
-                    Box fdomain = fgeom.Domain();
-                    for(int i=0; i<destGraph->task[f]->depTaskIDs.size(); i++)
-                    {
-                        int li = destGraph->task[f]->depTaskIDs[i];
-                        int mfi = fpIter->m_mf_crse_patch[0].IndexArray()[li];
-                        /*
-                        const Box& dbx1 = fpIter->m_fpc->dst_boxes[li];
-
-                        std::ofstream fout;
-                        fout.open(std::to_string(myProc)+ "_" + std::to_string(tid) + ".txt", std::fstream::app);
-                        fout << "i "<< i << " depsiz " << destGraph->task[f]->depTaskIDs.size() <<" li "<< li << " f "<< f << " mfi "<< mfi << " gi " << gi << std::endl;
-                        fout <<" numFabs " << fpIter->m_rg_crse_patch->fabTiles.size() << " numTls " << fpIter->m_rg_crse_patch->fabTiles[li]->numTiles << " ndbs " << fpIter->m_fpc->dst_boxes.size() << std::endl;
-                        //fout <<"dbx " << dbx << std::endl;
-                        fout <<"dst_bxs " << dbx1 << std::endl;
-                        fout <<"fine bx " << mf[gi].box() << std::endl;
-                        fout <<"crse bx " << fpIter->m_mf_crse_patch[0][mfi].box() << std::endl;
-                        fout.close();
-                        */
-
-                        if(singleT)
-                        {
-                            const Box& dbx = fpIter->m_fpc->dst_boxes[li];
-                            //Box fdomain = fgeom.Domain();
-
-                            Vector<BCRec> bcr(ncomp);
-                            amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr);
-
-                            mapper->interp(fpIter->m_mf_crse_patch[0][mfi], 0, mf[gi], dcomp, ncomp, dbx,
-                                           ratio, cgeom, fgeom, bcr, idummy1, idummy2, RunOn::Cpu);
-                        }
-                        else
-                        {
-                            if(!cc)
-                            {
-                                if(WorkerThread::isTeamMasterThread(tid))
-                                {
-                                    const Box& dbx = fpIter->m_fpc->dst_boxes[li];
-                                    //Box fdomain = fgeom.Domain();
-
-                                    Array<BCRec> bcr(ncomp);
-                                    amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr);
-
-                                    mapper->interp(fpIter->m_mf_crse_patch[0][mfi], 0, mf[gi], dcomp, ncomp, dbx,
-                                                   ratio, cgeom, fgeom, bcr, idummy1, idummy2, RunOn::Cpu);
-                                }
-                            }
-                            else
-                            {
-                                //std::cout << "myP " << myProc << " nt "<< nt << " li "<< li << " mfi " << mfi << " ntiles " << fpIter->m_rg_crse_patch->fabTiles.size()<< std::endl;
-                                ///for(int j=0; j < fpIter->m_rg_crse_patch->fabTiles[f]->numTiles; j++)
-                                for(int j=0; j < fpIter->m_rg_crse_patch->fabTiles[li]->numTiles; j++)
-                                    if(j % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == nt-perilla::NUM_COMM_THREADS)
-                                    ///if(i % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == nt-perilla::NUM_COMM_THREADS)
-                                    {
-                                        //if(myProc == 0 && nt == 4)
-                                        //std::cout << "CC " << cc << " DepTasks " << destGraph->task[f]->depTaskIDs.size() << " i " << i << std::endl;
-                                        ///const Box& dbx = fpIter->m_fpc->dst_boxes[li];
-                                        ///const Box& dbx = *(fpIter->m_rg_crse_patch->fabTiles[f]->tileBx[j]);
-                                        const Box& dbx = *(fpIter->m_rg_crse_patch->fabTiles[li]->tileBx[j]);
-                                        //Box fdomain = fgeom.Domain();
-
-                                        //if(myProc == 0 && nt == 4)
-                                        //std::cout << "CC " << cc << " DepTasks " << destGraph->task[f]->depTaskIDs.size() << " i " << i << " dbx " << dbx << std::endl;
-
-                                        //fout << "FPTL interping gi "<< gi << " li " << li<< " dbx " << dbx << std::endl;
-                                        if(dbx.ok())
-                                        {
-                                            Vector<BCRec> bcr(ncomp);
-                                            amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr);
-                                            mapper->interp(fpIter->m_mf_crse_patch[0][mfi], 0, mf[gi], dcomp, ncomp, dbx,
-                                                           ratio, cgeom, fgeom, bcr, idummy1, idummy2, RunOn::Cpu);
-                                        }
-                                    }
-                            }
-                        }
-                        //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-                    }
-                    if(!singleT)
-                        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-                    /*
-                    end_time_wtime = omp_get_wtime();
-                    if(singleT)
-                        Perilla::getPPPTimeSplit[9] += end_time_wtime - start_time_wtime;
-                    else
-                        if(WorkerThread::isTeamMasterThread(tid))
-                            Perilla::getPPPTimeSplit[9] += end_time_wtime - start_time_wtime;
-                    */
-                }
-            }
-        }
-        /*
-        int mfi = mf.IndexArray()[f];
-        const Box& bx = mf[mfi].box();
-        //if(mfi == 0 )
-        {
-            fout << "Before second FPSL at FPSL mfi " << mfi << " bx " << bx.smallEnd() << bx.bigEnd() << std::endl;
-            for(int i=bx.smallEnd(0); i<=bx.smallEnd(0); i++)
-            {
-
-
-
-    void AsyncFillPatchIterator::initialSend(Vector<AsyncFillPatchIterator*> afpi,
-                                             Vector<AsyncFillPatchIterator*> upper_afpi,
-                                             int  boxGrow,
-                                             Real time,
-                                             int  state_indx,
-                                             int  scomp,
-                                             int  ncomp,
-                                             int  iteration)
-    {
-        int myProc = amrex::ParallelDescriptor::MyProc();
-        int level = afpi[iteration-1]->m_amrlevel.level;
-        if(level == 0 && iteration == 1)
-        {
-            int tg = perilla::wid();
-            for(int f=0; f < afpi[iteration-1]->m_fabs.IndexArray().size(); f++)
-            {
-                if(WorkerThread::isMyRegion(tg, f))
-                {
-                    for(int i=0; i < afpi[iteration-1]->m_amrlevel.parent->nCycle(level); i++){
-                        //fill neighbor fabs of the same AMR level
-                        afpi[i]->PushOnly( boxGrow, time+(i*afpi[iteration-1]->m_amrlevel.parent->dtLevel(level)), state_indx, scomp, ncomp, f, 0xFF, false);
-                    }
-                }
-            }
-        }
-
-        if(level < afpi[iteration-1]->m_amrlevel.parent->finestLevel())
-        {
-            int i = 0;
-            unsigned char tuc = 0x04;
-            //init Fill Patch at the next finer AMR level
-            upper_afpi[i]->PushOnly(boxGrow, time+(i*afpi[iteration-1]->m_amrlevel.parent->dtLevel(level+1)), state_indx, scomp, ncomp, -1/* all FABs*/, tuc, false);
-        }
-    }
-
-    void AsyncFillPatchIterator::Receive (RGIter& rgi,
-                                          int  boxGrow,
-                                          Real time,
-                                          int  index,
-                                          int  scomp,
-                                          int  ncomp,
-                                          int  f,
-                                          bool singleT)
-    {
-        if(rgi.currentItr != 1)
-            return;
-
-        PullOnly(boxGrow, time, index, scomp, ncomp, f, singleT);
-    }
-
-    void AsyncFillPatchIterator::Receive (RGIter* rgi,
-                                          int  boxGrow,
-                                          Real time,
-                                          int  index,
-                                          int  scomp,
-                                          int  ncomp,
-                                          int  f,
-                                          bool singleT)
-    {
-        if(rgi->currentItr != 1)
-            return;
-
-        PullOnly(boxGrow, time, index, scomp, ncomp, f, singleT);
-    }
-
-    void AsyncFillPatchIterator::Receive (RGIter& rgi,
-                                          MultiFab& dest,
-                                          int  boxGrow,
-                                          Real time,
-                                          int  index,
-                                          int  scomp,
-                                          int  ncomp,
-                                          int  f,
-                                          bool singleT)
-    {
-        if(rgi.currentItr != 1)
-            return;
-
-        PullOnly(dest, boxGrow, time, index, scomp, ncomp, f, singleT);
-    }
-
-    void AsyncFillPatchIterator::Receive (RGIter* rgi,
-                                          MultiFab& dest,
-                                          int  boxGrow,
-                                          Real time,
-                                          int  index,
-                                          int  scomp,
-                                          int  ncomp,
-                                          int  f,
-                                          bool singleT)
-    {
-        if(rgi->currentItr != 1)
-            return;
-
-        PullOnly(dest, boxGrow, time, index, scomp, ncomp, f, singleT);
-    }
-}//end amrex namespace
-
-#endif
-//end USE_PERILLA
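All four `Receive` overloads removed above share one guard: the pull runs only on the first pass of the region-graph iterator, so later revisits of the same region are no-ops. A minimal sketch of that guard idiom (illustrative types, not the Perilla API):

```cpp
// Sketch of the "first-iteration-only" guard used by the Receive overloads.
// RGIterLike and pullOnly are made-up stand-ins for RGIter and PullOnly.
#include <cstdio>

struct RGIterLike { int currentItr; };

void pullOnly () { std::printf("PullOnly executed\n"); }

void receive (const RGIterLike& rgi)
{
    if (rgi.currentItr != 1) return; // later iterations: already pulled
    pullOnly();
}

int main ()
{
    RGIterLike first{1};
    RGIterLike second{2};
    receive(first);   // triggers the pull
    receive(second);  // no-op
}
```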
diff --git a/Src/Amr/AMReX_Extrapolater.cpp b/Src/Amr/AMReX_Extrapolater.cpp
index c4e97decfc7..8ce3c6974d5 100644
--- a/Src/Amr/AMReX_Extrapolater.cpp
+++ b/Src/Amr/AMReX_Extrapolater.cpp
@@ -1,53 +1,51 @@
 #include <AMReX_Extrapolater.H>
+#include <AMReX_extrapolater_K.H>
 #include <AMReX_iMultiFab.H>
 
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
-extern "C"
-{
-    void amrex_first_order_extrap(amrex::Real* u, const int* ulo, const int* uhi, const int& nu,
-                                  const int* msk, const int* mlo, const int* mhi,
-                                  const int* lo, const int* hi,
-                                  const int& scomp, const int& ncomp);
-}
-
 namespace amrex { namespace Extrapolater
 {
     void FirstOrderExtrap (MultiFab& mf, const Geometry& geom, int scomp, int ncomp)
     {
-        Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO gpu
+        BL_ASSERT(mf.nGrow() == 1);
+        BL_ASSERT(scomp >= 0);
+        BL_ASSERT((scomp+ncomp) <= mf.nComp());
 
-        BL_ASSERT(mf.nGrow() == 1);
-        BL_ASSERT(scomp >= 0);
-        BL_ASSERT(ncomp <= mf.nComp());
-
-        iMultiFab mask(mf.boxArray(), mf.DistributionMap(), 1, 1, MFInfo(),
+        iMultiFab mask(mf.boxArray(), mf.DistributionMap(), 1, 1, MFInfo(),
                        DefaultFabFactory<IArrayBox>());
-        mask.BuildMask(geom.Domain(), geom.periodicity(),
-                       finebnd, crsebnd, physbnd, interior);
-
-        int N = mf.nComp();
+        mask.BuildMask(geom.Domain(), geom.periodicity(),
+                       finebnd, crsebnd, physbnd, interior);
 
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-        for (MFIter mfi(mf); mfi.isValid(); ++mfi)
-        {
-            const Box& bx = mfi.validbox();
-            const IArrayBox& maskfab = mask[mfi];
-            const Box& maskbox = maskfab.box();
-            FArrayBox& datafab = mf[mfi];
-            const Box& databox = datafab.box();
-
-            amrex_first_order_extrap(datafab.dataPtr(), databox.loVect(), databox.hiVect(), N,
-                                     maskfab.dataPtr(), maskbox.loVect(), maskbox.hiVect(),
-                                     bx.loVect(), bx.hiVect(), scomp, ncomp);
-        }
+        for (MFIter mfi(mf); mfi.isValid(); ++mfi)
+        {
+            const Box& bx = mfi.validbox();
+            auto const& mask_arr = mask.const_array(mfi);
+            auto const& data_arr = mf.array(mfi,scomp);
+
+            if (Gpu::inLaunchRegion()) {
+                ParallelFor(amrex::grow(bx,1), ncomp,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+                {
+                    if (mask_arr(i,j,k) == crsebnd) data_arr(i,j,k,n) = 0.0;
+                });
+                ParallelFor(amrex::grow(bx,1), ncomp,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+                {
+                    amrex_first_order_extrap_gpu(i, j, k, n, bx, mask_arr, data_arr);
+                });
+            } else {
+                amrex_first_order_extrap_cpu(bx, ncomp, mask_arr, data_arr);
+            }
+        }
     }
 }
 }
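A note on the two separate `ParallelFor` launches in the GPU branch above: the first pass zeroes every coarse ghost cell so that the mask-weighted sums in the extrapolation pass never read uninitialized memory (zero times NaN is still NaN), and the launch boundary doubles as a synchronization point between the two passes. A minimal 1-D illustration of the same two-pass idea, in plain C++ with made-up values rather than AMReX types:

```cpp
// Two-pass first-order extrapolation on a 1-D strip: pass 1 clears coarse
// ghost cells, pass 2 copies in the adjacent fine value. The -7.0 entries
// stand in for whatever garbage the ghost cells held before the call.
#include <cstdio>

int main ()
{
    constexpr int crse = 0;
    int    mask[6] = {crse, 1, 1, 1, 1, crse}; // 1 = fine interior, 0 = coarse ghost
    double u[6]    = {-7.0, 2.0, 3.0, 4.0, 5.0, -7.0};

    for (int i = 0; i < 6; ++i) {            // pass 1: zero coarse cells
        if (mask[i] == crse) u[i] = 0.0;
    }
    for (int i = 0; i < 6; ++i) {            // pass 2: first-order extrapolation
        if (mask[i] == crse) u[i] = (i == 0) ? u[1] : u[4];
    }
    for (double v : u) std::printf("%g ", v); // 2 2 3 4 5 5
    std::printf("\n");
}
```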
diff --git a/Src/Amr/AMReX_StateData.cpp b/Src/Amr/AMReX_StateData.cpp
index 8014c4b38c0..b61dd8ced6a 100644
--- a/Src/Amr/AMReX_StateData.cpp
+++ b/Src/Amr/AMReX_StateData.cpp
@@ -1,9 +1,8 @@
 #include
+#include
 #include
-#include
-
 #include
 #include
 #include
@@ -16,7 +15,11 @@
 namespace amrex {
 
-static constexpr Real INVALID_TIME = -1.0e200;
+#ifdef AMREX_USE_FLOAT
+static constexpr Real INVALID_TIME = -1.0e30;
+#else
+static constexpr Real INVALID_TIME = -1.0e200;
+#endif
 
 static constexpr int MFNEWDATA = 0;
 static constexpr int MFOLDDATA = 1;
@@ -460,7 +463,7 @@ StateData::FillBoundary (FArrayBox& dest,
         const int sc  = src_comp+i;
         Real*     dat = dest.dataPtr(dc);
 
-        if (desc->master(sc))
+        if (desc->primary(sc))
         {
             const int groupsize = desc->groupsize(sc);
 
@@ -535,7 +538,7 @@ StateData::FillBoundary (Box const& bx,
         const int dc = dest_comp+i;
         const int sc = src_comp+i;
 
-        if (desc->master(sc))
+        if (desc->primary(sc))
         {
             const int groupsize = desc->groupsize(sc);
 
diff --git a/Src/Amr/AMReX_StateDescriptor.H b/Src/Amr/AMReX_StateDescriptor.H
index 2877e0493da..54417adc5ee 100644
--- a/Src/Amr/AMReX_StateDescriptor.H
+++ b/Src/Amr/AMReX_StateDescriptor.H
@@ -219,7 +219,7 @@ public:
     * \param bc
     * \param func
     * \param interp
-    * \param master_or_slave
+    * \param primary_or_secondary
     * \param groupsize
     */
     void setComponent (int comp,
@@ -227,7 +227,7 @@
                        const BCRec&      bc,
                        const BndryFunc&  func,
                        Interpolater*     interp,
-                       bool              master_or_slave,
+                       bool              primary_or_secondary,
                        int               groupsize);
 
     /**
@@ -380,7 +380,7 @@ public:
     */
     bool store_in_checkpoint () const noexcept;
 
-    bool master (int i) const noexcept { return m_master[i]; }
+    bool primary (int i) const noexcept { return m_primary[i]; }
 
     int groupsize (int i) const noexcept { return m_groupsize[i]; }
 
@@ -412,8 +412,8 @@ private:
     Vector<std::string> names;       //!< Printable names of components
     Vector<BCRec>       bc;          //!< Array of bndry types for entire level
     Vector<std::unique_ptr<BndryFunc> > bc_func; //!< Array of pointers to bndry fill functions
-    Vector<int>         m_master;    //!< Are we a master or slave? (true or false)
-    Vector<int>         m_groupsize; //!< Groupsize if we're a master
+    Vector<int>         m_primary;   //!< Are we a primary or secondary? (true or false)
+    Vector<int>         m_groupsize; //!< Groupsize if we're a primary
 
     /**
     * \brief If mapper_comp[icomp] != 0, that map is used instead of mapper
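The `primary`/`groupsize` bookkeeping renamed above is easiest to see on a grouped state such as velocity: `DescriptorList::setComponent` (in the next file) flags only the first member of the group as primary while every member records the group's size, and `StateData::FillBoundary` then issues one grouped fill per primary component. A plain C++ mock-up of that bookkeeping, not the real `StateDescriptor` API:

```cpp
// Mock-up of the primary/groupsize convention: first component of a group
// is primary; groupsize is recorded for every member. The component names
// are illustrative.
#include <cstdio>
#include <vector>

int main ()
{
    const std::vector<const char*> names = {"xvel", "yvel", "zvel"};
    std::vector<int> primary(names.size()), groupsize(names.size());

    for (std::size_t i = 0; i < names.size(); ++i) {
        primary[i]   = (i == 0);          // is_primary = (i == 0)
        groupsize[i] = int(names.size()); // every member records the size
    }

    // a boundary fill visits each component but acts only on the primary
    for (std::size_t i = 0; i < names.size(); ++i) {
        if (primary[i]) {
            std::printf("fill %d components as one group starting at %s\n",
                        groupsize[i], names[i]);
        }
    }
}
```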
diff --git a/Src/Amr/AMReX_StateDescriptor.cpp b/Src/Amr/AMReX_StateDescriptor.cpp
index a978b039def..fb62b4d7305 100644
--- a/Src/Amr/AMReX_StateDescriptor.cpp
+++ b/Src/Amr/AMReX_StateDescriptor.cpp
@@ -20,8 +20,8 @@ StateDescriptor::BndryFunc::clone () const
 
 StateDescriptor::BndryFunc::~BndryFunc () {}
 
 bool
-StateDescriptor::bf_thread_safety (const int* lo,const int* hi,
-                                   const int* dom_lo, const int* dom_hi,
+StateDescriptor::bf_thread_safety (const int* /*lo*/,const int* /*hi*/,
+                                   const int* /*dom_lo*/, const int* /*dom_hi*/,
                                    const int* bc, int ng)
 {
     bool thread_safe = true;
@@ -39,26 +39,26 @@ void
 StateDescriptor::BndryFunc::operator () (Real* data,const int* lo,const int* hi,
                                          const int* dom_lo, const int* dom_hi,
                                          const Real* dx, const Real* grd_lo,
-                                         const Real* time, const int* bc) const
+                                         const Real* time, const int* a_bc) const
 {
     BL_ASSERT(m_func != 0 || m_func3D != 0);
 
-    bool thread_safe = bf_thread_safety(lo, hi, dom_lo, dom_hi, bc, 1);
+    bool thread_safe = bf_thread_safety(lo, hi, dom_lo, dom_hi, a_bc, 1);
     if (thread_safe) {
        if (m_func != 0)
-           m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,bc);
+           m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc);
        else
            m_func3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi),
-                    AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,bc);
+                    AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc);
     } else {
 #ifdef _OPENMP
 #pragma omp critical (bndryfunc)
 #endif
        if (m_func != 0)
-           m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,bc);
+           m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc);
        else
            m_func3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi),
-                    AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,bc);
+                    AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc);
     }
 }
 
@@ -66,26 +66,26 @@ void
 StateDescriptor::BndryFunc::operator () (Real* data,const int* lo,const int* hi,
                                          const int* dom_lo, const int* dom_hi,
                                          const Real* dx, const Real* grd_lo,
-                                         const Real* time, const int* bc, int ng) const
+                                         const Real* time, const int* a_bc, int ng) const
 {
     BL_ASSERT(m_gfunc != 0 || m_gfunc3D != 0);
 
-    bool thread_safe = bf_thread_safety(lo, hi, dom_lo, dom_hi, bc, ng);
+    bool thread_safe = bf_thread_safety(lo, hi, dom_lo, dom_hi, a_bc, ng);
     if (thread_safe) {
        if (m_gfunc != 0)
-           m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,bc);
+           m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc);
        else
            m_gfunc3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi),
-                     AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,bc);
+                     AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc);
     } else {
 #ifdef _OPENMP
 #pragma omp critical (bndryfunc)
 #endif
        if (m_gfunc != 0)
-           m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,bc);
+           m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc);
        else
            m_gfunc3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi),
-                     AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,bc);
+                     AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc);
     }
 }
@@ -147,9 +147,9 @@ DescriptorList::setComponent (int indx,
 {
     for (int i = 0; i < nm.size(); i++)
     {
-        const bool master = (i == 0) ? true : false;
+        const bool is_primary = (i == 0) ? true : false;
 
-        desc[indx]->setComponent(comp+i,nm[i],bc[i],func,interp,master,nm.size());
+        desc[indx]->setComponent(comp+i,nm[i],bc[i],func,interp,is_primary,nm.size());
     }
 }
@@ -210,7 +210,7 @@ StateDescriptor::StateDescriptor (IndexType btyp,
     bc.resize(num_comp);
     bc_func.resize(num_comp);
     mapper_comp.resize(num_comp);
-    m_master.resize(num_comp);
+    m_primary.resize(num_comp);
     m_groupsize.resize(num_comp);
     max_map_start_comp.resize(num_comp);
     min_map_end_comp.resize(num_comp);
@@ -336,7 +336,7 @@ StateDescriptor::define (IndexType btyp,
     bc.resize(num_comp);
     bc_func.resize(num_comp);
     mapper_comp.resize(num_comp);
-    m_master.resize(num_comp);
+    m_primary.resize(num_comp);
     m_groupsize.resize(num_comp);
     max_map_start_comp.resize(num_comp);
     min_map_end_comp.resize(num_comp);
@@ -356,7 +356,7 @@ StateDescriptor::setComponent (int comp,
     names[comp]       = nm;
     bc[comp]          = bcr;
     mapper_comp[comp] = a_interp;
-    m_master[comp]    = false;
+    m_primary[comp]   = false;
     m_groupsize[comp] = 0;
 
     if (max_map_start_comp_>=0 && min_map_end_comp_>=0)
@@ -381,12 +381,12 @@ StateDescriptor::setComponent (int comp,
                                const BCRec&     bcr,
                                const StateDescriptor::BndryFunc& func,
                                Interpolater*    a_interp,
-                               bool             a_master,
+                               bool             a_primary,
                                int              a_groupsize)
 {
     setComponent(comp,nm,bcr,func,a_interp,-1,-1);
 
-    m_master[comp]    = a_master;
+    m_primary[comp]   = a_primary;
     m_groupsize[comp] = a_groupsize;
 }
diff --git a/Src/Amr/AMReX_extrapolater_1D_K.H b/Src/Amr/AMReX_extrapolater_1D_K.H
new file mode 100644
index 00000000000..5e939275156
--- /dev/null
+++ b/Src/Amr/AMReX_extrapolater_1D_K.H
@@ -0,0 +1,55 @@
+#ifndef AMReX_extrapolater_1D_K_H_
+#define AMReX_extrapolater_1D_K_H_
+
+namespace amrex {
+
+AMREX_GPU_HOST_DEVICE
+AMREX_FORCE_INLINE
+void
+amrex_first_order_extrap_cpu(amrex::Box const& bx,
+                             int nComp,
+                             amrex::Array4<const int> const& mask,
+                             amrex::Array4<amrex::Real> const& data) noexcept
+{
+   constexpr int crsecell = 0;
+
+   const auto lo = amrex::lbound(bx);
+   const auto hi = amrex::ubound(bx);
+
+   if (mask(lo.x-1,lo.y,lo.z) == crsecell) {
+      for (int n = 0; n < nComp; n++) {
+         data(lo.x-1,lo.y,lo.z,n) = data(lo.x,lo.y,lo.z,n);
+      }
+   }
+   if (mask(hi.x+1,hi.y,hi.z) == crsecell) {
+      for (int n = 0; n < nComp; n++) {
+         data(hi.x+1,hi.y,hi.z,n) = data(hi.x,hi.y,hi.z,n);
+      }
+   }
+
+}
+
+AMREX_GPU_HOST_DEVICE
+AMREX_FORCE_INLINE
+void
+amrex_first_order_extrap_gpu(int i, int j, int k, int n,
+                             amrex::Box const& bx,
+                             amrex::Array4<const int> const& mask,
+                             amrex::Array4<amrex::Real> const& data) noexcept
+{
+   constexpr int crsecell = 0;
+
+   const auto lo = amrex::lbound(bx);
+   const auto hi = amrex::ubound(bx);
+
+   if ( (i == lo.x-1) && ( mask(i,j,k) == crsecell ) ) {
+      data(i,j,k,n) = data(i+1,j,k,n);
+   }
+
+   if ( (i == hi.x+1) && ( mask(i,j,k) == crsecell ) ) {
+      data(i,j,k,n) = data(i-1,j,k,n);
+   }
+}
+
+}
+#endif
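One detail worth calling out before the 2-D and 3-D kernels that follow: every neighbor in their averages is weighted by its mask value, so coarse neighbors (mask == 0) drop out of both the numerator and the denominator, and the earlier zeroing pass guarantees the weighted terms stay finite. A tiny standalone demonstration of that weighting trick (made-up values, plain C++):

```cpp
// Mask-weighted average: the coarse neighbor's value (999 here) cannot
// contaminate the result because its mask weight is zero.
#include <cstdio>

int main ()
{
    int    m[3] = {1, 0, 1};               // 1 = fine neighbor, 0 = coarse
    double u[3] = {10.0, 999.0, 30.0};

    double num = 0.0, den = 0.0;
    for (int s = 0; s < 3; ++s) {
        num += m[s] * u[s];                // coarse term contributes 0
        den += m[s];                       // and is not counted
    }
    std::printf("extrapolated value = %g\n", num / den); // (10+30)/2 = 20
}
```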
must be consistent with Extrapolater.H - integer, parameter :: crsecell = 0 - -contains - - subroutine amrex_first_order_extrap (u, ulo, uhi, nu, msk, mlo, mhi, lo, hi, sc, nc) & - bind(c,name='amrex_first_order_extrap') - - integer, intent(in) :: ulo(1), uhi(1), nu, mlo(1), mhi(1), lo(1), hi(1), sc, nc - real(amrex_real), intent(inout) :: u(ulo(1):uhi(1),0:nu-1) - integer , intent(in) :: msk(mlo(1):mhi(1)) - - integer :: n - - do n = sc, sc+nc-1 - if (msk(lo(1)-1) .eq. crsecell) then - u(lo(1)-1,n) = u(lo(1),n) - end if - - if (msk(hi(1)+1) .eq. crsecell) then - u(hi(1)+1,n) = u(hi(1),n) - end if - end do - - end subroutine amrex_first_order_extrap - -end module amrex_extrapolater diff --git a/Src/Amr/AMReX_extrapolater_2D_K.H b/Src/Amr/AMReX_extrapolater_2D_K.H new file mode 100644 index 00000000000..be1d30ec6b1 --- /dev/null +++ b/Src/Amr/AMReX_extrapolater_2D_K.H @@ -0,0 +1,210 @@ +#ifndef AMReX_extrapolater_2D_K_H_ +#define AMReX_extrapolater_2D_K_H_ + +namespace amrex { + +AMREX_GPU_HOST +AMREX_FORCE_INLINE +void +amrex_first_order_extrap_cpu(amrex::Box const& bx, + int nComp, + amrex::Array4 const& mask, + amrex::Array4 const& data) noexcept +{ + using namespace amrex::literals; + + constexpr int finecell = 1; + constexpr int crsecell = 0; + + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + int k = lo.z; + + for (int n = 0; n < nComp; n++) { + + // set all crse cells to zero first + for (int j = lo.y-1; j <= hi.y+1; ++j) { + for (int i = lo.x-1; i <= hi.x+1; ++i) { + if (mask(i,j,k) == crsecell) data(i,j,k,n) = Real(0.0); + } + } + + // Corners + // xlo, ylo + { + int i = lo.x-1; + int j = lo.y-1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) ); + } else { + data(i,j,k,n) = data(i+1,j+1,k,n); + } + } + } + // xlo, yhi + { + int i = lo.x-1; + int j = hi.y+1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + mask(i,j-1,k) * data(i,j-1,k,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) ); + } else { + data(i,j,k,n) = data(i+1,j-1,k,n); + } + } + } + // xhi, ylo + { + int i = hi.x+1; + int j = lo.y-1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) ); + } else { + data(i,j,k,n) = data(i-1,j+1,k,n); + } + } + } + // xhi, yhi + { + int i = hi.x+1; + int j = hi.y+1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i,j-1,k) * data(i,j-1,k,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) ); + } else { + data(i,j,k,n) = data(i-1,j-1,k,n); + } + } + } + // Edges + // xlo, y-valid + { + int i = lo.x-1; + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i,j-1,k) * data(i,j-1,k,n) + data(i+1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i,j-1,k) + 1 + mask(i,j+1,k) ); + } + } + } + // xhi, y-valid + { + int i = hi.x+1; + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i,j-1,k) * data(i,j-1,k,n) + data(i-1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i,j-1,k) + 1 + 
mask(i,j+1,k) ); + } + } + } + // x-valid, ylo + { + int j = lo.y-1; + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i+1,j,k) * data(i+1,j,k,n) + data(i,j+1,k,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1 ); + } + } + } + // x-valid, yhi + { + int j = hi.y+1; + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i+1,j,k) * data(i+1,j,k,n) + data(i,j-1,k,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1 ); + } + } + } + } +} + +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +void +amrex_first_order_extrap_gpu(int i, int j, int k, int n, + amrex::Box const& bx, + amrex::Array4 const& mask, + amrex::Array4 const& data) noexcept +{ + using namespace amrex::literals; + + constexpr int finecell = 1; + constexpr int crsecell = 0; + + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + if ( mask(i,j,k) == crsecell ) { + // Corners + // xlo, ylo + if ( (i == lo.x-1) && (j == lo.y-1) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) ); + } else { + data(i,j,k,n) = data(i+1,j+1,k,n); + } + // xlo, yhi + } else if ( (i == lo.x-1) && (j == hi.y+1) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + mask(i,j-1,k) * data(i,j-1,k,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) ); + } else { + data(i,j,k,n) = data(i+1,j-1,k,n); + } + // xhi, ylo + } else if ( (i == hi.x+1) && (j == lo.y-1) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) ); + } else { + data(i,j,k,n) = data(i-1,j+1,k,n); + } + // xhi, yhi + } else if ( (i == hi.x+1) && (j == hi.y+1) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i,j-1,k) * data(i,j-1,k,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) ); + } else { + data(i,j,k,n) = data(i-1,j-1,k,n); + } + // Edges + // xlo, y-valid + } else if ( (i == lo.x-1) && (j >= lo.y) && (j <= hi.y) ) { + data(i,j,k,n) = ( mask(i,j-1,k) * data(i,j-1,k,n) + data(i+1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i,j-1,k) + 1 + mask(i,j+1,k) ); + // xhi, y-valid + } else if ( (i == hi.x+1) && (j >= lo.y) && (j <= hi.y) ) { + data(i,j,k,n) = ( mask(i,j-1,k) * data(i,j-1,k,n) + data(i-1,j,k,n) + mask(i,j+1,k) * data(i,j+1,k,n) ) + / ( mask(i,j-1,k) + 1 + mask(i,j+1,k) ); + // x-valid, ylo + } else if ( (i >= lo.x) && (i <= hi.x) && (j == lo.y-1) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i+1,j,k) * data(i+1,j,k,n) + data(i,j+1,k,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1 ); + // x-valid, yhi + } else if ( (i >= lo.x) && (i <= hi.x) && (j == hi.y+1) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + mask(i+1,j,k) * data(i+1,j,k,n) + data(i,j-1,k,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1 ); + } + } +} + +} +#endif diff --git a/Src/Amr/AMReX_extrapolater_2d.f90 b/Src/Amr/AMReX_extrapolater_2d.f90 deleted file mode 100644 index 382c9eedb56..00000000000 --- a/Src/Amr/AMReX_extrapolater_2d.f90 +++ /dev/null @@ -1,119 +0,0 @@ -module amrex_extrapolater - - use amrex_fort_module, only : amrex_real - - implicit none - integer, parameter :: 
finecell = 1 ! must be consistent with Extrapolater.H - integer, parameter :: crsecell = 0 - - ! The value of msk is either 0 or 1. - -contains - - subroutine amrex_first_order_extrap (u, ulo, uhi, nu, msk, mlo, mhi, lo, hi, sc, nc) & - bind(c,name='amrex_first_order_extrap') - - integer, intent(in) :: ulo(2), uhi(2), nu, mlo(2), mhi(2), lo(2), hi(2), sc, nc - real(amrex_real), intent(inout) :: u(ulo(1):uhi(1),ulo(2):uhi(2),0:nu-1) - integer , intent(in) :: msk(mlo(1):mhi(1),mlo(2):mhi(2)) - - integer :: i, j, n - - do n = sc, sc+nc-1 - ! set all crse cells to zero first - do j = lo(2)-1, hi(2)+1 - do i = lo(1)-1, hi(1)+1 - if (msk(i,j) .eq. crsecell) then - u(i,j,n) = 0.d0 - end if - end do - end do - - ! ylo, xlo - j = lo(2)-1 - i = lo(1)-1 - if (msk(i,j) .eq. crsecell) then - if (msk(i,j+1) .eq. finecell .or. msk(i+1,j) .eq. finecell) then - u(i,j,n) = (msk(i,j+1)*u(i,j+1,n) + msk(i+1,j)*u(i+1,j,n)) & - & / (msk(i,j+1) + msk(i+1,j) ) - else - u(i,j,n) = u(i+1,j+1,n) - end if - end if - - ! ylo, x-valid - j = lo(2)-1 - do i = lo(1), hi(1) - if (msk(i,j) .eq. crsecell) then - u(i,j,n) = (msk(i-1,j)*u(i-1,j,n)+msk(i+1,j)*u(i+1,j,n)+u(i,j+1,n)) & - & / (msk(i-1,j) +msk(i+1,j) +1 ) - end if - end do - - ! ylo, xhi - j = lo(2)-1 - i = hi(1)+1 - if (msk(i,j) .eq. crsecell) then - if (msk(i-1,j).eq.finecell .or. msk(i,j+1).eq.finecell) then - u(i,j,n) = (msk(i-1,j)*u(i-1,j,n)+msk(i,j+1)*u(i,j+1,n)) & - & / (msk(i-1,j) +msk(i,j+1)) - else - u(i,j,n) = u(i-1,j+1,n) - end if - end if - - ! y-valid, xlo - i = lo(1)-1 - do j = lo(2), hi(2) - if (msk(i,j) .eq. crsecell) then - u(i,j,n) = (msk(i,j-1)*u(i,j-1,n)+u(i+1,j,n)+msk(i,j+1)*u(i,j+1,n)) & - & / (msk(i,j-1) +1 +msk(i,j+1) ) - end if - end do - - ! y-valid, xhi - i = hi(1)+1 - do j = lo(2), hi(2) - if (msk(i,j) .eq. crsecell) then - u(i,j,n) = (msk(i,j-1)*u(i,j-1,n)+u(i-1,j,n)+msk(i,j+1)*u(i,j+1,n)) & - & / (msk(i,j-1) +1 +msk(i,j+1) ) - end if - end do - - ! yhi, xlo - j = hi(2)+1 - i = lo(1)-1 - if (msk(i,j) .eq. crsecell) then - if (msk(i,j-1).eq.finecell .or. msk(i+1,j).eq.finecell) then - u(i,j,n) = (msk(i,j-1)*u(i,j-1,n)+msk(i+1,j)*u(i+1,j,n)) & - & / (msk(i,j-1) +msk(i+1,j) ) - else - u(i,j,n) = u(i+1,j-1,n) - end if - end if - - ! yhi, xvalid - j = hi(2)+1 - do i = lo(1), hi(1) - if (msk(i,j) .eq. crsecell) then - u(i,j,n) = (u(i,j-1,n)+msk(i-1,j)*u(i-1,j,n)+msk(i+1,j)*u(i+1,j,n)) & - & / (1 +msk(i-1,j) +msk(i+1,j) ) - end if - end do - - ! yhi, xhi - i = hi(1)+1 - j = hi(2)+1 - if (msk(i,j) .eq. crsecell) then - if (msk(i-1,j).eq.finecell .or. 
msk(i,j-1).eq.finecell) then - u(i,j,n) = (msk(i-1,j)*u(i-1,j,n)+msk(i,j-1)*u(i,j-1,n)) & - & / (msk(i-1,j) +msk(i,j-1) ) - else - u(i,j,n) = u(i-1,j-1,n) - end if - end if - end do - - end subroutine amrex_first_order_extrap - -end module amrex_extrapolater diff --git a/Src/Amr/AMReX_extrapolater_3D_K.H b/Src/Amr/AMReX_extrapolater_3D_K.H new file mode 100644 index 00000000000..4dd3be07219 --- /dev/null +++ b/Src/Amr/AMReX_extrapolater_3D_K.H @@ -0,0 +1,1124 @@ +#ifndef AMReX_extrapolater_3D_K_H_ +#define AMReX_extrapolater_3D_K_H_ + +namespace amrex { + +AMREX_GPU_HOST +AMREX_FORCE_INLINE +void +amrex_first_order_extrap_cpu(amrex::Box const& bx, + int nComp, + amrex::Array4 const& mask, + amrex::Array4 const& data) noexcept +{ + using namespace amrex::literals; + + constexpr int finecell = 1; + constexpr int crsecell = 0; + + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + for (int n = 0; n < nComp; n++) { + + // set all crse cells to zero first + for (int k = lo.z-1; k <= hi.z+1; ++k) { + for (int j = lo.y-1; j <= hi.y+1; ++j) { + for (int i = lo.x-1; i <= hi.x+1; ++i) { + if (mask(i,j,k) == crsecell) data(i,j,k,n) = Real(0.0); + } + } + } + + // Corners + // xlo, ylo, zlo + { + int i = lo.x-1; + int j = lo.y-1; + int k = lo.z-1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i+1,j+1,k) == finecell ) || + ( mask(i+1,j,k+1) == finecell ) || + ( mask(i,j+1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i+1,j+1,k) + mask(i+1,j,k+1) + mask(i,j+1,k+1) ); + } else { + data(i,j,k,n) = data(i+1,j+1,k+1,n); + } + } + } + // xlo, ylo, zhi + { + int i = lo.x-1; + int j = lo.y-1; + int k = hi.z+1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i+1,j+1,k) == finecell ) || + ( mask(i+1,j,k-1) == finecell ) || + ( mask(i,j+1,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i+1,j+1,k) + mask(i+1,j,k-1) + mask(i,j+1,k-1) ); + } else { + data(i,j,k,n) = data(i+1,j+1,k-1,n); + } + } + } + // xlo, yhi, zlo + { + int i = lo.x-1; + int j = hi.y+1; + int k = lo.z-1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i+1,j-1,k) == finecell ) || + ( mask(i+1,j,k+1) == finecell ) || + ( mask(i,j-1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j,k+1) + mask(i,j-1,k+1) ); + } else { + data(i,j,k,n) = 
data(i+1,j-1,k+1,n); + } + } + } + // xlo, yhi, zhi + { + int i = lo.x-1; + int j = hi.y+1; + int k = hi.z+1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i+1,j-1,k) == finecell ) || + ( mask(i+1,j,k-1) == finecell ) || + ( mask(i,j-1,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j,k-1) + mask(i,j-1,k-1) ); + } else { + data(i,j,k,n) = data(i+1,j-1,k-1,n); + } + } + } + // xhi, ylo, zlo + { + int i = hi.x+1; + int j = lo.y-1; + int k = lo.z-1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i-1,j+1,k) == finecell ) || + ( mask(i-1,j,k+1) == finecell ) || + ( mask(i,j+1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i-1,j+1,k) + mask(i-1,j,k+1) + mask(i,j+1,k+1) ); + } else { + data(i,j,k,n) = data(i-1,j+1,k+1,n); + } + } + } + // xhi, ylo, zhi + { + int i = hi.x+1; + int j = lo.y-1; + int k = hi.z+1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i-1,j+1,k) == finecell ) || + ( mask(i-1,j,k-1) == finecell ) || + ( mask(i,j+1,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i-1,j+1,k) + mask(i-1,j,k-1) + mask(i,j+1,k-1) ); + } else { + data(i,j,k,n) = data(i-1,j+1,k-1,n); + } + } + } + // xhi, yhi, zlo + { + int i = hi.x+1; + int j = hi.y+1; + int k = lo.z-1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i-1,j-1,k) == finecell ) || + ( mask(i-1,j,k+1) == finecell ) || + ( mask(i,j-1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j,k+1) + mask(i,j-1,k+1) ); + } else { + data(i,j,k,n) = data(i-1,j-1,k+1,n); + } + } + } + // xhi, yhi, zhi + { + int i = hi.x+1; + int j = hi.y+1; + int k = hi.z+1; + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( 
mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i-1,j-1,k) == finecell ) || + ( mask(i-1,j,k-1) == finecell ) || + ( mask(i,j-1,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j,k-1) + mask(i,j-1,k-1) ); + } else { + data(i,j,k,n) = data(i-1,j-1,k-1,n); + } + } + } + // Edges + // xlo, ylo, z-valid + { + int i = lo.x-1; + int j = lo.y-1; + for (int k = lo.z; k <= hi.z; ++k) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i+1,j+1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( 1.0 + mask(i+1,j,k-1) + mask(i+1,j,k+1) + mask(i,j+1,k-1) + mask(i,j+1,k+1) ); + } + } + } + } + // xlo, yhi, z-valid + { + int i = lo.x-1; + int j = hi.y+1; + for (int k = lo.z; k <= hi.z; ++k) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i+1,j-1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( 1.0 + mask(i+1,j,k-1) + mask(i+1,j,k+1) + mask(i,j-1,k-1) + mask(i,j-1,k+1) ); + } + } + } + } + // xhi, ylo, z-valid + { + int i = hi.x+1; + int j = lo.y-1; + for (int k = lo.z; k <= hi.z; ++k) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i-1,j+1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( 1.0 + mask(i-1,j,k-1) + mask(i-1,j,k+1) + mask(i,j+1,k-1) + mask(i,j+1,k+1) ); + } + } + } + } + // xhi, yhi, z-valid + { + int i = hi.x+1; + int j = hi.y+1; + for (int k = lo.z; k <= hi.z; ++k) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i-1,j-1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i-1,j,k+1) 
* data(i-1,j,k+1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( 1.0 + mask(i-1,j,k-1) + mask(i-1,j,k+1) + mask(i,j-1,k-1) + mask(i,j-1,k+1) ); + } + } + } + } + // xlo, y-valid, zlo + { + int i = lo.x-1; + int k = lo.z-1; + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + data(i+1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j+1,k) + 1.0 + mask(i,j-1,k+1) + mask(i,j+1,k+1) ); + } + } + } + } + // xlo, y-valid, zhi + { + int i = lo.x-1; + int k = hi.z+1; + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + data(i+1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j+1,k) + 1.0 + mask(i,j-1,k-1) + mask(i,j+1,k-1) ); + } + } + } + } + // xhi, y-valid, zlo + { + int i = hi.x+1; + int k = lo.z-1; + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + data(i-1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j+1,k) + 1.0 + mask(i,j-1,k+1) + mask(i,j+1,k+1) ); + } + } + } + } + // xhi, y-valid, zhi + { + int i = hi.x+1; + int k = hi.z+1; + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + data(i-1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j+1,k) + 1.0 + mask(i,j-1,k-1) + mask(i,j+1,k-1) ); + } + } + } + } + // x-valid, ylo, zlo + { + int j = 
lo.y-1; + int k = lo.z-1; + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + data(i,j+1,k+1,n) ) + / ( mask(i-1,j+1,k) + mask(i+1,j+1,k) + mask(i-1,j,k+1) + mask(i+1,j,k+1) + 1.0 ); + } + } + } + } + // x-valid, ylo, zhi + { + int j = lo.y-1; + int k = hi.z+1; + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + data(i,j+1,k-1,n) ) + / ( mask(i-1,j+1,k) + mask(i+1,j+1,k) + mask(i-1,j,k-1) + mask(i+1,j,k-1) + 1.0 ); + } + } + } + } + // x-valid, yhi, zlo + { + int j = hi.y+1; + int k = lo.z-1; + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + data(i,j-1,k+1,n) ) + / ( mask(i-1,j-1,k) + mask(i+1,j-1,k) + mask(i-1,j,k+1) + mask(i+1,j,k+1) + 1.0 ); + } + } + } + } + // x-valid, yhi, zhi + { + int j = hi.y+1; + int k = hi.z+1; + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + data(i,j-1,k-1,n) ) + / ( mask(i-1,j-1,k) + mask(i+1,j-1,k) + mask(i-1,j,k-1) + mask(i+1,j,k-1) + 1.0 ); + } + } + } + } + // Faces + // xlo, y-valid, z-valid + { + int i = lo.x-1; + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * 
data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( 1.0 + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } + } + } + } + // xhi, y-valid, z-valid + { + int i = hi.x+1; + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( 1.0 + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } + } + } + } + // x-valid, ylo, z-valid + { + int j = lo.y-1; + for (int k = lo.z; k <= hi.z; ++k) { + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1.0 + mask(i,j,k-1) + mask(i,j,k+1) ); + } + } + } + } + // x-valid, yhi, z-valid + { + int j = hi.y+1; + for (int k = lo.z; k <= hi.z; ++k) { + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1.0 + mask(i,j,k-1) + mask(i,j,k+1) ); + } + } + } + } + // x-valid, y-valid, zlo + { + int k = lo.z-1; + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + 1.0 ); + } + } + } + } + // x-valid, y-valid, zhi + { + int k = hi.z+1; + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + if ( mask(i,j,k) == crsecell ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + 1.0 ); + } + } + } + } + } +} + +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +void +amrex_first_order_extrap_gpu(int i, int j, int k, int n, + amrex::Box const& bx, + amrex::Array4 const& mask, + amrex::Array4 const& data) noexcept +{ + using namespace amrex::literals; + + constexpr int finecell = 1; + constexpr int crsecell = 0; + + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + if ( mask(i,j,k) == crsecell ) { + // Corners + // xlo, ylo, zlo + if ( ( i == lo.x-1) && ( j == lo.y-1 ) && ( k == lo.z-1 ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i+1,j+1,k) == finecell ) || + ( mask(i+1,j,k+1) == finecell ) || + ( mask(i,j+1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i+1,j+1,k) + mask(i+1,j,k+1) + mask(i,j+1,k+1) ); + } else { + data(i,j,k,n) = data(i+1,j+1,k+1,n); + } + 
// xlo, ylo, zhi + } else if ( ( i == lo.x-1) && ( j == lo.y-1 ) && ( k == hi.z+1 ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i+1,j+1,k) == finecell ) || + ( mask(i+1,j,k-1) == finecell ) || + ( mask(i,j+1,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i+1,j+1,k) + mask(i+1,j,k-1) + mask(i,j+1,k-1) ); + } else { + data(i,j,k,n) = data(i+1,j+1,k-1,n); + } + // xlo, yhi, zlo + } else if ( ( i == lo.x-1) && ( j == hi.y+1 ) && ( k == lo.z-1 ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i+1,j-1,k) == finecell ) || + ( mask(i+1,j,k+1) == finecell ) || + ( mask(i,j-1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j,k+1) + mask(i,j-1,k+1) ); + } else { + data(i,j,k,n) = data(i+1,j-1,k+1,n); + } + // xlo, yhi, zhi + } else if ( ( i == lo.x-1) && ( j == hi.y+1 ) && ( k == hi.z+1 ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i+1,j-1,k) == finecell ) || + ( mask(i+1,j,k-1) == finecell ) || + ( mask(i,j-1,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j,k-1) + mask(i,j-1,k-1) ); + } else { + data(i,j,k,n) = data(i+1,j-1,k-1,n); + } + // xhi, ylo, zlo + } else if ( ( i == hi.x+1) && ( j == lo.y-1 ) && ( k == lo.z-1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i-1,j+1,k) == finecell ) || + ( mask(i-1,j,k+1) == finecell ) || + ( mask(i,j+1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i-1,j+1,k) + mask(i-1,j,k+1) + mask(i,j+1,k+1) ); + } else { + data(i,j,k,n) = data(i-1,j+1,k+1,n); + } + // xhi, ylo, zhi + } else if ( ( i == hi.x+1) && ( j == lo.y-1 ) && ( k == hi.z+1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i-1,j+1,k) == finecell ) || + ( mask(i-1,j,k-1) == finecell ) || + ( mask(i,j+1,k-1) == finecell ) ) { + 
data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i-1,j+1,k) + mask(i-1,j,k-1) + mask(i,j+1,k-1) ); + } else { + data(i,j,k,n) = data(i-1,j+1,k-1,n); + } + // xhi, yhi, zlo + } else if ( ( i == hi.x+1) && ( j == hi.y+1 ) && ( k == lo.z-1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j,k+1) ); + } else if ( ( mask(i-1,j-1,k) == finecell ) || + ( mask(i-1,j,k+1) == finecell ) || + ( mask(i,j-1,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j,k+1) + mask(i,j-1,k+1) ); + } else { + data(i,j,k,n) = data(i-1,j-1,k+1,n); + } + // xhi, yhi, zhi + } else if ( ( i == hi.x+1) && ( j == hi.y+1 ) && ( k == hi.z+1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) ); + } else if ( ( mask(i-1,j-1,k) == finecell ) || + ( mask(i-1,j,k-1) == finecell ) || + ( mask(i,j-1,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j,k-1) + mask(i,j-1,k-1) ); + } else { + data(i,j,k,n) = data(i-1,j-1,k-1,n); + } + // Edges + // xlo, ylo, z-valid + } else if ( ( i == lo.x-1) && ( j == lo.y-1 ) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i+1,j+1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( 1.0 + mask(i+1,j,k-1) + mask(i+1,j,k+1) + mask(i,j+1,k-1) + mask(i,j+1,k+1) ); + } + // xlo, yhi, z-valid + } else if ( ( i == lo.x-1) && ( j == hi.y+1 ) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i+1,j-1,k,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( 1.0 + mask(i+1,j,k-1) + mask(i+1,j,k+1) + mask(i,j-1,k-1) + mask(i,j-1,k+1) ); + } + // xhi, ylo, z-valid + } else if ( ( i == hi.x+1) && ( j == lo.y-1 ) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( 
mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i-1,j+1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( 1.0 + mask(i-1,j,k-1) + mask(i-1,j,k+1) + mask(i,j+1,k-1) + mask(i,j+1,k+1) ); + } + // xhi, yhi, z-valid + } else if ( ( i == hi.x+1) && ( j == hi.y+1 ) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( data(i-1,j-1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) ) + / ( 1.0 + mask(i-1,j,k-1) + mask(i-1,j,k+1) + mask(i,j-1,k-1) + mask(i,j-1,k+1) ); + } + // xlo, y-valid, zlo + } else if ( ( i == lo.x-1) && ( j >= lo.y ) && + ( j <= hi.y ) && ( k == lo.z-1 ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + data(i+1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j+1,k) + 1.0 + mask(i,j-1,k+1) + mask(i,j+1,k+1) ); + } + // xlo, y-valid, zhi + } else if ( ( i == lo.x-1) && ( j >= lo.y ) && + ( j <= hi.y ) && ( k == hi.z+1 ) ) { + if ( ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + data(i+1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i+1,j-1,k) + mask(i+1,j+1,k) + 1.0 + mask(i,j-1,k-1) + mask(i,j+1,k-1) ); + } + // xhi, y-valid, zlo + } else if ( ( i == hi.x+1) && ( j >= lo.y ) && + ( j <= hi.y ) && ( k == lo.z-1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j+1,k) * 
data(i-1,j+1,k,n) + + data(i-1,j,k+1,n) + + mask(i,j-1,k+1) * data(i,j-1,k+1,n) + + mask(i,j+1,k+1) * data(i,j+1,k+1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j+1,k) + 1.0 + mask(i,j-1,k+1) + mask(i,j+1,k+1) ); + } + // xhi, y-valid, zhi + } else if ( ( i == hi.x+1) && ( j >= lo.y ) && + ( j <= hi.y ) && ( k == hi.z+1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + data(i-1,j,k-1,n) + + mask(i,j-1,k-1) * data(i,j-1,k-1,n) + + mask(i,j+1,k-1) * data(i,j+1,k-1,n) ) + / ( mask(i-1,j-1,k) + mask(i-1,j+1,k) + 1.0 + mask(i,j-1,k-1) + mask(i,j+1,k-1) ); + } + // x-valid, ylo, zlo + } else if ( ( i >= lo.x) && ( i <= hi.x ) && + ( j == lo.y-1 ) && ( k == lo.z-1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + data(i,j+1,k+1,n) ) + / ( mask(i-1,j+1,k) + mask(i+1,j+1,k) + mask(i-1,j,k+1) + mask(i+1,j,k+1) + 1.0 ); + } + // x-valid, ylo, zhi + } else if ( ( i >= lo.x) && ( i <= hi.x ) && + ( j == lo.y-1 ) && ( k == hi.z+1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) || + ( mask(i,j+1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j+1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j+1,k) * data(i-1,j+1,k,n) + + mask(i+1,j+1,k) * data(i+1,j+1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + data(i,j+1,k-1,n) ) + / ( mask(i-1,j+1,k) + mask(i+1,j+1,k) + mask(i-1,j,k-1) + mask(i+1,j,k-1) + 1.0 ); + } + // x-valid, yhi, zlo + } else if ( ( i >= lo.x) && ( i <= hi.x ) && + ( j == hi.y+1 ) && ( k == lo.z-1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) || + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k+1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k+1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i-1,j,k+1) * data(i-1,j,k+1,n) + + mask(i+1,j,k+1) * data(i+1,j,k+1,n) + + data(i,j-1,k+1,n) ) + / ( mask(i-1,j-1,k) + mask(i+1,j-1,k) + mask(i-1,j,k+1) + mask(i+1,j,k+1) + 1.0 ); + } + // x-valid, yhi, zhi + } else if ( ( i >= lo.x) && ( i <= hi.x ) && + ( j == hi.y+1 ) && ( k == hi.z+1 ) ) { + if ( ( mask(i-1,j,k) == finecell ) || + ( mask(i+1,j,k) == finecell ) 
|| + ( mask(i,j-1,k) == finecell ) || + ( mask(i,j,k-1) == finecell ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j,k-1) ); + } else { + data(i,j,k,n) = ( mask(i-1,j-1,k) * data(i-1,j-1,k,n) + + mask(i+1,j-1,k) * data(i+1,j-1,k,n) + + mask(i-1,j,k-1) * data(i-1,j,k-1,n) + + mask(i+1,j,k-1) * data(i+1,j,k-1,n) + + data(i,j-1,k-1,n) ) + / ( mask(i-1,j-1,k) + mask(i+1,j-1,k) + mask(i-1,j,k-1) + mask(i+1,j,k-1) + 1.0 ); + } + // Faces + // xlo, y-valid, z-valid + } else if ( ( i == lo.x-1) && + ( j >= lo.y ) && ( j <= hi.y ) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + data(i,j,k,n) = ( data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( 1.0 + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + // xhi, y-valid, z-valid + } else if ( ( i == hi.x+1) && + ( j >= lo.y ) && ( j <= hi.y ) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + data(i,j,k,n) = ( data(i-1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( 1.0 + mask(i,j-1,k) + mask(i,j+1,k) + mask(i,j,k-1) + mask(i,j,k+1) ); + // x-valid, ylo, z-valid + } else if ( ( i >= lo.x ) && ( i <= hi.x ) && + ( j == lo.y-1) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + data(i,j+1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1.0 + mask(i,j,k-1) + mask(i,j,k+1) ); + // x-valid, yhi, z-valid + } else if ( ( i >= lo.x ) && ( i <= hi.x ) && + ( j == hi.y+1) && + ( k >= lo.z ) && ( k <= hi.z ) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + data(i,j-1,k,n) + + mask(i,j,k-1) * data(i,j,k-1,n) + + mask(i,j,k+1) * data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + 1.0 + mask(i,j,k-1) + mask(i,j,k+1) ); + // x-valid, y-valid, zlo + } else if ( ( i >= lo.x ) && ( i <= hi.x ) && + ( j >= lo.y ) && ( j <= hi.y ) && + ( k == lo.z-1) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + data(i,j,k+1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + 1.0 ); + // x-valid, y-valid, zhi + } else if ( ( i >= lo.x ) && ( i <= hi.x ) && + ( j >= lo.y ) && ( j <= hi.y ) && + ( k == hi.z+1) ) { + data(i,j,k,n) = ( mask(i-1,j,k) * data(i-1,j,k,n) + + mask(i+1,j,k) * data(i+1,j,k,n) + + mask(i,j-1,k) * data(i,j-1,k,n) + + mask(i,j+1,k) * data(i,j+1,k,n) + + data(i,j,k-1,n) ) + / ( mask(i-1,j,k) + mask(i+1,j,k) + mask(i,j-1,k) + mask(i,j+1,k) + 1.0 ); + } + } +} + +} +#endif diff --git a/Src/Amr/AMReX_extrapolater_3d.f90 b/Src/Amr/AMReX_extrapolater_3d.f90 deleted file mode 100644 index 6add0d72398..00000000000 --- a/Src/Amr/AMReX_extrapolater_3d.f90 +++ /dev/null @@ -1,656 +0,0 @@ - -module amrex_extrapolater - - use amrex_fort_module, only : amrex_real - - implicit none - integer, parameter :: finecell = 1 ! must be consistent with Extrapolater.H - integer, parameter :: crsecell = 0 - - ! The value of msk is either 0 or 1. 
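The C++ kernel above and the Fortran file being deleted implement the same rule: a coarse ghost cell (mask == 0) receives the average of those neighbors that hold fine data (mask == 1), with the 0/1 mask itself serving as the weight. A minimal standalone sketch of the face case, using illustrative names rather than the AMReX API:

// Sketch, not the AMReX API: one ghost cell on a face of the valid region.
// The neighbor pointing back into the valid region is always usable
// (weight 1); the other four neighbors contribute only where the 0/1 mask
// marks them as fine cells.
double face_ghost_average (double interior, const double nbr[4], const int msk[4])
{
    double num = interior;
    double den = 1.0;
    for (int m = 0; m < 4; ++m) {
        num += msk[m] * nbr[m];   // mask-weighted sum of optional neighbors
        den += msk[m];            // count of contributing neighbors
    }
    return num / den;             // den >= 1 by construction, so always well defined
}

Edge and corner ghost cells have no neighbor that is guaranteed valid, which is why the kernels first test whether any face neighbor is fine and only then fall back to diagonal neighbors, and ultimately to the single cell diagonally opposite the corner.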
- -contains - - subroutine amrex_first_order_extrap (u, ulo, uhi, nu, msk, mlo, mhi, lo, hi, sc, nc) & - bind(c,name='amrex_first_order_extrap') - - integer, intent(in) :: ulo(3), uhi(3), nu, mlo(3), mhi(3), lo(3), hi(3), sc, nc - real(amrex_real), intent(inout) :: u(ulo(1):uhi(1),ulo(2):uhi(2),ulo(3):uhi(3),0:nu-1) - integer , intent(in) :: msk(mlo(1):mhi(1),mlo(2):mhi(2),mlo(3):mhi(3)) - - integer :: i, j, k, n - - do n = sc, sc+nc-1 - ! set all crse cells to zero first - do k = lo(3)-1, hi(3)+1 - do j = lo(2)-1, hi(2)+1 - do i = lo(1)-1, hi(1)+1 - if (msk(i,j,k) .eq. crsecell) then - u(i,j,k,n) = 0.d0 - end if - end do - end do - end do - - ! z-lo, y-lo, x-lo - i = lo(1)-1 - j = lo(2)-1 - k = lo(3)-1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i+1,j,k) + msk(i,j+1,k) + msk(i,j,k+1)) - - else if ( msk(i+1,j+1,k) .eq. finecell & - .or. msk(i+1,j,k+1) .eq. finecell & - .or. msk(i,j+1,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j+1,k) * u(i+1,j+1,k,n) & - + msk(i+1,j,k+1) * u(i+1,j,k+1,n) & - + msk(i,j+1,k+1) * u(i,j+1,k+1,n)) & - / (msk(i+1,j+1,k) + msk(i+1,j,k+1) + msk(i,j+1,k+1)) - else - u(i,j,k,n) = u(i+1,j+1,k+1,n) - end if - end if - - ! z-lo, y-lo, x-valid - j = lo(2)-1 - k = lo(3)-1 - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i+1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i-1,j,k) + msk(i+1,j,k) + msk(i,j+1,k) + msk(i,j,k+1)) - else - u(i,j,k,n) = (msk(i-1,j+1,k) * u(i-1,j+1,k,n) & - + msk(i+1,j+1,k) * u(i+1,j+1,k,n) & - + msk(i-1,j,k+1) * u(i-1,j,k+1,n) & - + msk(i+1,j,k+1) * u(i+1,j,k+1,n) & - + u(i,j+1,k+1,n)) / & - (msk(i-1,j+1,k) + msk(i+1,j+1,k) + msk(i-1,j,k+1) + msk(i+1,j,k+1) + 1) - end if - end if - end do - - ! z-lo, y-lo, x-hi - i = hi(1)+1 - j = lo(2)-1 - k = lo(3)-1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i-1,j,k) + msk(i,j+1,k) + msk(i,j,k+1)) - - else if ( msk(i-1,j+1,k) .eq. finecell & - .or. msk(i-1,j,k+1) .eq. finecell & - .or. msk(i,j+1,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j+1,k) * u(i-1,j+1,k,n) & - + msk(i-1,j,k+1) * u(i-1,j,k+1,n) & - + msk(i,j+1,k+1) * u(i,j+1,k+1,n)) & - / (msk(i-1,j+1,k) + msk(i-1,j,k+1) + msk(i,j+1,k+1)) - else - u(i,j,k,n) = u(i-1,j+1,k+1,n) - end if - end if - - ! z-lo, y-valid, x-lo - i = lo(1)-1 - k = lo(3)-1 - do j = lo(2), hi(2) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. 
finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) / & - (msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j+1,k) + msk(i,j,k+1)) - else - u(i,j,k,n) = (msk(i+1,j-1,k) * u(i+1,j-1,k,n) & - + msk(i+1,j+1,k) * u(i+1,j+1,k,n) & - + u(i+1,j,k+1,n) & - + msk(i,j-1,k+1) * u(i,j-1,k+1,n) & - + msk(i,j+1,k+1) * u(i,j+1,k+1,n)) / & - (msk(i+1,j-1,k) + msk(i+1,j+1,k) + 1 + msk(i,j-1,k+1) + msk(i,j+1,k+1)) - end if - end if - end do - - ! z-lo, y-valid, x-valid - k = lo(3)-1 - do j = lo(2), hi(2) - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + u(i,j,k+1,n)) / & - (msk(i-1,j,k) + msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j+1,k) + 1) - end if - end do - end do - - ! z-lo, y-valid, x-hi - i = hi(1)+1 - k = lo(3)-1 - do j = lo(2), hi(2) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i-1,j,k) + msk(i,j-1,k) + msk(i,j+1,k) + msk(i,j,k+1)) - else - u(i,j,k,n) = (msk(i-1,j-1,k)*u(i-1,j-1,k,n) & - + msk(i-1,j+1,k)*u(i-1,j+1,k,n) & - + u(i-1,j,k+1,n) & - + msk(i,j-1,k+1)*u(i,j-1,k+1,n) & - + msk(i,j+1,k+1)*u(i,j+1,k+1,n)) / & - (msk(i-1,j-1,k) + msk(i-1,j+1,k) + 1 + msk(i,j-1,k+1) + msk(i,j+1,k+1)) - end if - end if - end do - - ! z-lo, y-hi, x-lo - i = lo(1)-1 - j = hi(2)+1 - k = lo(3)-1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j,k+1)) - - else if ( msk(i+1,j-1,k) .eq. finecell & - .or. msk(i+1,j,k+1) .eq. finecell & - .or. msk(i,j-1,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j-1,k) * u(i+1,j-1,k,n) & - + msk(i+1,j,k+1) * u(i+1,j,k+1,n) & - + msk(i,j-1,k+1) * u(i,j-1,k+1,n)) & - / (msk(i+1,j-1,k) + msk(i+1,j,k+1) + msk(i,j-1,k+1)) - else - u(i,j,k,n) = u(i+1,j-1,k+1,n) - end if - end if - - ! z-lo, y-hi, x-valid - j = hi(2)+1 - k = lo(3)-1 - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i+1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i-1,j,k) + msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j,k+1)) - else - u(i,j,k,n) = (msk(i-1,j-1,k) * u(i-1,j-1,k,n) & - + msk(i+1,j-1,k) * u(i+1,j-1,k,n) & - + msk(i-1,j,k+1) * u(i-1,j,k+1,n) & - + msk(i+1,j,k+1) * u(i+1,j,k+1,n) & - + u(i,j-1,k+1,n)) / & - (msk(i-1,j-1,k) + msk(i+1,j-1,k) + msk(i-1,j,k+1) + msk(i+1,j,k+1) + 1) - end if - end if - end do - - ! z-lo, y-hi, x-hi - i = hi(1)+1 - j = hi(2)+1 - k = lo(3)-1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k+1) .eq. 
finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i-1,j,k) + msk(i,j-1,k) + msk(i,j,k+1)) - - else if ( msk(i-1,j-1,k) .eq. finecell & - .or. msk(i-1,j,k+1) .eq. finecell & - .or. msk(i,j-1,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j-1,k) * u(i-1,j-1,k,n) & - + msk(i-1,j,k+1) * u(i-1,j,k+1,n) & - + msk(i,j-1,k+1) * u(i,j-1,k+1,n)) & - / (msk(i-1,j-1,k) + msk(i-1,j,k+1) + msk(i,j-1,k+1)) - else - u(i,j,k,n) = u(i-1,j-1,k+1,n) - end if - end if - - ! z-valid, y-lo, x-lo - i = lo(1)-1 - j = lo(2)-1 - do k = lo(3), hi(3) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i+1,j,k) + msk(i,j+1,k) + msk(i,j,k-1) + msk(i,j,k+1)) - else - u(i,j,k,n) = ( u(i+1,j+1,k,n) & - + msk(i+1,j,k-1) * u(i+1,j,k-1,n) & - + msk(i+1,j,k+1) * u(i+1,j,k+1,n) & - + msk(i,j+1,k-1) * u(i,j+1,k-1,n) & - + msk(i,j+1,k+1) * u(i,j+1,k+1,n)) / & - (1 + msk(i+1,j,k-1) + msk(i+1,j,k+1) + msk(i,j+1,k-1) + msk(i,j+1,k+1)) - end if - end if - end do - - ! z-valid, y-lo, x-valid - j = lo(2)-1 - do k = lo(3), hi(3) - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) / & - (msk(i-1,j,k) + msk(i+1,j,k) + 1 + msk(i,j,k-1) + msk(i,j,k+1)) - end if - end do - end do - - ! z-valid, y-lo, x-hi - i = hi(1)+1 - j = lo(2)-1 - do k = lo(3), hi(3) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i-1,j,k) + msk(i,j+1,k) + msk(i,j,k-1) + msk(i,j,k+1)) - else - u(i,j,k,n) = ( u(i-1,j+1,k,n) & - + msk(i-1,j,k-1) * u(i-1,j,k-1,n) & - + msk(i-1,j,k+1) * u(i-1,j,k+1,n) & - + msk(i,j+1,k-1) * u(i,j+1,k-1,n) & - + msk(i,j+1,k+1) * u(i,j+1,k+1,n)) / & - (1 + msk(i-1,j,k-1) + msk(i-1,j,k+1) + msk(i,j+1,k-1) + msk(i,j+1,k+1)) - end if - end if - end do - - ! z-valid, y-valid, x-lo - i = lo(1)-1 - do k = lo(3), hi(3) - do j = lo(2), hi(2) - if (msk(i,j,k) .eq. crsecell) then - u(i,j,k,n) = ( u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) / & - (1 + msk(i,j-1,k) + msk(i,j+1,k) + msk(i,j,k-1) + msk(i,j,k+1)) - end if - end do - end do - - ! z-valid, y-valid, x-hi - i = hi(1)+1 - do k = lo(3), hi(3) - do j = lo(2), hi(2) - if (msk(i,j,k) .eq. crsecell) then - u(i,j,k,n) = ( u(i-1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) / & - (1 + msk(i,j-1,k) + msk(i,j+1,k) + msk(i,j,k-1) + msk(i,j,k+1)) - end if - end do - end do - - ! z-valid, y-hi, x-lo - i = lo(1)-1 - j = hi(2)+1 - do k = lo(3), hi(3) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell & - .or. msk(i,j,k+1) .eq. 
finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j,k-1) + msk(i,j,k+1)) - else - u(i,j,k,n) = ( u(i+1,j-1,k,n) & - + msk(i+1,j,k-1) * u(i+1,j,k-1,n) & - + msk(i+1,j,k+1) * u(i+1,j,k+1,n) & - + msk(i,j-1,k-1) * u(i,j-1,k-1,n) & - + msk(i,j-1,k+1) * u(i,j-1,k+1,n)) & - / (1 + msk(i+1,j,k-1) + msk(i+1,j,k+1) + msk(i,j-1,k-1) + msk(i,j-1,k+1)) - end if - end if - end do - - ! z-valid, y-hi, x-valid - j = hi(2)+1 - do k = lo(3), hi(3) - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + u(i,j-1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) / & - (msk(i-1,j,k) + msk(i+1,j,k) + 1 + msk(i,j,k-1) + msk(i,j,k+1)) - end if - end do - end do - - ! z-valid, y-hi, x-hi - i = hi(1)+1 - j = hi(2)+1 - do k = lo(3), hi(3) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell & - .or. msk(i,j,k+1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n) & - + msk(i,j,k+1) * u(i,j,k+1,n)) & - / (msk(i-1,j,k) + msk(i,j-1,k) + msk(i,j,k-1) + msk(i,j,k+1)) - else - u(i,j,k,n) = ( u(i-1,j-1,k,n) & - + msk(i-1,j,k-1) * u(i-1,j,k-1,n) & - + msk(i-1,j,k+1) * u(i-1,j,k+1,n) & - + msk(i,j-1,k-1) * u(i,j-1,k-1,n) & - + msk(i,j-1,k+1) * u(i,j-1,k+1,n)) / & - (1 + msk(i-1,j,k-1) + msk(i-1,j,k+1) + msk(i,j-1,k-1) + msk(i,j-1,k+1)) - end if - end if - end do - - ! z-hi, y-lo, x-lo - i = lo(1)-1 - j = lo(2)-1 - k = hi(3)+1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) & - / (msk(i+1,j,k) + msk(i,j+1,k) + msk(i,j,k-1)) - - else if ( msk(i+1,j+1,k) .eq. finecell & - .or. msk(i+1,j,k-1) .eq. finecell & - .or. msk(i,j+1,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j+1,k) * u(i+1,j+1,k,n) & - + msk(i+1,j,k-1) * u(i+1,j,k-1,n) & - + msk(i,j+1,k-1) * u(i,j+1,k-1,n)) & - / (msk(i+1,j+1,k) + msk(i+1,j,k-1) + msk(i,j+1,k-1)) - else - u(i,j,k,n) = u(i+1,j+1,k-1,n) - end if - end if - - ! z-hi, y-lo, x-valid - j = lo(2)-1 - k = hi(3)+1 - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i+1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) & - / (msk(i-1,j,k) + msk(i+1,j,k) + msk(i,j+1,k) + msk(i,j,k-1)) - else - u(i,j,k,n) = (msk(i-1,j+1,k) * u(i-1,j+1,k,n) & - + msk(i+1,j+1,k) * u(i+1,j+1,k,n) & - + msk(i-1,j,k-1) * u(i-1,j,k-1,n) & - + msk(i+1,j,k-1) * u(i+1,j,k-1,n) & - + u(i,j+1,k-1,n)) / & - (msk(i-1,j+1,k) + msk(i+1,j+1,k) + msk(i-1,j,k-1) + msk(i+1,j,k-1) + 1) - end if - end if - end do - - ! z-hi, y-lo, x-hi - i = hi(1)+1 - j = lo(2)-1 - k = hi(3)+1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. 
finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) & - / (msk(i-1,j,k) + msk(i,j+1,k) + msk(i,j,k-1)) - - else if ( msk(i-1,j+1,k) .eq. finecell & - .or. msk(i-1,j,k-1) .eq. finecell & - .or. msk(i,j+1,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j+1,k) * u(i-1,j+1,k,n) & - + msk(i-1,j,k-1) * u(i-1,j,k-1,n) & - + msk(i,j+1,k-1) * u(i,j+1,k-1,n)) & - / (msk(i-1,j+1,k) + msk(i-1,j,k-1) + msk(i,j+1,k-1)) - else - u(i,j,k,n) = u(i-1,j+1,k-1,n) - end if - end if - - ! z-hi, y-valid, x-lo - i = lo(1)-1 - k = hi(3)+1 - do j = lo(2), hi(2) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) / & - (msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j+1,k) + msk(i,j,k-1)) - else - u(i,j,k,n) = (msk(i+1,j-1,k) * u(i+1,j-1,k,n) & - + msk(i+1,j+1,k) * u(i+1,j+1,k,n) & - + u(i+1,j,k-1,n) & - + msk(i,j-1,k-1) * u(i,j-1,k-1,n) & - + msk(i,j+1,k-1) * u(i,j+1,k-1,n)) / & - (msk(i+1,j-1,k) + msk(i+1,j+1,k) + 1 + msk(i,j-1,k-1) + msk(i,j+1,k-1)) - end if - end if - end do - - ! z-hi, y-valid, x-valid - k = hi(3)+1 - do j = lo(2), hi(2) - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + u(i,j,k-1,n)) / & - (msk(i-1,j,k) + msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j+1,k) + 1) - end if - end do - end do - - ! z-hi, y-valid, x-hi - i = hi(1)+1 - k = hi(3)+1 - do j = lo(2), hi(2) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j+1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j+1,k) * u(i,j+1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) / & - (msk(i-1,j,k) + msk(i,j-1,k) + msk(i,j+1,k) + msk(i,j,k-1)) - else - u(i,j,k,n) = (msk(i-1,j-1,k) * u(i-1,j-1,k,n) & - + msk(i-1,j+1,k) * u(i-1,j+1,k,n) & - + u(i-1,j,k-1,n) & - + msk(i,j-1,k-1) * u(i,j-1,k-1,n) & - + msk(i,j+1,k-1) * u(i,j+1,k-1,n)) / & - (msk(i-1,j-1,k) + msk(i-1,j+1,k) + 1 + msk(i,j-1,k-1) + msk(i,j+1,k-1)) - end if - end if - end do - - ! z-hi, y-hi, x-lo - i = lo(1)-1 - j = hi(2)+1 - k = hi(3)+1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i+1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) & - / (msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j,k-1)) - - else if ( msk(i+1,j-1,k) .eq. finecell & - .or. msk(i+1,j,k-1) .eq. finecell & - .or. msk(i,j-1,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i+1,j-1,k) * u(i+1,j-1,k,n) & - + msk(i+1,j,k-1) * u(i+1,j,k-1,n) & - + msk(i,j-1,k-1) * u(i,j-1,k-1,n)) & - / (msk(i+1,j-1,k) + msk(i+1,j,k-1) + msk(i,j-1,k-1)) - else - u(i,j,k,n) = u(i+1,j-1,k-1,n) - end if - end if - - ! z-hi, y-hi, x-valid - j = hi(2)+1 - k = hi(3)+1 - do i = lo(1), hi(1) - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i+1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. 
finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i+1,j,k) * u(i+1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) & - / (msk(i-1,j,k) + msk(i+1,j,k) + msk(i,j-1,k) + msk(i,j,k-1)) - else - u(i,j,k,n) = (msk(i-1,j-1,k) * u(i-1,j-1,k,n) & - + msk(i+1,j-1,k) * u(i+1,j-1,k,n) & - + msk(i-1,j,k-1) * u(i-1,j,k-1,n) & - + msk(i+1,j,k-1) * u(i+1,j,k-1,n) & - + u(i,j-1,k-1,n)) / & - (msk(i-1,j-1,k) + msk(i+1,j-1,k) + msk(i-1,j,k-1) + msk(i+1,j,k-1) + 1) - end if - end if - end do - - ! z-hi, y-hi, x-hi - i = hi(1)+1 - j = hi(2)+1 - k = hi(3)+1 - if (msk(i,j,k) .eq. crsecell) then - if ( msk(i-1,j,k) .eq. finecell & - .or. msk(i,j-1,k) .eq. finecell & - .or. msk(i,j,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j,k) * u(i-1,j,k,n) & - + msk(i,j-1,k) * u(i,j-1,k,n) & - + msk(i,j,k-1) * u(i,j,k-1,n)) & - / (msk(i-1,j,k) + msk(i,j-1,k) + msk(i,j,k-1)) - - else if ( msk(i-1,j-1,k) .eq. finecell & - .or. msk(i-1,j,k-1) .eq. finecell & - .or. msk(i,j-1,k-1) .eq. finecell) then - - u(i,j,k,n) = (msk(i-1,j-1,k) * u(i-1,j-1,k,n) & - + msk(i-1,j,k-1) * u(i-1,j,k-1,n) & - + msk(i,j-1,k-1) * u(i,j-1,k-1,n)) & - / (msk(i-1,j-1,k) + msk(i-1,j,k-1) + msk(i,j-1,k-1)) - else - u(i,j,k,n) = u(i-1,j-1,k-1,n) - end if - end if - end do - - end subroutine amrex_first_order_extrap - -end module amrex_extrapolater diff --git a/Src/Amr/AMReX_extrapolater_K.H b/Src/Amr/AMReX_extrapolater_K.H new file mode 100644 index 00000000000..2552bb069c5 --- /dev/null +++ b/Src/Amr/AMReX_extrapolater_K.H @@ -0,0 +1,16 @@ +#ifndef AMReX_extrapolater_K_H_ +#define AMReX_extrapolater_K_H_ + +#include +#include +#include + +#if (AMREX_SPACEDIM == 1) +#include +#elif (AMREX_SPACEDIM == 2) +#include +#else +#include +#endif + +#endif diff --git a/Src/Amr/CMakeLists.txt b/Src/Amr/CMakeLists.txt index 78ac127f65f..b5d16891dde 100644 --- a/Src/Amr/CMakeLists.txt +++ b/Src/Amr/CMakeLists.txt @@ -16,12 +16,8 @@ target_sources(amrex AMReX_AuxBoundaryData.H AMReX_StateDescriptor.cpp AMReX_AuxBoundaryData.cpp + AMReX_Extrapolater.H + AMReX_Extrapolater.cpp + AMReX_extrapolater_K.H + AMReX_extrapolater_${AMReX_SPACEDIM}D_K.H ) - -if (ENABLE_FORTRAN) - target_sources(amrex - PRIVATE - AMReX_Extrapolater.H - AMReX_Extrapolater.cpp - AMReX_extrapolater_${DIM}d.f90 ) -endif () diff --git a/Src/Amr/Make.package b/Src/Amr/Make.package index 42595197e4d..12a0cf0b8c4 100644 --- a/Src/Amr/Make.package +++ b/Src/Amr/Make.package @@ -1,17 +1,11 @@ AMRLIB_BASE=EXE -C$(AMRLIB_BASE)_sources += AMReX_Amr.cpp AMReX_AmrLevel.cpp AMReX_AsyncFillPatch.cpp AMReX_Derive.cpp AMReX_StateData.cpp \ - AMReX_StateDescriptor.cpp AMReX_AuxBoundaryData.cpp +C$(AMRLIB_BASE)_sources += AMReX_Amr.cpp AMReX_AmrLevel.cpp AMReX_Derive.cpp AMReX_StateData.cpp \ + AMReX_StateDescriptor.cpp AMReX_AuxBoundaryData.cpp AMReX_Extrapolater.cpp C$(AMRLIB_BASE)_headers += AMReX_Amr.H AMReX_AmrLevel.H AMReX_Derive.H AMReX_LevelBld.H AMReX_StateData.H \ - AMReX_StateDescriptor.H AMReX_PROB_AMR_F.H AMReX_AuxBoundaryData.H - -ifneq ($(BL_NO_FORT),TRUE) - f90$(AMRLIB_BASE)_sources += AMReX_extrapolater_$(DIM)d.f90 - C$(AMRLIB_BASE)_sources += AMReX_Extrapolater.cpp - C$(AMRLIB_BASE)_headers += AMReX_Extrapolater.H -endif + AMReX_StateDescriptor.H AMReX_PROB_AMR_F.H AMReX_AuxBoundaryData.H AMReX_Extrapolater.H AMReX_extrapolater_K.H AMReX_extrapolater_$(DIM)D_K.H VPATH_LOCATIONS += $(AMREX_HOME)/Src/Amr INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Amr diff --git a/Src/AmrCore/AMReX_AmrCore.cpp b/Src/AmrCore/AMReX_AmrCore.cpp index 
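The AMReX_AmrCore.cpp hunk that follows replaces the 100.0_rt user-defined literal with an explicit Real(100.0) cast. The two spellings denote the same value; a plausible motivation, not stated in the diff, is to avoid depending on the _rt literal being visible to (and supported by) every host and device compiler. A minimal sketch, with Real standing in for amrex::Real:

#include <cstdio>

using Real = double;   // float when the library is built in single precision

int main ()
{
    long ncells = 512;
    double ntot = 4096.0;
    Real frac = Real(100.0) * (Real(ncells) / ntot);  // same value as 100.0_rt*(...)
    std::printf("%g%%\n", frac);                      // prints 12.5%
    return 0;
}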
5e3fca57267..f94142a0e68 100644 --- a/Src/AmrCore/AMReX_AmrCore.cpp +++ b/Src/AmrCore/AMReX_AmrCore.cpp @@ -127,7 +127,7 @@ AmrCore::printGridSummary (std::ostream& os, int min_lev, int max_lev) const noe int numgrid = bs.size(); Long ncells = bs.numPts(); double ntot = Geom(lev).Domain().d_numPts(); - Real frac = 100.0_rt*(Real(ncells) / ntot); + Real frac = Real(100.0)*(Real(ncells) / ntot); os << " Level " << lev diff --git a/Src/AmrCore/AMReX_AmrMesh.H b/Src/AmrCore/AMReX_AmrMesh.H index c79a0ebe07a..65b5cad8f76 100644 --- a/Src/AmrCore/AMReX_AmrMesh.H +++ b/Src/AmrCore/AMReX_AmrMesh.H @@ -249,6 +249,7 @@ protected: Vector grids; unsigned int num_setdm = 0; + unsigned int num_setba = 0; void checkInput(); diff --git a/Src/AmrCore/AMReX_AmrMesh.cpp b/Src/AmrCore/AMReX_AmrMesh.cpp index ff9b290c1d3..1aeda59d833 100644 --- a/Src/AmrCore/AMReX_AmrMesh.cpp +++ b/Src/AmrCore/AMReX_AmrMesh.cpp @@ -380,6 +380,7 @@ AmrMesh::SetDistributionMap (int lev, const DistributionMapping& dmap_in) noexce void AmrMesh::SetBoxArray (int lev, const BoxArray& ba_in) noexcept { + ++num_setba; if (grids[lev] != ba_in) grids[lev] = ba_in; } @@ -502,17 +503,16 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& Vector p_n(max_level); // Proper nesting domain. Vector p_n_comp(max_level); // Complement proper nesting domain. - BoxList bl(grids[lbase]); - bl.simplify(); + BoxList bl = grids[lbase].simplified_list(); bl.coarsen(bf_lev[lbase]); - p_n_comp[lbase].complementIn(pc_domain[lbase],bl); + p_n_comp[lbase].parallelComplementIn(pc_domain[lbase],bl); p_n_comp[lbase].simplify(); p_n_comp[lbase].accrete(n_proper); if (geom[lbase].isAnyPeriodic()) { ProjPeriodic(p_n_comp[lbase], pc_domain[lbase], geom[lbase].isPeriodic()); } - p_n[lbase].complementIn(pc_domain[lbase],p_n_comp[lbase]); + p_n[lbase].parallelComplementIn(pc_domain[lbase],p_n_comp[lbase]); p_n[lbase].simplify(); bl.clear(); @@ -530,7 +530,7 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& ProjPeriodic(p_n_comp[i], pc_domain[i], geom[i].isPeriodic()); } - p_n[i].complementIn(pc_domain[i],p_n_comp[i]); + p_n[i].parallelComplementIn(pc_domain[i],p_n_comp[i]); p_n[i].simplify(); } @@ -550,12 +550,12 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& if (levf < new_finest) { - BoxArray ba_proj(new_grids[levf+1]); + BoxArray ba_proj = new_grids[levf+1].simplified(); ba_proj.coarsen(ref_ratio[levf]); ba_proj.growcoarsen(n_proper, ref_ratio[levc]); - BoxArray levcBA = grids[levc]; + BoxArray levcBA = grids[levc].simplified(); while (!levcBA.contains(ba_proj)) { @@ -594,8 +594,7 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& // Replace this by n_error_buf that may be anisotropic // int nerr = n_error_buf[levf]; - BoxList bl_tagged(new_grids[levf+1]); - bl_tagged.simplify(); + BoxList bl_tagged = new_grids[levf+1].simplified_list(); bl_tagged.coarsen(ref_ratio[levf]); // // This grows the boxes by n_error_buf[levf][idir] if they touch the edge @@ -617,7 +616,7 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& } Box mboxF = amrex::grow(bl_tagged.minimalBox(),1); BoxList blFcomp; - blFcomp.complementIn(mboxF,bl_tagged); + blFcomp.parallelComplementIn(mboxF,bl_tagged); blFcomp.simplify(); bl_tagged.clear(); @@ -626,9 +625,8 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& n_error_buf[levf][2]/ref_ratio[levf][2])); blFcomp.accrete(iv); BoxList blF; - blF.complementIn(mboxF,blFcomp); - BoxArray baF(blF); - blF.clear(); + 
blF.parallelComplementIn(mboxF,blFcomp); + BoxArray baF(std::move(blF)); baF.grow(n_proper); // // We need to do this in case the error buffering at @@ -682,10 +680,10 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& // // Map tagged points through periodic boundaries, if any. // - tags.mapPeriodic(Geometry(pc_domain[levc], - Geom(levc).ProbDomain(), - Geom(levc).CoordInt(), - Geom(levc).isPeriodic())); + tags.mapPeriodicRemoveDuplicates(Geometry(pc_domain[levc], + Geom(levc).ProbDomain(), + Geom(levc).CoordInt(), + Geom(levc).isPeriodic())); // // Remove cells outside proper nesting domain for this level. // @@ -704,62 +702,68 @@ AmrMesh::MakeNewGrids (int lbase, Real time, int& new_finest, Vector& // if ( !(useFixedCoarseGrids() && levc0) { - if ( !(Geom(levc).Domain().contains(BoxArray(new_bx).minimalBox())) ) { - // Chop new grids outside domain, note that this is likely to result in - // new grids that violate blocking_factor....see warning checking below - new_bx = amrex::intersect(new_bx,Geom(levc).Domain()); - } - } - const IntVect& largest_grid_size = max_grid_size[levf] / ref_ratio[levc]; - // - // Ensure new grid boxes are at most max_grid_size in index dirs. - // - new_bx.maxSize(largest_grid_size); + if (levf > useFixedUpToLevel()) { + BoxList new_bx; + if (ParallelDescriptor::IOProcessor()) { + BL_PROFILE("AmrMesh-cluster"); + // + // Construct initial cluster. + // + ClusterList clist(&tagvec[0], tagvec.size()); + if (use_new_chop) { + clist.new_chop(grid_eff); + } else { + clist.chop(grid_eff); + } + BoxDomain bd; + bd.add(p_n[levc]); + clist.intersect(bd); + bd.clear(); + // + // Efficient properly nested Clusters have been constructed + // now generate list of grids at level levf. + // + clist.boxList(new_bx); + new_bx.refine(bf_lev[levc]); + new_bx.simplify(); + + if (new_bx.size()>0) { + // Chop new grids outside domain + new_bx.intersect(Geom(levc).Domain()); + } + } + new_bx.Bcast(); // Broadcast the new BoxList to other processes - // - // Refine up to levf. - // - new_bx.refine(ref_ratio[levc]); - BL_ASSERT(new_bx.isDisjoint()); + // + // Refine up to levf. + // + new_bx.refine(ref_ratio[levc]); + BL_ASSERT(new_bx.isDisjoint()); - if (new_bx.size()>0) { - if ( !(Geom(levf).Domain().contains(BoxArray(new_bx).minimalBox())) ) { - new_bx = amrex::intersect(new_bx,Geom(levf).Domain()); - } + new_grids[levf] = BoxArray(std::move(new_bx), max_grid_size[levf]); } + } + } - if(levf > useFixedUpToLevel()) { - new_grids[levf].define(new_bx); - } +#ifdef AMREX_DEBUG + if (!useFixedCoarseGrids()) { + // check proper nesting + for (int lev = lbase+1; lev <= new_finest; ++lev) { + BoxArray const& cba = (lev == lbase+1) ? 
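In the MakeNewGrids rewrite above, the expensive clustering now runs only on the I/O processor, and the resulting BoxList is then broadcast to the other ranks via new_bx.Bcast(). A minimal sketch of that owner-computes/broadcast pattern in raw MPI, with a plain integer vector standing in for the flattened BoxList (this is not BoxList::Bcast itself):

#include <mpi.h>
#include <vector>

std::vector<int> owner_computes_then_bcast (MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);
    std::vector<int> payload;
    if (rank == 0) {
        payload = {1, 2, 3, 4};   // the expensive work happens on rank 0 only
    }
    int n = static_cast<int>(payload.size());
    MPI_Bcast(&n, 1, MPI_INT, 0, comm);               // first the size ...
    payload.resize(n);
    MPI_Bcast(payload.data(), n, MPI_INT, 0, comm);   // ... then the data
    return payload;
}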
grids[lev-1] : new_grids[lev-1]; + BoxArray const& fba = amrex::coarsen(new_grids[lev],ref_ratio[lev-1]); + IntVect np = bf_lev[lev-1] * n_proper; + Box const& cdomain = Geom(lev-1).Domain(); + for (int i = 0, N = fba.size(); i < N; ++i) { + Box const& fb = amrex::grow(fba[i],np) & cdomain; + if (!cba.contains(fb,true)) { + amrex::Abort("AmrMesh::MakeNewGrids: new grids not properly nested"); + } + } } } +#endif for (int lev = lbase+1; lev <= new_finest; ++lev) { if (new_grids[lev].empty()) @@ -788,10 +792,13 @@ AmrMesh::MakeNewGrids (Real time) const BoxArray& ba = MakeBaseGrids(); DistributionMapping dm(ba); const auto old_num_setdm = num_setdm; + const auto old_num_setba = num_setba; MakeNewLevelFromScratch(0, time, ba, dm); - SetBoxArray(0, ba); + if (old_num_setba == num_setba) { + SetBoxArray(0, ba); + } if (old_num_setdm == num_setdm) { SetDistributionMap(0, dm); } @@ -980,6 +987,22 @@ AmrMesh::checkInput () } } + // Make sure TagBoxArray has no overlapped valid cells after coarsening by block_factor/ref_ratio + for (int i = 0; i < max_level; ++i) { + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + int bf_lev = std::max(1,blocking_factor[i+1][idim]/ref_ratio[i][idim]); + int min_grid_size = std::min(blocking_factor[i][idim],max_grid_size[i][idim]); + if (min_grid_size % bf_lev != 0) { + amrex::Print() << "On level " << i << " in direction " << idim + << " max_grid_size is " << max_grid_size[i][idim] + << " blocking factor is " << blocking_factor[i][idim] << "\n" + << "On level " << i+1 << " in direction " << idim + << " blocking_factor is " << blocking_factor[i+1][idim] << std::endl; + amrex::Error("Coarse level blocking factor not a multiple of fine level blocking factor divided by ref ratio"); + } + } + } + if( ! (Geom(0).ProbDomain().volume() > 0.0) ) { amrex::Error("Amr::checkInput: bad physical problem size"); } diff --git a/Src/AmrCore/AMReX_AmrParGDB.H b/Src/AmrCore/AMReX_AmrParGDB.H index 1c189d45a95..2d7093a2bd5 100644 --- a/Src/AmrCore/AMReX_AmrParGDB.H +++ b/Src/AmrCore/AMReX_AmrParGDB.H @@ -13,25 +13,39 @@ public: explicit AmrParGDB (AmrCore* amr) noexcept : m_amrcore(amr), + m_geom(amr->maxLevel()+1), + m_has_geom(amr->maxLevel()+1, 0), m_dmap(amr->maxLevel()+1), m_ba(amr->maxLevel()+1) { } virtual ~AmrParGDB () {;} - - virtual const Geometry& Geom (int level) const override; - virtual const DistributionMapping& ParticleDistributionMap - (int level) const override; - virtual const DistributionMapping& DistributionMap - (int level) const override; + + virtual const Geometry& ParticleGeom (int level) const override; + virtual const Geometry& Geom (int level) const override; + + virtual const Vector& ParticleGeom () const override; + virtual const Vector& Geom () const override; + + virtual const DistributionMapping& ParticleDistributionMap (int level) const override; + virtual const DistributionMapping& DistributionMap (int level) const override; + + virtual const Vector& ParticleDistributionMap () const override; + virtual const Vector& DistributionMap () const override; + virtual const BoxArray& ParticleBoxArray (int level) const override; virtual const BoxArray& boxArray (int level) const override; + virtual const Vector& ParticleBoxArray () const override; + virtual const Vector& boxArray () const override; + virtual void SetParticleBoxArray (int level, const BoxArray& new_ba) override; virtual void SetParticleDistributionMap (int level, const DistributionMapping& new_dm) override; + virtual void SetParticleGeometry (int level, const Geometry& new_geom) 
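The new checkInput test above rejects parameter combinations in which the coarse level cannot be tiled compatibly with the next level's blocking factor once that factor is coarsened by the refinement ratio. A worked instance with made-up inputs:

#include <algorithm>
#include <cassert>

int main ()
{
    // Hypothetical inputs: coarse blocking_factor 4, fine blocking_factor 16,
    // ref_ratio 2, coarse max_grid_size 32.
    int bf_crse = 4, bf_fine = 16, ref = 2, mgs = 32;
    int bf_lev = std::max(1, bf_fine / ref);      // = 8
    int min_grid_size = std::min(bf_crse, mgs);   // = 4
    assert(min_grid_size % bf_lev != 0);          // 4 % 8 != 0: checkInput would abort
    return 0;
}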
override; virtual void ClearParticleBoxArray (int level) override; virtual void ClearParticleDistributionMap (int level) override; + virtual void ClearParticleGeometry (int level) override; virtual bool LevelDefined (int level) const override; virtual int finestLevel () const override; @@ -40,21 +54,55 @@ public: virtual IntVect refRatio (int level) const override; virtual int MaxRefRatio (int level) const override; + virtual Vector refRatio () const override; + protected: - AmrCore* m_amrcore; + + AmrCore* m_amrcore; + Vector m_geom; + Vector m_has_geom; Vector m_dmap; Vector m_ba; }; -inline -const Geometry& +inline +const Geometry& +AmrParGDB::ParticleGeom (int level) const +{ + if (not m_has_geom[level]) { + return m_amrcore->Geom(level); + } else { + return m_geom[level]; + } +} + +inline +const Geometry& AmrParGDB::Geom (int level) const { return m_amrcore->Geom(level); } -inline -const DistributionMapping& +inline +const Vector& +AmrParGDB::ParticleGeom () const +{ + if (not m_has_geom[0]) { + return m_amrcore->Geom(); + } else { + return m_geom; + } +} + +inline +const Vector& +AmrParGDB::Geom () const +{ + return m_amrcore->Geom(); +} + +inline +const DistributionMapping& AmrParGDB::ParticleDistributionMap (int level) const { if (m_dmap[level].empty()) { @@ -64,14 +112,32 @@ AmrParGDB::ParticleDistributionMap (int level) const } } -inline -const DistributionMapping& +inline +const DistributionMapping& AmrParGDB::DistributionMap (int level) const { return m_amrcore->DistributionMap(level); } -inline +inline +const Vector& +AmrParGDB::ParticleDistributionMap () const +{ + if (m_dmap[0].empty()) { + return m_amrcore->DistributionMap(); + } else { + return m_dmap; + } +} + +inline +const Vector& +AmrParGDB::DistributionMap () const +{ + return m_amrcore->DistributionMap(); +} + +inline const BoxArray& AmrParGDB::ParticleBoxArray (int level) const { @@ -89,6 +155,24 @@ AmrParGDB::boxArray (int level) const return m_amrcore->boxArray(level); } +inline +const Vector& +AmrParGDB::ParticleBoxArray () const +{ + if (m_ba[0].empty()) { + return m_amrcore->boxArray(); + } else { + return m_ba; + } +} + +inline +const Vector& +AmrParGDB::boxArray () const +{ + return m_amrcore->boxArray(); +} + inline void AmrParGDB::SetParticleBoxArray (int level, const BoxArray& new_ba) { @@ -101,6 +185,13 @@ void AmrParGDB::SetParticleDistributionMap (int level, const DistributionMapping m_dmap[level] = new_dmap; } +inline +void AmrParGDB::SetParticleGeometry (int level, const Geometry& new_geom) +{ + m_has_geom[level] = 1; + m_geom[level] = new_geom; +} + inline void AmrParGDB::ClearParticleBoxArray (int level) { @@ -113,36 +204,50 @@ void AmrParGDB::ClearParticleDistributionMap (int level) m_dmap[level] = DistributionMapping(); } -inline -bool +inline +void AmrParGDB::ClearParticleGeometry (int level) +{ + m_geom[level] = Geometry(); + m_has_geom[level] = 0; +} + +inline +bool AmrParGDB::LevelDefined (int level) const { return m_amrcore->LevelDefined(level); } -inline -int +inline +int AmrParGDB::finestLevel () const { return m_amrcore->finestLevel(); } -inline -int +inline +int AmrParGDB::maxLevel () const { return m_amrcore->maxLevel(); } - -inline -IntVect + +inline +IntVect AmrParGDB::refRatio (int level) const { return m_amrcore->refRatio(level); } -inline -int +inline +Vector +AmrParGDB::refRatio () const +{ + return m_amrcore->refRatio(); +} + +inline +int AmrParGDB::MaxRefRatio (int level) const { return m_amrcore->MaxRefRatio(level); diff --git a/Src/AmrCore/AMReX_Cluster.cpp 
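The AmrParGDB additions above all follow one pattern: a particle-specific override (Geometry, BoxArray, or DistributionMapping) is consulted first, and the accessor falls back to the AmrCore mesh data when no override has been set. A compact sketch of that pattern with invented names:

#include <cstddef>
#include <vector>

struct MeshData { std::vector<int> geom; };   // stand-in for the AmrCore side

class ParticleView
{
    MeshData* m_mesh;
    std::vector<int>  m_geom;       // per-level overrides
    std::vector<char> m_has_geom;   // 0 = fall back to mesh, 1 = use override
public:
    ParticleView (MeshData* mesh, std::size_t nlev)
        : m_mesh(mesh), m_geom(nlev), m_has_geom(nlev, 0) {}
    void set (int lev, int g) { m_geom[lev] = g; m_has_geom[lev] = 1; }
    void clear (int lev)      { m_geom[lev] = 0; m_has_geom[lev] = 0; }
    int  geom (int lev) const { return m_has_geom[lev] ? m_geom[lev] : m_mesh->geom[lev]; }
};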
b/Src/AmrCore/AMReX_Cluster.cpp index 10030d9995f..fb8bf807cb9 100644 --- a/Src/AmrCore/AMReX_Cluster.cpp +++ b/Src/AmrCore/AMReX_Cluster.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace amrex { @@ -34,6 +35,8 @@ class InBox public: explicit InBox (const Box& b) noexcept : m_box(b) {} + // You might see a compiler warning that this is never referenced. + // The compiler is wrong. bool operator() (const IntVect& iv) const noexcept { return m_box.contains(iv); @@ -246,6 +249,8 @@ class Cut public: Cut (const IntVect& cut, int dir) : m_cut(cut), m_dir(dir) {} + // You might see a compiler warning that this is never referenced. + // The compiler is wrong. bool operator() (const IntVect& iv) const { return iv[m_dir] < m_cut[m_dir]; @@ -522,6 +527,7 @@ ClusterList::boxList (BoxList& blst) const void ClusterList::chop (Real eff) { + BL_PROFILE("ClusterList::chop()"); for (std::list::iterator cli = lst.begin(); cli != lst.end(); ) { @@ -539,6 +545,7 @@ ClusterList::chop (Real eff) void ClusterList::new_chop (Real eff) { + BL_PROFILE("ClusterList::new_chop()"); for (std::list::iterator cli = lst.begin(); cli != lst.end(); ) { @@ -556,6 +563,8 @@ ClusterList::new_chop (Real eff) void ClusterList::intersect (const BoxDomain& dom) { + BL_PROFILE("ClusterList::intersect()"); + // // Make a BoxArray covering dom. // We'll use this to speed up the contains() test below. diff --git a/Src/AmrCore/AMReX_ErrorList.H b/Src/AmrCore/AMReX_ErrorList.H index 9bdbea056bd..2beae107bb0 100644 --- a/Src/AmrCore/AMReX_ErrorList.H +++ b/Src/AmrCore/AMReX_ErrorList.H @@ -8,9 +8,13 @@ #include #include #include +#include +#include +#include namespace amrex { + extern "C" { @@ -372,6 +376,110 @@ private: std::ostream& operator << (std::ostream& os, const ErrorList& elst); + struct AMRErrorTagInfo + { + int m_max_level = 1000; + Real m_min_time = std::numeric_limits::lowest(); + Real m_max_time = std::numeric_limits::max(); + RealBox m_realbox; + + AMRErrorTagInfo& SetMaxLevel (int max_level) noexcept { + m_max_level = max_level; + return *this; + } + AMRErrorTagInfo& SetMinTime (amrex::Real min_time) noexcept { + m_min_time = min_time; + return *this; + } + AMRErrorTagInfo& SetMaxTime (amrex::Real max_time) noexcept { + m_max_time = max_time; + return *this; + } + AMRErrorTagInfo& SetRealBox (const amrex::RealBox& realbox) noexcept { + m_realbox = realbox; + return *this; + } + }; + + class AMRErrorTag + { + public: + + enum TEST {GRAD=0, LESS, GREATER, VORT, BOX, USER}; + + struct UserFunc + { + virtual void operator() (const amrex::Box& bx, + amrex::Array4 const& dat, + amrex::Array4 const& tag, + amrex::Real time, + int level, + char tagval, + char clearval) = 0; + }; + + explicit AMRErrorTag (const AMRErrorTagInfo& info = AMRErrorTagInfo()) noexcept + : m_test(BOX), m_field(std::string()), m_info(info) {m_ngrow = SetNGrow();} + + AMRErrorTag (amrex::Real value, + AMRErrorTag::TEST test, + const std::string& field, + const AMRErrorTagInfo& info = AMRErrorTagInfo()) noexcept + : m_test(test), m_field(field), m_info(info) + { + m_value.resize(info.m_max_level); + for (int i = 0; i < m_value.size(); ++i) { + m_value[i] = value; + } + m_ngrow = SetNGrow(); + } + + AMRErrorTag (amrex::Vector value, + AMRErrorTag::TEST test, + const std::string& field, + const AMRErrorTagInfo& info = AMRErrorTagInfo()) noexcept + : m_test(test), m_field(field), m_info(info) + { + AMREX_ASSERT(value.size() > 0); + m_value.resize(info.m_max_level); + for (int i = 0; i < m_value.size() && i < value.size(); ++i) { + m_value[i] =
value[i]; } + // If the user didn't provide a value for every level, + // assume the last value holds for all higher levels. + for (int i = value.size(); i < m_value.size(); ++i) { + m_value[i] = value[value.size()-1]; + } + m_ngrow = SetNGrow(); + } + + AMRErrorTag (AMRErrorTag::UserFunc* userfunc, + const std::string& field, + int ngrow, + const AMRErrorTagInfo& info = AMRErrorTagInfo()) noexcept + : m_userfunc(userfunc), m_field(field), m_info(info), m_ngrow(ngrow) {} + + virtual void operator() (amrex::TagBoxArray& tb, + const amrex::MultiFab* mf, + char clearval, + char tagval, + amrex::Real time, + int level, + const amrex::Geometry& geom) const noexcept; + + int NGrow() const noexcept {return m_ngrow;} + const std::string& Field () const noexcept {return m_field;} + + protected: + int SetNGrow () const noexcept; + + Vector m_value; + TEST m_test; + UserFunc* m_userfunc = nullptr; + std::string m_field; + AMRErrorTagInfo m_info; + int m_ngrow; + }; } #endif diff --git a/Src/AmrCore/AMReX_ErrorList.cpp b/Src/AmrCore/AMReX_ErrorList.cpp index a41bc1a199f..ae1e1543faf 100644 --- a/Src/AmrCore/AMReX_ErrorList.cpp +++ b/Src/AmrCore/AMReX_ErrorList.cpp @@ -228,4 +228,194 @@ operator << (std::ostream& os, return os; } + static + void + AMRErrorTag_GRAD(const Box& bx, + Array4 const& dat, + Array4 const& tag, + Real threshold, + char tagval) + { + amrex::ParallelFor(bx, + [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept + { + auto ax = amrex::Math::abs(dat(i+1,j,k) - dat(i,j,k)); + ax = amrex::max(ax,amrex::Math::abs(dat(i,j,k) - dat(i-1,j,k))); +#if AMREX_SPACEDIM == 1 + if (ax >= threshold) tag(i,j,k) = tagval; +#else + auto ay = amrex::Math::abs(dat(i,j+1,k) - dat(i,j,k)); + ay = amrex::max(ay,amrex::Math::abs(dat(i,j,k) - dat(i,j-1,k))); +#if AMREX_SPACEDIM > 2 + auto az = amrex::Math::abs(dat(i,j,k+1) - dat(i,j,k)); + az = amrex::max(az,amrex::Math::abs(dat(i,j,k) - dat(i,j,k-1))); +#endif + if (amrex::max(AMREX_D_DECL(ax,ay,az)) >= threshold) { + tag(i,j,k) = tagval; + } +#endif + }); + } + + int + AMRErrorTag::SetNGrow () const noexcept + { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_test != USER, "Do not call SetNGrow with USER test"); + static std::map ng = { {GRAD,1}, {LESS,0}, {GREATER,0}, {VORT,0}, {BOX,0} }; + return ng[m_test]; + } + + static + void + AMRErrorTag_LESS(const Box& bx, + Array4 const& dat, + Array4 const& tag, + Real threshold, + char tagval) noexcept + { + amrex::ParallelFor(bx, + [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept + { + if (dat(i,j,k) <= threshold) { + tag(i,j,k) = tagval; + } + }); + } + + static + void + AMRErrorTag_GREATER(const Box& bx, + Array4 const& dat, + Array4 const& tag, + Real threshold, + char tagval) noexcept + { + amrex::ParallelFor(bx, + [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept + { + if (dat(i,j,k) >= threshold) { + tag(i,j,k) = tagval; + } + }); + } + + static + void + AMRErrorTag_BOX(const Box& bx, + Array4 const& tag, + const RealBox& tag_rb, + const Geometry& geom, + char tagval) noexcept + { + auto plo = geom.ProbLoArray(); + auto dx = geom.CellSizeArray(); + class RealBox trb(bx,dx.data(),plo.data()); + if (tag_rb.intersects(trb)) + { + amrex::ParallelFor(bx, + [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept + { + GpuArray pt = {{AMREX_D_DECL(plo[0]+(Real(i)+Real(0.5))*dx[0], + plo[1]+(Real(j)+Real(0.5))*dx[1], + plo[2]+(Real(k)+Real(0.5))*dx[2])}}; + if (tag_rb.contains(pt.data())) { + tag(i,j,k) = tagval; + } + }); + } + } + + static + void + AMRErrorTag_VORT(const Box& bx, + Array4
const& dat, + Array4 const& tag, + int level, + Real threshold, + char tagval) noexcept + { + const Real fac = threshold * std::pow(2,level); + amrex::ParallelFor(bx, + [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept + { + if (dat(i,j,k) >= fac) { + tag(i,j,k) = tagval; + } + }); + } + + void + AMRErrorTag::operator() (TagBoxArray& tba, + const MultiFab* mf, + char clearval, + char tagval, + Real time, + int level, + const Geometry& geom) const noexcept + { + BL_PROFILE("AMRErrorTag::operator()"); + + if (m_test == USER) + { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_userfunc!=nullptr,"UserFunc not properly set in AMRErrorTag"); + +#ifdef _OPENMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(tba,TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const auto& bx = mfi.tilebox(); + auto const& dat = mf->array(mfi); + auto tag = tba.array(mfi); + (*m_userfunc)(bx,dat,tag,time,level,tagval,clearval); + } + } + else + { + if ((level < m_info.m_max_level) && + (time >= m_info.m_min_time ) && + (time <= m_info.m_max_time ) ) + { + +#ifdef _OPENMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(tba,TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const auto& bx = mfi.tilebox(); + auto tag = tba.array(mfi); + + if (m_test == BOX) + { + AMRErrorTag_BOX(bx, tag, m_info.m_realbox, geom, tagval); + } + else + { + auto const& dat = mf->array(mfi); + + if (m_test == GRAD) + { + AMRErrorTag_GRAD(bx, dat, tag, m_value[level], tagval); + } + else if (m_test == LESS) + { + AMRErrorTag_LESS(bx, dat, tag, m_value[level], tagval); + } + else if (m_test == GREATER) + { + AMRErrorTag_GREATER(bx, dat, tag, m_value[level], tagval); + } + else if (m_test == VORT) + { + AMRErrorTag_VORT(bx, dat, tag, level, m_value[level], tagval); + } + else + { + Abort("Bad AMRErrorTag test flag"); + } + } + } + } + } + } } diff --git a/Src/AmrCore/AMReX_FillPatchUtil.H b/Src/AmrCore/AMReX_FillPatchUtil.H index ccb3c47f213..751c5369f3d 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil.H +++ b/Src/AmrCore/AMReX_FillPatchUtil.H @@ -26,7 +26,7 @@ namespace amrex template struct NullInterpHook { - void operator() (FAB& fab, const Box& bx, int icomp, int ncomp) const {} + void operator() (FAB& /*fab*/, const Box& /*bx*/, int /*icomp*/, int /*ncomp*/) const {} }; template diff --git a/Src/AmrCore/AMReX_FillPatchUtil.cpp b/Src/AmrCore/AMReX_FillPatchUtil.cpp index 688353b2946..0dd68cb8955 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil.cpp +++ b/Src/AmrCore/AMReX_FillPatchUtil.cpp @@ -15,8 +15,8 @@ namespace amrex int ref_ratio) { InterpCrseFineBndryEMfield(interp_type, - {AMREX_D_DECL(&crse[0],&crse[1],&crse[2])}, - {AMREX_D_DECL(&fine[0],&fine[1],&fine[2])}, + {{AMREX_D_DECL(&crse[0],&crse[1],&crse[2])}}, + {{AMREX_D_DECL(&fine[0],&fine[1],&fine[2])}}, cgeom, fgeom, ref_ratio); } diff --git a/Src/AmrCore/AMReX_FillPatchUtil_I.H b/Src/AmrCore/AMReX_FillPatchUtil_I.H index e841643b401..4dad3b9965f 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil_I.H +++ b/Src/AmrCore/AMReX_FillPatchUtil_I.H @@ -164,34 +164,54 @@ namespace { int>::type = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { - MF mf_crse_patch(fpc.ba_crse_patch, fpc.dm_crse_patch, ncomp, 0, MFInfo(), + MF mf_crse_patch(fpc.ba_crse_patch, fpc.dm_patch, ncomp, 0, MFInfo(), *fpc.fact_crse_patch); return mf_crse_patch; } + template ::value, + int>::type = 0> + MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp) + { + MF mf_fine_patch(fpc.ba_fine_patch, fpc.dm_patch, ncomp, 0, MFInfo(), + 
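A usage sketch for the AMRErrorTag machinery defined above. The field name, threshold, and box are invented for illustration; the call at the end follows the operator() signature declared in AMReX_ErrorList.H:

// Tag cells where a (hypothetical) "density" field exceeds 2.0, but only
// below level 3 and inside a sub-box of the domain.
AMRErrorTagInfo info;
info.SetMaxLevel(3)
    .SetRealBox(RealBox(AMREX_D_DECL(0.0, 0.0, 0.0),
                        AMREX_D_DECL(0.5, 0.5, 0.5)));
AMRErrorTag err_tag(2.0, AMRErrorTag::GREATER, "density", info);

// Later, e.g. from an ErrorEst() implementation, with mf holding the field:
//   err_tag(tags, &mf, TagBox::CLEAR, TagBox::SET, time, level, geom);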
*fpc.fact_fine_patch); + return mf_fine_patch; + } + template ::value, int>::type = 0> MF make_mf_crse_patch (FabArrayBase::FPinfo const& fpc, int ncomp) { - return MF(fpc.ba_crse_patch, fpc.dm_crse_patch, ncomp, 0); + return MF(fpc.ba_crse_patch, fpc.dm_patch, ncomp, 0); + } + + template ::value, + int>::type = 0> + MF make_mf_fine_patch (FabArrayBase::FPinfo const& fpc, int ncomp) + { + return MF(fpc.ba_fine_patch, fpc.dm_patch, ncomp, 0); } template ::value, int>::type = 0> - void mf_set_domain_bndry (MF &mf, Geometry const & cgeom) + void mf_set_domain_bndry (MF &mf, Geometry const & geom) { - mf.setDomainBndry(std::numeric_limits::quiet_NaN(), cgeom); + mf.setDomainBndry(std::numeric_limits::quiet_NaN(), geom); } template ::value, int>::type = 0> - void mf_set_domain_bndry (MF &mf, Geometry const & cgeom) + void mf_set_domain_bndry (MF &/*mf*/, Geometry const & /*geom*/) { // nothing } @@ -199,88 +219,73 @@ namespace { template EnableIf_t::value> FillPatchTwoLevels_doit (MF& mf, IntVect const& nghost, Real time, - const Vector& cmf, const Vector& ct, - const Vector& fmf, const Vector& ft, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - BC& cbc, int cbccomp, + const Vector& cmf, const Vector& ct, + const Vector& fmf, const Vector& ft, + int scomp, int dcomp, int ncomp, + const Geometry& cgeom, const Geometry& fgeom, + BC& cbc, int cbccomp, BC& fbc, int fbccomp, - const IntVect& ratio, - Interp* mapper, + const IntVect& ratio, + Interp* mapper, const Vector& bcs, int bcscomp, const PreInterpHook& pre_interp, const PostInterpHook& post_interp, EB2::IndexSpace const* index_space) { - BL_PROFILE("FillPatchTwoLevels"); + BL_PROFILE("FillPatchTwoLevels"); using FAB = typename MF::FABType::value_type; - if (nghost.max() > 0 || mf.getBDKey() != fmf[0]->getBDKey()) - { - const InterpolaterBoxCoarsener& coarsener = mapper->BoxCoarsener(ratio); - - Box fdomain = fgeom.Domain(); - fdomain.convert(mf.boxArray().ixType()); - Box fdomain_g(fdomain); - for (int i = 0; i < AMREX_SPACEDIM; ++i) { - if (fgeom.isPeriodic(i)) { - fdomain_g.grow(i,nghost[i]); - } - } + if (nghost.max() > 0 || mf.getBDKey() != fmf[0]->getBDKey()) + { + const InterpolaterBoxCoarsener& coarsener = mapper->BoxCoarsener(ratio); - const FabArrayBase::FPinfo& fpc = FabArrayBase::TheFPinfo(*fmf[0], mf, fdomain_g, + const FabArrayBase::FPinfo& fpc = FabArrayBase::TheFPinfo(*fmf[0], mf, nghost, coarsener, - amrex::coarsen(fgeom.Domain(),ratio), + fgeom, + cgeom, index_space); - if ( ! fpc.ba_crse_patch.empty()) - { + if ( ! 
fpc.ba_crse_patch.empty()) + { MF mf_crse_patch = make_mf_crse_patch(fpc, ncomp); mf_set_domain_bndry (mf_crse_patch, cgeom); - FillPatchSingleLevel(mf_crse_patch, time, cmf, ct, scomp, 0, ncomp, cgeom, cbc, cbccomp); + FillPatchSingleLevel(mf_crse_patch, time, cmf, ct, scomp, 0, ncomp, cgeom, cbc, cbccomp); + + MF mf_fine_patch = make_mf_fine_patch(fpc, ncomp); - int idummy1=0, idummy2=0; - bool cc = fpc.ba_crse_patch.ixType().cellCentered(); - ignore_unused(cc); + Box const& fdomain = amrex::convert(fgeom.Domain(),mf.ixType()); + int idummy=0; #ifdef _OPENMP + bool cc = fpc.ba_crse_patch.ixType().cellCentered(); #pragma omp parallel if (cc && Gpu::notInLaunchRegion()) #endif { Vector bcr(ncomp); - for (MFIter mfi(mf_crse_patch); mfi.isValid(); ++mfi) + for (MFIter mfi(mf_fine_patch); mfi.isValid(); ++mfi) { FAB& sfab = mf_crse_patch[mfi]; - int li = mfi.LocalIndex(); - int gi = fpc.dst_idxs[li]; - FAB& dfab = mf[gi]; - const Box& dbx = fpc.dst_boxes[li] & dfab.box(); + FAB& dfab = mf_fine_patch[mfi]; + const Box& dbx = dfab.box(); amrex::setBC(dbx,fdomain,bcscomp,0,ncomp,bcs,bcr); pre_interp(sfab, sfab.box(), 0, ncomp); - mapper->interp(sfab, - 0, - dfab, - dcomp, - ncomp, - dbx, - ratio, - cgeom, - fgeom, - bcr, - idummy1, idummy2, RunOn::Gpu); - - post_interp(dfab, dbx, dcomp, ncomp); + mapper->interp(sfab, 0, dfab, 0, ncomp, dbx, ratio, + cgeom, fgeom, bcr, dcomp, idummy, RunOn::Gpu); + + post_interp(dfab, dbx, 0, ncomp); } } + + mf.ParallelCopy(mf_fine_patch, 0, dcomp, ncomp, IntVect{0}, nghost); } } - FillPatchSingleLevel(mf, nghost, time, fmf, ft, scomp, dcomp, ncomp, + FillPatchSingleLevel(mf, nghost, time, fmf, ft, scomp, dcomp, ncomp, fgeom, fbc, fbccomp); } } @@ -445,12 +450,16 @@ InterpFromCoarseLevel (MF& mf, IntVect const& nghost, Real time, } } + MF mf_crse_patch; #ifdef AMREX_USE_EB - auto factory = makeEBFabFactory(cgeom, ba_crse_patch, dm, {0,0,0}, EBSupport::basic); - MF mf_crse_patch(ba_crse_patch, dm, ncomp, 0, MFInfo(), *factory); -#else - MF mf_crse_patch(ba_crse_patch, dm, ncomp, 0); + if (EB2::TopIndexSpaceIfPresent()) { + auto factory = makeEBFabFactory(cgeom, ba_crse_patch, dm, {0,0,0}, EBSupport::basic); + mf_crse_patch.define(ba_crse_patch, dm, ncomp, 0, MFInfo(), *factory); + } else #endif + { + mf_crse_patch.define(ba_crse_patch, dm, ncomp, 0); + } mf_set_domain_bndry (mf_crse_patch, cgeom); mf_crse_patch.copy(cmf, scomp, 0, ncomp, cgeom.periodicity()); diff --git a/Src/AmrCore/AMReX_FluxRegister.cpp b/Src/AmrCore/AMReX_FluxRegister.cpp index af6053878dd..5156db8b1dd 100644 --- a/Src/AmrCore/AMReX_FluxRegister.cpp +++ b/Src/AmrCore/AMReX_FluxRegister.cpp @@ -455,7 +455,7 @@ FluxRegister::FineSetVal (int dir, int destcomp, int numcomp, Real val, - RunOn runon) noexcept + RunOn /*runon*/) noexcept { Gpu::LaunchSafeGuard lsg(false); // xxxxx gpu todo diff --git a/Src/AmrCore/AMReX_Interp_1D_C.H b/Src/AmrCore/AMReX_Interp_1D_C.H index 21420e79715..5ba215ace6d 100644 --- a/Src/AmrCore/AMReX_Interp_1D_C.H +++ b/Src/AmrCore/AMReX_Interp_1D_C.H @@ -33,8 +33,8 @@ ccinterp_compute_voff (Box const& cbx, IntVect const& ratio, Geometry const& cge const int ic = amrex::coarsen(i, ratio[0]); const int ii = i - flo.x; const int iic = ic - clo.x; - const Real fcen = 0.5_rt*(fvc[ii ]+fvc[ii +1]); - const Real ccen = 0.5_rt*(cvc[iic]+cvc[iic+1]); + const Real fcen = Real(0.5)*(fvc[ii ]+fvc[ii +1]); + const Real ccen = Real(0.5)*(cvc[iic]+cvc[iic+1]); xoff[ii] = (fcen-ccen)/(cvc[iic+1]-cvc[iic]); } @@ -51,17 +51,17 @@ compute_slopes (const Dim3& lo, const Dim3& hi, { 
AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,0,0,ns) = 0.5_rt*(u(i+1,0,0,nu)-u(i-1,0,0,nu)); + slopes(i,0,0,ns) = Real(0.5)*(u(i+1,0,0,nu)-u(i-1,0,0,nu)); } if (lo.x == slo.x && (bc.lo(0) == BCType::ext_dir || bc.lo(0) == BCType::hoextrap)) { const int i = slo.x; if (shi.x-slo.x >= 1) { - slopes(i,0,0,ns) = -(16._rt/15._rt)*u(i-1,0,0,nu) + 0.5_rt*u(i,0,0,nu) - + (2._rt/3._rt)*u(i+1,0,0,nu) - 0.1_rt*u(i+2,0,0,nu); + slopes(i,0,0,ns) = -Real(16./15.)*u(i-1,0,0,nu) + Real(0.5)*u(i,0,0,nu) + + Real(2./3.)*u(i+1,0,0,nu) - Real(0.1)*u(i+2,0,0,nu); } else { - slopes(i,0,0,ns) = 0.25_rt*(u(i+1,0,0,nu)+5._rt*u(i,0,0,nu)-6._rt*u(i-1,0,0,nu)); + slopes(i,0,0,ns) = Real(0.25)*(u(i+1,0,0,nu)+Real(5.)*u(i,0,0,nu)-Real(6.)*u(i-1,0,0,nu)); } } @@ -69,10 +69,10 @@ compute_slopes (const Dim3& lo, const Dim3& hi, { const int i = shi.x; if (shi.x-slo.x >= 1) { - slopes(i,0,0,ns) = (16._rt/15._rt)*u(i+1,0,0,nu) - 0.5_rt*u(i,0,0,nu) - - (2._rt/3._rt)*u(i-1,0,0,nu) + 0.1_rt*u(i-2,0,0,nu); + slopes(i,0,0,ns) = Real(16./15.)*u(i+1,0,0,nu) - Real(0.5)*u(i,0,0,nu) + - Real(2./3.)*u(i-1,0,0,nu) + Real(0.1)*u(i-2,0,0,nu); } else { - slopes(i,0,0,ns) = -0.25_rt*(u(i-1,0,0,nu)+5._rt*u(i,0,0,nu)-6._rt*u(i+1,0,0,nu)); + slopes(i,0,0,ns) = -Real(0.25)*(u(i-1,0,0,nu)+Real(5.)*u(i,0,0,nu)-Real(6.)*u(i+1,0,0,nu)); } } } @@ -94,7 +94,7 @@ cellconslin_slopes_linlim (Box const& bx, Array4 const& slopes, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - sf(i,0,0) = 1.0_rt; + sf(i,0,0) = Real(1.); } for (int n = 0; n < ncomp; ++n) @@ -105,14 +105,14 @@ cellconslin_slopes_linlim (Box const& bx, Array4 const& slopes, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { Real cen = slopes(i,0,0,n); - Real forw = 2.0_rt*(u(i+1,0,0,nu)-u(i ,0,0,nu)); - Real back = 2.0_rt*(u(i ,0,0,nu)-u(i-1,0,0,nu)); - Real slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,0,0,n) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); - if (cen != 0.0_rt) { + Real forw = Real(2.)*(u(i+1,0,0,nu)-u(i ,0,0,nu)); + Real back = Real(2.)*(u(i ,0,0,nu)-u(i-1,0,0,nu)); + Real slp = (forw*back >= Real(0.)) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,0,0,n) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); + if (cen != Real(0.)) { sf(i,0,0) = amrex::min(sf(i,0,0), slopes(i,0,0,n)/cen); } else { - sf(i,0,0) = 0.0_rt; + sf(i,0,0) = Real(0.); } } } @@ -181,10 +181,10 @@ cellconslin_slopes_mclim (Box const& bx, Array4 const& slopes, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { Real cen = slopes(i,0,0,n); - Real forw = 2.0_rt*(u(i+1,0,0,nu)-u(i ,0,0,nu)); - Real back = 2.0_rt*(u(i ,0,0,nu)-u(i-1,0,0,nu)); - Real slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,0,0,n) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); + Real forw = Real(2.)*(u(i+1,0,0,nu)-u(i ,0,0,nu)); + Real back = Real(2.)*(u(i ,0,0,nu)-u(i-1,0,0,nu)); + Real slp = (forw*back >= Real(0.)) ? 
amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,0,0,n) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); } } } @@ -208,12 +208,12 @@ cellconslin_fine_alpha (Box const& bx, Array4 const& alpha, const int ic = amrex::coarsen(i,ratio[0]); const Real dummy_fine = xoff[i-vlo.x]*slopes(ic,0,0,n); - if (dummy_fine > mm(ic,0,0,n+ncomp) && dummy_fine != 0.0_rt) { + if (dummy_fine > mm(ic,0,0,n+ncomp) && dummy_fine != Real(0.)) { alpha(i,0,0,n) = mm(ic,0,0,n+ncomp) / dummy_fine; - } else if (dummy_fine < mm(ic,0,0,n) && dummy_fine != 0.0_rt) { + } else if (dummy_fine < mm(ic,0,0,n) && dummy_fine != Real(0.)) { alpha(i,0,0,n) = mm(ic,0,0,n) / dummy_fine; } else { - alpha(i,0,0,n) = 1.0_rt; + alpha(i,0,0,n) = Real(1.); } } } @@ -230,7 +230,7 @@ cellconslin_slopes_mmlim (Box const& bx, Array4 const& slopes, for (int n = 0; n < ncomp; ++n) { for (int i = lo.x; i <= hi.x; ++i) { const int ii = i*ratio[0]; - Real a = 1.0_rt; + Real a = Real(1.); for (int ioff = 0; ioff < ratio[0]; ++ioff) { a = amrex::min(a, alpha(ii+ioff,0,0,n)); } @@ -265,7 +265,7 @@ nodebilin_slopes (Box const& bx, Array4 const& slope, Array4 const& const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); - const Real rx = 1.0_rt/ratio[0]; + const Real rx = Real(1.)/ratio[0]; for (int n = 0; n < ncomp; ++n) { AMREX_PRAGMA_SIMD @@ -296,6 +296,16 @@ nodebilin_interp (Box const& bx, Array4 const& fine, const int fcomp, const i } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_linear_interp_x (int i, int /*j*/, int /*k*/, int n, Array4 const& fine, + Array4 const& crse, IntVect const& ratio) noexcept +{ + int ii = amrex::coarsen(i,ratio[0]); + Real const w = static_cast(i-ii*ratio[0]) * (Real(1.)/ratio[0]); + fine(i,0,0,n) = (Real(1.)-w) * crse(ii,0,0,n) + w * crse(ii+1,0,0,n); +} + } #endif diff --git a/Src/AmrCore/AMReX_Interp_2D_C.H b/Src/AmrCore/AMReX_Interp_2D_C.H index 7d8396aa01e..57eabbb749a 100644 --- a/Src/AmrCore/AMReX_Interp_2D_C.H +++ b/Src/AmrCore/AMReX_Interp_2D_C.H @@ -33,8 +33,8 @@ ccinterp_compute_voff (Box const& cbx, IntVect const& ratio, Geometry const& cge const int ic = amrex::coarsen(i, ratio[0]); const int ii = i - flo.x; const int iic = ic - clo.x; - const Real fcen = 0.5_rt*(fvc[ii ]+fvc[ii +1]); - const Real ccen = 0.5_rt*(cvc[iic]+cvc[iic+1]); + const Real fcen = Real(0.5)*(fvc[ii ]+fvc[ii +1]); + const Real ccen = Real(0.5)*(cvc[iic]+cvc[iic+1]); xoff[ii] = (fcen-ccen)/(cvc[iic+1]-cvc[iic]); } @@ -47,8 +47,8 @@ ccinterp_compute_voff (Box const& cbx, IntVect const& ratio, Geometry const& cge const int jc = amrex::coarsen(j, ratio[1]); const int jj = j - flo.y; const int jjc = jc - clo.y; - const Real fcen = 0.5_rt*(fvc[jj ]+fvc[jj +1]); - const Real ccen = 0.5_rt*(cvc[jjc]+cvc[jjc+1]); + const Real fcen = Real(0.5)*(fvc[jj ]+fvc[jj +1]); + const Real ccen = Real(0.5)*(cvc[jjc]+cvc[jjc+1]); yoff[jj] = (fcen-ccen)/(cvc[jjc+1]-cvc[jjc]); } @@ -66,8 +66,8 @@ compute_slopes (const Dim3& lo, const Dim3& hi, for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,0,ns ) = 0.5_rt*(u(i+1,j,0,nu)-u(i-1,j,0,nu)); - slopes(i,j,0,ns+ncomp) = 0.5_rt*(u(i,j+1,0,nu)-u(i,j-1,0,nu)); + slopes(i,j,0,ns ) = Real(0.5)*(u(i+1,j,0,nu)-u(i-1,j,0,nu)); + slopes(i,j,0,ns+ncomp) = Real(0.5)*(u(i,j+1,0,nu)-u(i,j-1,0,nu)); } } @@ -76,12 +76,12 @@ compute_slopes (const Dim3& lo, const Dim3& hi, const int i = slo.x; if (shi.x-slo.x >= 1) { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,0,ns) = 
-(16._rt/15._rt)*u(i-1,j,0,nu) + 0.5_rt*u(i,j,0,nu) - + (2._rt/3._rt)*u(i+1,j,0,nu) - 0.1_rt*u(i+2,j,0,nu); + slopes(i,j,0,ns) = -Real(16./15.)*u(i-1,j,0,nu) + Real(0.5)*u(i,j,0,nu) + + Real(2./3.)*u(i+1,j,0,nu) - Real(0.1)*u(i+2,j,0,nu); } } else { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,0,ns) = 0.25_rt*(u(i+1,j,0,nu)+5._rt*u(i,j,0,nu)-6._rt*u(i-1,j,0,nu)); + slopes(i,j,0,ns) = Real(0.25)*(u(i+1,j,0,nu)+Real(5.)*u(i,j,0,nu)-Real(6.)*u(i-1,j,0,nu)); } } } @@ -91,12 +91,12 @@ compute_slopes (const Dim3& lo, const Dim3& hi, const int i = shi.x; if (shi.x-slo.x >= 1) { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,0,ns) = (16._rt/15._rt)*u(i+1,j,0,nu) - 0.5_rt*u(i,j,0,nu) - - (2._rt/3._rt)*u(i-1,j,0,nu) + 0.1_rt*u(i-2,j,0,nu); + slopes(i,j,0,ns) = Real(16./15.)*u(i+1,j,0,nu) - Real(0.5)*u(i,j,0,nu) + - Real(2./3.)*u(i-1,j,0,nu) + Real(0.1)*u(i-2,j,0,nu); } } else { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,0,ns) = -0.25_rt*(u(i-1,j,0,nu)+5._rt*u(i,j,0,nu)-6._rt*u(i+1,j,0,nu)); + slopes(i,j,0,ns) = -Real(0.25)*(u(i-1,j,0,nu)+Real(5.)*u(i,j,0,nu)-Real(6.)*u(i+1,j,0,nu)); } } } @@ -107,13 +107,13 @@ compute_slopes (const Dim3& lo, const Dim3& hi, if (shi.y-slo.y >= 1) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,0,ns+ncomp) = -(16._rt/15._rt)*u(i,j-1,0,nu) + 0.5_rt*u(i,j,0,nu) - + (2._rt/3._rt)*u(i,j+1,0,nu) - 0.1_rt*u(i,j+2,0,nu); + slopes(i,j,0,ns+ncomp) = -Real(16./15.)*u(i,j-1,0,nu) + Real(0.5)*u(i,j,0,nu) + + Real(2./3.)*u(i,j+1,0,nu) - Real(0.1)*u(i,j+2,0,nu); } } else { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,0,ns+ncomp) = 0.25_rt*(u(i,j+1,0,nu)+5._rt*u(i,j,0,nu)-6._rt*u(i,j-1,0,nu)); + slopes(i,j,0,ns+ncomp) = Real(0.25)*(u(i,j+1,0,nu)+Real(5.)*u(i,j,0,nu)-Real(6.)*u(i,j-1,0,nu)); } } } @@ -124,13 +124,13 @@ compute_slopes (const Dim3& lo, const Dim3& hi, if (shi.y-slo.y >= 1) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,0,ns+ncomp) = (16._rt/15._rt)*u(i,j+1,0,nu) - 0.5_rt*u(i,j,0,nu) - - (2._rt/3._rt)*u(i,j-1,0,nu) + 0.1_rt*u(i,j-2,0,nu); + slopes(i,j,0,ns+ncomp) = Real(16./15.)*u(i,j+1,0,nu) - Real(0.5)*u(i,j,0,nu) + - Real(2./3.)*u(i,j-1,0,nu) + Real(0.1)*u(i,j-2,0,nu); } } else { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,0,ns+ncomp) = -0.25_rt*(u(i,j-1,0,nu)+5._rt*u(i,j,0,nu)-6._rt*u(i,j+1,0,nu)); + slopes(i,j,0,ns+ncomp) = -Real(0.25)*(u(i,j-1,0,nu)+Real(5.)*u(i,j,0,nu)-Real(6.)*u(i,j+1,0,nu)); } } } @@ -154,8 +154,8 @@ cellconslin_slopes_linlim (Box const& bx, Array4 const& slopes, for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - sf(i,j,0,0) = 1.0_rt; - sf(i,j,0,1) = 1.0_rt; + sf(i,j,0,0) = Real(1.); + sf(i,j,0,1) = Real(1.); } } @@ -168,25 +168,25 @@ cellconslin_slopes_linlim (Box const& bx, Array4 const& slopes, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { Real cen = slopes(i,j,0,n); - Real forw = 2.0_rt*(u(i+1,j,0,nu)-u(i ,j,0,nu)); - Real back = 2.0_rt*(u(i ,j,0,nu)-u(i-1,j,0,nu)); - Real slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,0,n) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); - if (cen != 0.0_rt) { + Real forw = Real(2.)*(u(i+1,j,0,nu)-u(i ,j,0,nu)); + Real back = Real(2.)*(u(i ,j,0,nu)-u(i-1,j,0,nu)); + Real slp = (forw*back >= Real(0.)) ? 
amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,0,n) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); + if (cen != Real(0.)) { sf(i,j,0,0) = amrex::min(sf(i,j,0,0), slopes(i,j,0,n)/cen); } else { - sf(i,j,0,0) = 0.0_rt; + sf(i,j,0,0) = Real(0.); } cen = slopes(i,j,0,n+ncomp); - forw = 2.0_rt*(u(i,j+1,0,nu)-u(i,j ,0,nu)); - back = 2.0_rt*(u(i,j ,0,nu)-u(i,j-1,0,nu)); - slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,0,n+ncomp) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); - if (cen != 0.0_rt) { + forw = Real(2.)*(u(i,j+1,0,nu)-u(i,j ,0,nu)); + back = Real(2.)*(u(i,j ,0,nu)-u(i,j-1,0,nu)); + slp = (forw*back >= Real(0.)) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,0,n+ncomp) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); + if (cen != Real(0.)) { sf(i,j,0,1) = amrex::min(sf(i,j,0,1), slopes(i,j,0,n+ncomp)/cen); } else { - sf(i,j,0,1) = 0.0_rt; + sf(i,j,0,1) = Real(0.); } } } @@ -270,16 +270,16 @@ cellconslin_slopes_mclim (Box const& bx, Array4 const& slopes, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { Real cen = slopes(i,j,0,n); - Real forw = 2.0_rt*(u(i+1,j,0,nu)-u(i ,j,0,nu)); - Real back = 2.0_rt*(u(i ,j,0,nu)-u(i-1,j,0,nu)); - Real slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,0,n) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); + Real forw = Real(2.)*(u(i+1,j,0,nu)-u(i ,j,0,nu)); + Real back = Real(2.)*(u(i ,j,0,nu)-u(i-1,j,0,nu)); + Real slp = (forw*back >= Real(0.)) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,0,n) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); cen = slopes(i,j,0,n+ncomp); - forw = 2.0_rt*(u(i,j+1,0,nu)-u(i,j ,0,nu)); - back = 2.0_rt*(u(i,j ,0,nu)-u(i,j-1,0,nu)); - slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,0,n+ncomp) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); + forw = Real(2.)*(u(i,j+1,0,nu)-u(i,j ,0,nu)); + back = Real(2.)*(u(i,j ,0,nu)-u(i,j-1,0,nu)); + slp = (forw*back >= Real(0.)) ? 
amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,0,n+ncomp) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); } } } @@ -309,12 +309,12 @@ cellconslin_fine_alpha (Box const& bx, Array4 const& alpha, const Real dummy_fine = xoff[i-vlo.x]*slopes(ic,jc,0,n) + yoff[j-vlo.y]*slopes(ic,jc,0,n+ncomp); - if (dummy_fine > mm(ic,jc,0,n+ncomp) && dummy_fine != 0.0_rt) { + if (dummy_fine > mm(ic,jc,0,n+ncomp) && dummy_fine != Real(0.)) { alpha(i,j,0,n) = mm(ic,jc,0,n+ncomp) / dummy_fine; - } else if (dummy_fine < mm(ic,jc,0,n) && dummy_fine != 0.0_rt) { + } else if (dummy_fine < mm(ic,jc,0,n) && dummy_fine != Real(0.)) { alpha(i,j,0,n) = mm(ic,jc,0,n) / dummy_fine; } else { - alpha(i,j,0,n) = 1.0_rt; + alpha(i,j,0,n) = Real(1.); } } } @@ -334,7 +334,7 @@ cellconslin_slopes_mmlim (Box const& bx, Array4 const& slopes, const int jj = j*ratio[1]; for (int i = lo.x; i <= hi.x; ++i) { const int ii = i*ratio[0]; - Real a = 1.0_rt; + Real a = Real(1.); for (int joff = 0; joff < ratio[1]; ++joff) { for (int ioff = 0; ioff < ratio[0]; ++ioff) { a = amrex::min(a, alpha(ii+ioff,jj+joff,0,n)); @@ -382,8 +382,8 @@ nodebilin_slopes (Box const& bx, Array4 const& slope, Array4 const& const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); - const Real rx = 1.0_rt/ratio[0]; - const Real ry = 1.0_rt/ratio[1]; + const Real rx = Real(1.)/ratio[0]; + const Real ry = Real(1.)/ratio[1]; for (int n = 0; n < ncomp; ++n) { for (int j = lo.y; j <= hi.y; ++j) { @@ -428,6 +428,28 @@ nodebilin_interp (Box const& bx, Array4 const& fine, const int fcomp, const i } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_linear_interp_x (int i, int j, int /*k*/, int n, Array4 const& fine, + Array4 const& crse, IntVect const& ratio) noexcept +{ + int ii = amrex::coarsen(i,ratio[0]); + int jj = amrex::coarsen(j,ratio[1]); + Real const w = static_cast(i-ii*ratio[0]) * (Real(1.)/ratio[0]); + fine(i,j,0,n) = (Real(1.)-w) * crse(ii,jj,0,n) + w * crse(ii+1,jj,0,n); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_linear_interp_y (int i, int j, int /*k*/, int n, Array4 const& fine, + Array4 const& crse, IntVect const& ratio) noexcept +{ + int ii = amrex::coarsen(i,ratio[0]); + int jj = amrex::coarsen(j,ratio[1]); + Real const w = static_cast(j-jj*ratio[1]) * (Real(1.)/ratio[1]); + fine(i,j,0,n) = (Real(1.)-w) * crse(ii,jj,0,n) + w * crse(ii,jj+1,0,n); +} + } #endif diff --git a/Src/AmrCore/AMReX_Interp_3D_C.H b/Src/AmrCore/AMReX_Interp_3D_C.H index c3450f2bd0d..091515b0a0e 100644 --- a/Src/AmrCore/AMReX_Interp_3D_C.H +++ b/Src/AmrCore/AMReX_Interp_3D_C.H @@ -11,8 +11,8 @@ namespace amrex { AMREX_GPU_HOST inline Vector -ccinterp_compute_voff (Box const& cbx, IntVect const& ratio, Geometry const& cgeom, - Geometry const& fgeom) noexcept +ccinterp_compute_voff (Box const& cbx, IntVect const& ratio, Geometry const& /*cgeom*/, + Geometry const& /*fgeom*/) noexcept { const Box& fbx = amrex::refine(cbx,ratio); const auto& flen = amrex::length(fbx); @@ -21,13 +21,13 @@ ccinterp_compute_voff (Box const& cbx, IntVect const& ratio, Geometry const& cge const int nxyz = flen.x + flen.y + flen.z; Vector voff(nxyz); - const Real xrinv = 1._rt/ratio[0]; - const Real yrinv = 1._rt/ratio[1]; - const Real zrinv = 1._rt/ratio[2]; + const Real xrinv = Real(1.)/ratio[0]; + const Real yrinv = Real(1.)/ratio[1]; + const Real zrinv = Real(1.)/ratio[2]; - const Real xtmp = 0.5_rt*(xrinv-1.0_rt); - const Real ytmp = 0.5_rt*(yrinv-1.0_rt); - const Real ztmp = 
0.5_rt*(zrinv-1.0_rt); + const Real xtmp = Real(0.5)*(xrinv-Real(1.)); + const Real ytmp = Real(0.5)*(yrinv-Real(1.)); + const Real ztmp = Real(0.5)*(zrinv-Real(1.)); Real* AMREX_RESTRICT xoff = &voff[0]; AMREX_PRAGMA_SIMD @@ -68,9 +68,9 @@ compute_slopes (const Dim3& lo, const Dim3& hi, for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns ) = 0.5_rt*(u(i+1,j,k,nu)-u(i-1,j,k,nu)); - slopes(i,j,k,ns+ncomp ) = 0.5_rt*(u(i,j+1,k,nu)-u(i,j-1,k,nu)); - slopes(i,j,k,ns+ncomp*2) = 0.5_rt*(u(i,j,k+1,nu)-u(i,j,k-1,nu)); + slopes(i,j,k,ns ) = Real(0.5)*(u(i+1,j,k,nu)-u(i-1,j,k,nu)); + slopes(i,j,k,ns+ncomp ) = Real(0.5)*(u(i,j+1,k,nu)-u(i,j-1,k,nu)); + slopes(i,j,k,ns+ncomp*2) = Real(0.5)*(u(i,j,k+1,nu)-u(i,j,k-1,nu)); } } } @@ -81,14 +81,14 @@ compute_slopes (const Dim3& lo, const Dim3& hi, if (shi.x-slo.x >= 1) { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,k,ns) = -(16._rt/15._rt)*u(i-1,j,k,nu) + 0.5_rt*u(i,j,k,nu) - + (2._rt/3._rt)*u(i+1,j,k,nu) - 0.1_rt*u(i+2,j,k,nu); + slopes(i,j,k,ns) = -Real(16./15.)*u(i-1,j,k,nu) + Real(0.5)*u(i,j,k,nu) + + Real(2./3.)*u(i+1,j,k,nu) - Real(0.1)*u(i+2,j,k,nu); } } } else { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,k,ns) = 0.25_rt*(u(i+1,j,k,nu)+5._rt*u(i,j,k,nu)-6._rt*u(i-1,j,k,nu)); + slopes(i,j,k,ns) = Real(0.25)*(u(i+1,j,k,nu)+Real(5.)*u(i,j,k,nu)-Real(6.)*u(i-1,j,k,nu)); } } } @@ -100,14 +100,14 @@ compute_slopes (const Dim3& lo, const Dim3& hi, if (shi.x-slo.x >= 1) { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,k,ns) = (16._rt/15._rt)*u(i+1,j,k,nu) - 0.5_rt*u(i,j,k,nu) - - (2._rt/3._rt)*u(i-1,j,k,nu) + 0.1_rt*u(i-2,j,k,nu); + slopes(i,j,k,ns) = Real(16./15.)*u(i+1,j,k,nu) - Real(0.5)*u(i,j,k,nu) + - Real(2./3.)*u(i-1,j,k,nu) + Real(0.1)*u(i-2,j,k,nu); } } } else { for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - slopes(i,j,k,ns) = -0.25_rt*(u(i-1,j,k,nu)+5._rt*u(i,j,k,nu)-6._rt*u(i+1,j,k,nu)); + slopes(i,j,k,ns) = -Real(0.25)*(u(i-1,j,k,nu)+Real(5.)*u(i,j,k,nu)-Real(6.)*u(i+1,j,k,nu)); } } } @@ -120,15 +120,15 @@ compute_slopes (const Dim3& lo, const Dim3& hi, for (int k = lo.z; k <= hi.z; ++k) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+ncomp) = -(16._rt/15._rt)*u(i,j-1,k,nu) + 0.5_rt*u(i,j,k,nu) - + (2._rt/3._rt)*u(i,j+1,k,nu) - 0.1_rt*u(i,j+2,k,nu); + slopes(i,j,k,ns+ncomp) = -Real(16./15.)*u(i,j-1,k,nu) + Real(0.5)*u(i,j,k,nu) + + Real(2./3.)*u(i,j+1,k,nu) - Real(0.1)*u(i,j+2,k,nu); } } } else { for (int k = lo.z; k <= hi.z; ++k) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+ncomp) = 0.25_rt*(u(i,j+1,k,nu)+5._rt*u(i,j,k,nu)-6._rt*u(i,j-1,k,nu)); + slopes(i,j,k,ns+ncomp) = Real(0.25)*(u(i,j+1,k,nu)+Real(5.)*u(i,j,k,nu)-Real(6.)*u(i,j-1,k,nu)); } } } @@ -141,15 +141,15 @@ compute_slopes (const Dim3& lo, const Dim3& hi, for (int k = lo.z; k <= hi.z; ++k) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+ncomp) = (16._rt/15._rt)*u(i,j+1,k,nu) - 0.5_rt*u(i,j,k,nu) - - (2._rt/3._rt)*u(i,j-1,k,nu) + 0.1_rt*u(i,j-2,k,nu); + slopes(i,j,k,ns+ncomp) = Real(16./15.)*u(i,j+1,k,nu) - Real(0.5)*u(i,j,k,nu) + - Real(2./3.)*u(i,j-1,k,nu) + Real(0.1)*u(i,j-2,k,nu); } } } else { for (int k = lo.z; k <= hi.z; ++k) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+ncomp) = -0.25_rt*(u(i,j-1,k,nu)+5._rt*u(i,j,k,nu)-6._rt*u(i,j+1,k,nu)); + 
slopes(i,j,k,ns+ncomp) = -Real(0.25)*(u(i,j-1,k,nu)+Real(5.)*u(i,j,k,nu)-Real(6.)*u(i,j+1,k,nu)); } } } @@ -162,15 +162,15 @@ compute_slopes (const Dim3& lo, const Dim3& hi, for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+2*ncomp) = -(16._rt/15._rt)*u(i,j,k-1,nu) + 0.5_rt*u(i,j,k,nu) - + (2._rt/3._rt)*u(i,j,k+1,nu) - 0.1_rt*u(i,j,k+2,nu); + slopes(i,j,k,ns+2*ncomp) = -Real(16./15.)*u(i,j,k-1,nu) + Real(0.5)*u(i,j,k,nu) + + Real(2./3.)*u(i,j,k+1,nu) - Real(0.1)*u(i,j,k+2,nu); } } } else { for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+2*ncomp) = 0.25_rt*(u(i,j,k+1,nu)+5._rt*u(i,j,k,nu)-6._rt*u(i,j,k-1,nu)); + slopes(i,j,k,ns+2*ncomp) = Real(0.25)*(u(i,j,k+1,nu)+Real(5.)*u(i,j,k,nu)-Real(6.)*u(i,j,k-1,nu)); } } } @@ -183,15 +183,15 @@ compute_slopes (const Dim3& lo, const Dim3& hi, for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+2*ncomp) = (16._rt/15._rt)*u(i,j,k+1,nu) - 0.5_rt*u(i,j,k,nu) - - (2._rt/3._rt)*u(i,j,k-1,nu) + 0.1_rt*u(i,j,k-2,nu); + slopes(i,j,k,ns+2*ncomp) = Real(16./15.)*u(i,j,k+1,nu) - Real(0.5)*u(i,j,k,nu) + - Real(2./3.)*u(i,j,k-1,nu) + Real(0.1)*u(i,j,k-2,nu); } } } else { for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - slopes(i,j,k,ns+2*ncomp) = -0.25_rt*(u(i,j,k-1,nu)+5._rt*u(i,j,k,nu)-6._rt*u(i,j,k+1,nu)); + slopes(i,j,k,ns+2*ncomp) = -Real(0.25)*(u(i,j,k-1,nu)+Real(5.)*u(i,j,k,nu)-Real(6.)*u(i,j,k+1,nu)); } } } @@ -217,9 +217,9 @@ cellconslin_slopes_linlim (Box const& bx, Array4 const& slopes, for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - sf(i,j,k,0) = 1.0_rt; - sf(i,j,k,1) = 1.0_rt; - sf(i,j,k,2) = 1.0_rt; + sf(i,j,k,0) = Real(1.); + sf(i,j,k,1) = Real(1.); + sf(i,j,k,2) = Real(1.); } } } @@ -234,36 +234,36 @@ cellconslin_slopes_linlim (Box const& bx, Array4 const& slopes, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { Real cen = slopes(i,j,k,n); - Real forw = 2.0_rt*(u(i+1,j,k,nu)-u(i ,j,k,nu)); - Real back = 2.0_rt*(u(i ,j,k,nu)-u(i-1,j,k,nu)); - Real slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,k,n) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); - if (cen != 0.0_rt) { + Real forw = Real(2.)*(u(i+1,j,k,nu)-u(i ,j,k,nu)); + Real back = Real(2.)*(u(i ,j,k,nu)-u(i-1,j,k,nu)); + Real slp = (forw*back >= Real(0.)) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,k,n) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); + if (cen != Real(0.)) { sf(i,j,k,0) = amrex::min(sf(i,j,k,0), slopes(i,j,k,n)/cen); } else { - sf(i,j,k,0) = 0.0_rt; + sf(i,j,k,0) = Real(0.); } cen = slopes(i,j,k,n+ncomp); - forw = 2.0_rt*(u(i,j+1,k,nu)-u(i,j ,k,nu)); - back = 2.0_rt*(u(i,j ,k,nu)-u(i,j-1,k,nu)); - slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,k,n+ncomp) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); - if (cen != 0.0_rt) { + forw = Real(2.)*(u(i,j+1,k,nu)-u(i,j ,k,nu)); + back = Real(2.)*(u(i,j ,k,nu)-u(i,j-1,k,nu)); + slp = (forw*back >= Real(0.)) ? 
amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,k,n+ncomp) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); + if (cen != Real(0.)) { sf(i,j,k,1) = amrex::min(sf(i,j,k,1), slopes(i,j,k,n+ncomp)/cen); } else { - sf(i,j,k,1) = 0.0_rt; + sf(i,j,k,1) = Real(0.); } cen = slopes(i,j,k,n+ncomp*2); - forw = 2.0_rt*(u(i,j,k+1,nu)-u(i,j,k ,nu)); - back = 2.0_rt*(u(i,j,k ,nu)-u(i,j,k-1,nu)); - slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,k,n+ncomp*2) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); - if (cen != 0.0_rt) { + forw = Real(2.)*(u(i,j,k+1,nu)-u(i,j,k ,nu)); + back = Real(2.)*(u(i,j,k ,nu)-u(i,j,k-1,nu)); + slp = (forw*back >= Real(0.)) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,k,n+ncomp*2) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); + if (cen != Real(0.)) { sf(i,j,k,2) = amrex::min(sf(i,j,k,2), slopes(i,j,k,n+ncomp*2)/cen); } else { - sf(i,j,k,2) = 0.0_rt; + sf(i,j,k,2) = Real(0.); } } } @@ -361,22 +361,22 @@ cellconslin_slopes_mclim (Box const& bx, Array4 const& slopes, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { Real cen = slopes(i,j,k,n); - Real forw = 2.0_rt*(u(i+1,j,k,nu)-u(i ,j,k,nu)); - Real back = 2.0_rt*(u(i ,j,k,nu)-u(i-1,j,k,nu)); - Real slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,k,n) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); + Real forw = Real(2.)*(u(i+1,j,k,nu)-u(i ,j,k,nu)); + Real back = Real(2.)*(u(i ,j,k,nu)-u(i-1,j,k,nu)); + Real slp = (forw*back >= Real(0.)) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,k,n) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); cen = slopes(i,j,k,n+ncomp); - forw = 2.0_rt*(u(i,j+1,k,nu)-u(i,j ,k,nu)); - back = 2.0_rt*(u(i,j ,k,nu)-u(i,j-1,k,nu)); - slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,k,n+ncomp) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); + forw = Real(2.)*(u(i,j+1,k,nu)-u(i,j ,k,nu)); + back = Real(2.)*(u(i,j ,k,nu)-u(i,j-1,k,nu)); + slp = (forw*back >= Real(0.)) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,k,n+ncomp) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); cen = slopes(i,j,k,n+ncomp*2); - forw = 2.0_rt*(u(i,j,k+1,nu)-u(i,j,k ,nu)); - back = 2.0_rt*(u(i,j,k ,nu)-u(i,j,k-1,nu)); - slp = (forw*back >= 0.0_rt) ? amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : 0.0_rt; - slopes(i,j,k,n+ncomp*2) = amrex::Math::copysign(1.0_rt,cen)*amrex::min(slp,amrex::Math::abs(cen)); + forw = Real(2.)*(u(i,j,k+1,nu)-u(i,j,k ,nu)); + back = Real(2.)*(u(i,j,k ,nu)-u(i,j,k-1,nu)); + slp = (forw*back >= Real(0.)) ? 
amrex::min(amrex::Math::abs(forw),amrex::Math::abs(back)) : Real(0.); + slopes(i,j,k,n+ncomp*2) = amrex::Math::copysign(Real(1.),cen)*amrex::min(slp,amrex::Math::abs(cen)); } } } @@ -411,12 +411,12 @@ cellconslin_fine_alpha (Box const& bx, Array4 const& alpha, + yoff[j-vlo.y]*slopes(ic,jc,kc,n+ncomp) + zoff[k-vlo.z]*slopes(ic,jc,kc,n+ncomp*2); - if (dummy_fine > mm(ic,jc,kc,n+ncomp) && dummy_fine != 0.0_rt) { + if (dummy_fine > mm(ic,jc,kc,n+ncomp) && dummy_fine != Real(0.)) { alpha(i,j,k,n) = mm(ic,jc,kc,n+ncomp) / dummy_fine; - } else if (dummy_fine < mm(ic,jc,kc,n) && dummy_fine != 0.0_rt) { + } else if (dummy_fine < mm(ic,jc,kc,n) && dummy_fine != Real(0.)) { alpha(i,j,k,n) = mm(ic,jc,kc,n) / dummy_fine; } else { - alpha(i,j,k,n) = 1.0_rt; + alpha(i,j,k,n) = Real(1.); } } } @@ -439,7 +439,7 @@ cellconslin_slopes_mmlim (Box const& bx, Array4 const& slopes, const int jj = j*ratio[1]; for (int i = lo.x; i <= hi.x; ++i) { const int ii = i*ratio[0]; - Real a = 1.0_rt; + Real a = Real(1.); for (int koff = 0; koff < ratio[2]; ++koff) { for (int joff = 0; joff < ratio[1]; ++joff) { for (int ioff = 0; ioff < ratio[0]; ++ioff) { @@ -498,9 +498,9 @@ nodebilin_slopes (Box const& bx, Array4 const& slope, Array4 const& const auto lo = amrex::lbound(bx); const auto hi = amrex::ubound(bx); - const Real rx = 1.0_rt/ratio[0]; - const Real ry = 1.0_rt/ratio[1]; - const Real rz = 1.0_rt/ratio[2]; + const Real rx = Real(1.)/ratio[0]; + const Real ry = Real(1.)/ratio[1]; + const Real rz = Real(1.)/ratio[2]; for (int n = 0; n < ncomp; ++n) { const int nu = n + icomp; @@ -566,6 +566,42 @@ nodebilin_interp (Box const& bx, Array4 const& fine, const int fcomp, const i } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_linear_interp_x (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse, IntVect const& ratio) noexcept +{ + int ii = amrex::coarsen(i,ratio[0]); + int jj = amrex::coarsen(j,ratio[1]); + int kk = amrex::coarsen(k,ratio[2]); + Real const w = static_cast(i-ii*ratio[0]) * (Real(1.)/ratio[0]); + fine(i,j,k,n) = (Real(1.)-w) * crse(ii,jj,kk,n) + w * crse(ii+1,jj,kk,n); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_linear_interp_y (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse, IntVect const& ratio) noexcept +{ + int ii = amrex::coarsen(i,ratio[0]); + int jj = amrex::coarsen(j,ratio[1]); + int kk = amrex::coarsen(k,ratio[2]); + Real const w = static_cast(j-jj*ratio[1]) * (Real(1.)/ratio[1]); + fine(i,j,k,n) = (Real(1.)-w) * crse(ii,jj,kk,n) + w * crse(ii,jj+1,kk,n); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void +face_linear_interp_z (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse, IntVect const& ratio) noexcept +{ + int ii = amrex::coarsen(i,ratio[0]); + int jj = amrex::coarsen(j,ratio[1]); + int kk = amrex::coarsen(k,ratio[2]); + Real const w = static_cast(k-kk*ratio[2]) * (Real(1.)/ratio[2]); + fine(i,j,k,n) = (Real(1.)-w) * crse(ii,jj,kk,n) + w * crse(ii,jj,kk+1,n); +} + } #endif diff --git a/Src/AmrCore/AMReX_Interpolater.H b/Src/AmrCore/AMReX_Interpolater.H index 3896df24d5e..24cfeb0d6bd 100644 --- a/Src/AmrCore/AMReX_Interpolater.H +++ b/Src/AmrCore/AMReX_Interpolater.H @@ -98,19 +98,19 @@ public: * \param fine_geom * \param bcr */ - virtual void protect (const FArrayBox& crse, - int crse_comp, - FArrayBox& fine, - int fine_comp, - FArrayBox& fine_state, - int state_comp, - int ncomp, - const Box& fine_region, - const IntVect& ratio, - const Geometry& crse_geom, - const Geometry& 
fine_geom, - Vector& bcr, - RunOn gpu_or_cpu) {}; + virtual void protect (const FArrayBox& /*crse*/, + int /*crse_comp*/, + FArrayBox& /*fine*/, + int /*fine_comp*/, + FArrayBox& /*fine_state*/, + int /*state_comp*/, + int /*ncomp*/, + const Box& /*fine_region*/, + const IntVect& /*ratio*/, + const Geometry& /*crse_geom*/, + const Geometry& /*fine_geom*/, + Vector& /*bcr*/, + RunOn /*gpu_or_cpu*/) {} virtual InterpolaterBoxCoarsener BoxCoarsener (const IntVect& ratio); @@ -646,10 +646,76 @@ public: }; #endif +/** +* \brief Linear interpolation on face data. +* +* The interpolation is linear in the face-normal direction and piecewise constant +* in the transverse directions (see face_linear_interp_x/y/z). +*/ + +class FaceLinear + : + public Interpolater +{ +public: + + /** + * \brief The destructor. + */ + virtual ~FaceLinear () override; + + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + virtual Box CoarseBox (const Box& fine, + int ratio) override; + + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + virtual Box CoarseBox (const Box& fine, + const IntVect& ratio) override; + + /** + * \brief Coarse to fine interpolation in space. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param crse_geom + * \param fine_geom + * \param bcr + * \param actual_comp + * \param actual_state + */ + virtual void interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const& bcr, + int actual_comp, + int actual_state, + RunOn gpu_or_cpu) override; +}; //! CONSTRUCT A GLOBAL OBJECT OF EACH VERSION. extern PCInterp pc_interp; extern NodeBilinear node_bilinear_interp; +extern FaceLinear face_linear_interp; extern CellConservativeLinear lincc_interp; extern CellConservativeLinear cell_cons_interp; diff --git a/Src/AmrCore/AMReX_Interpolater.cpp b/Src/AmrCore/AMReX_Interpolater.cpp index 4da717bdc0a..570fdf0e9c6 100644 --- a/Src/AmrCore/AMReX_Interpolater.cpp +++ b/Src/AmrCore/AMReX_Interpolater.cpp @@ -13,7 +13,8 @@ namespace amrex { // -// PCInterp, NodeBilinear, and CellConservativeLinear are supported for all dimensions on cpu and gpu. +// PCInterp, NodeBilinear, FaceLinear, and CellConservativeLinear are supported for all dimensions +// on cpu and gpu. // // CellConservativeProtected only works in 2D and 3D on cpu. // @@ -29,6 +30,7 @@ namespace amrex { // PCInterp pc_interp; NodeBilinear node_bilinear_interp; +FaceLinear face_linear_interp; CellConservativeLinear lincc_interp; CellConservativeLinear cell_cons_interp(0); @@ -141,6 +143,76 @@ NodeBilinear::interp (const FArrayBox& crse, }); } +Box +FaceLinear::CoarseBox (const Box& fine, int ratio) +{ + return CoarseBox(fine, IntVect(ratio)); +} + +Box +FaceLinear::CoarseBox (const Box& fine, const IntVect& ratio) +{ + Box b = amrex::coarsen(fine,ratio); + for (int i = 0; i < AMREX_SPACEDIM; i++) { + if (b.type(i) == IndexType::NODE && b.length(i) < 2) { + // Don't want degenerate boxes in nodal direction.
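+ // (For example, with ratio 2 a fine x-face box with nodes {0,1} coarsens to the single node {0}; growing the high side restores a second node, so the interpolation can read both crse(ii,...) and crse(ii+1,...).)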
+ b.growHi(i,1); + } + } + return b; +} + +void +FaceLinear::interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& /*crse_geom */, + const Geometry& /*fine_geom */, + Vector const& /*bcr*/, + int /*actual_comp*/, + int /*actual_state*/, + RunOn runon) +{ + BL_PROFILE("FaceLinear::interp()"); + + AMREX_ASSERT(AMREX_D_TERM(fine_region.type(0),+fine_region.type(1),+fine_region.type(2)) == 1); + + Array4 const& fine_arr = fine.array(fine_comp); + Array4 const& crse_arr = crse.const_array(crse_comp); + + if (fine_region.type(0) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_x(i,j,k,n,fine_arr,crse_arr,ratio); + }); + } +#if (AMREX_SPACEDIM >= 2) + else if (fine_region.type(1) == IndexType::NODE) + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_y(i,j,k,n,fine_arr,crse_arr,ratio); + }); + } +#if (AMREX_SPACEDIM == 3) + else + { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon,fine_region,ncomp,i,j,k,n, + { + face_linear_interp_z(i,j,k,n,fine_arr,crse_arr,ratio); + }); + } +#endif +#endif +} + +FaceLinear::~FaceLinear () {} + #ifndef BL_NO_FORT CellBilinear::~CellBilinear () {} @@ -187,7 +259,7 @@ CellBilinear::interp (const FArrayBox& crse, Vector const& /*bcr*/, int actual_comp, int actual_state, - RunOn runon) + RunOn /*runon*/) { BL_PROFILE("CellBilinear::interp()"); // @@ -399,8 +471,13 @@ CellQuadratic::interp (const FArrayBox& crse, Vector const& bcr, int actual_comp, int actual_state, - RunOn runon) + RunOn /*runon*/) { +#if (AMREX_SPACEDIM == 1) + amrex::ignore_unused(crse,crse_comp,fine,fine_comp,ncomp,fine_region, + ratio,crse_geom,fine_geom,bcr,actual_comp,actual_state); + amrex::Abort("1D CellQuadratic::interp not supported"); +#else BL_PROFILE("CellQuadratic::interp()"); BL_ASSERT(bcr.size() >= ncomp); // @@ -471,8 +548,6 @@ CellQuadratic::interp (const FArrayBox& crse, Vector bc = GetBCArray(bcr); const int* ratioV = ratio.getVect(); -#if (AMREX_SPACEDIM > 1) - amrex_cqinterp (fdat,AMREX_ARLIM(flo),AMREX_ARLIM(fhi), AMREX_ARLIM(fblo), AMREX_ARLIM(fbhi), &ncomp,AMREX_D_DECL(&ratioV[0],&ratioV[1],&ratioV[2]), @@ -566,8 +641,8 @@ CellConservativeProtected::interp (const FArrayBox& crse, const Geometry& crse_geom, const Geometry& fine_geom, Vector const& bcr, - int actual_comp, - int actual_state, + int /*actual_comp*/, + int /*actual_state*/, RunOn runon) { BL_PROFILE("CellConservativeProtected::interp()"); @@ -630,8 +705,14 @@ CellConservativeProtected::protect (const FArrayBox& crse, const Geometry& crse_geom, const Geometry& fine_geom, Vector& bcr, - RunOn runon) + RunOn /*runon*/) { +#if (AMREX_SPACEDIM == 1) + amrex::ignore_unused(crse,crse_comp,fine,fine_comp,fine_state, + state_comp,ncomp,fine_region,ratio, + crse_geom,fine_geom,bcr); + amrex::Abort("1D CellConservativeProtected::protect not supported"); +#else BL_PROFILE("CellConservativeProtected::protect()"); BL_ASSERT(bcr.size() >= ncomp); @@ -695,8 +776,6 @@ CellConservativeProtected::protect (const FArrayBox& crse, Vector bc = GetBCArray(bcr); const int* ratioV = ratio.getVect(); -#if (AMREX_SPACEDIM > 1) - amrex_protect_interp (fdat,AMREX_ARLIM(flo),AMREX_ARLIM(fhi), fblo, fbhi, cdat,AMREX_ARLIM(clo),AMREX_ARLIM(chi), @@ -711,7 +790,7 @@ CellConservativeProtected::protect (const FArrayBox& crse, &ncomp,AMREX_D_DECL(&ratioV[0],&ratioV[1],&ratioV[2]), bc.dataPtr()); -#endif 
/*(AMREX_SPACEDIM > 1)*/ +#endif /*(AMREX_SPACEDIM == 1)*/ } #endif @@ -750,7 +829,7 @@ CellConservativeQuartic::interp (const FArrayBox& crse, Vector const& bcr, int actual_comp, int actual_state, - RunOn runon) + RunOn /*runon*/) { BL_PROFILE("CellConservativeQuartic::interp()"); BL_ASSERT(bcr.size() >= ncomp); diff --git a/Src/AmrCore/AMReX_TagBox.H b/Src/AmrCore/AMReX_TagBox.H index c92bd18fe43..433da05b983 100644 --- a/Src/AmrCore/AMReX_TagBox.H +++ b/Src/AmrCore/AMReX_TagBox.H @@ -57,56 +57,24 @@ public: /** * \brief Construct and return a new tagbox in which the coarsened cell * is tagged if any of the corresponding fine cells are tagged. - * - * \param ratio - * \param owner */ - void coarsen (const IntVect& ratio) noexcept; + void coarsen (const IntVect& ratio, const Box& cbox) noexcept; /** * \brief Mark neighbors of every tagged cell a distance nbuff away - * only search interior for initial tagged points where nwid - * is given as the width of the bndry region. * * \param nbuff - * \param nwid - */ - void buffer (const IntVect& nbuf, const IntVect& nwid) noexcept; - - /** - * \brief Tag cells on intersect with src if corresponding src cell is tagged. - * - * \param src - */ - void merge (const TagBox& src) noexcept; - - /** - * \brief Add location of every tagged cell to IntVect array, - * starting at given location. Returns the number of - * collated points. - * - * \param ar - * \param start */ - Long collate (Vector& ar, int start) const noexcept; - - /** - * \brief Returns number of tagged cells in specified Box. - * - * \param bx - */ - Long numTags (const Box& bx) const noexcept; - - /** - * \brief Returns total number of tagged cells in the TagBox. - */ - Long numTags () const noexcept; + void buffer (const IntVect& nbuf) noexcept; /** * \brief Returns Vector\ of size domain.numPts() suitable for calling * Fortran, with positions set to same value as in the TagBox * dataPtr().
*/ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif Vector tags () const noexcept; /** @@ -117,6 +85,9 @@ public: * \param ar * \param tilebx */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void get_itags(Vector& ar, const Box& tilebx) const noexcept; /** @@ -125,6 +96,9 @@ public: * * \param ar */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags (const Vector& ar) noexcept; /** @@ -133,6 +107,9 @@ public: * * \param ar */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags_and_untags (const Vector& ar) noexcept; /** @@ -142,6 +119,9 @@ public: * \param ar * \param tilebx */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags (const Vector& ar, const Box& tilebx) noexcept; /** @@ -151,6 +131,9 @@ public: * \param ar * \param tilebx */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags_and_untags (const Vector& ar, const Box& tilebx) noexcept; }; @@ -183,16 +166,10 @@ public: ~TagBoxArray () override = default; TagBoxArray (TagBoxArray&& rhs) noexcept = default; + TagBoxArray& operator= (TagBoxArray&& rhs) noexcept = default; TagBoxArray (const TagBoxArray& rhs) = delete; TagBoxArray& operator= (const TagBoxArray& rhs) = delete; - TagBoxArray& operator= (TagBoxArray&& rhs) = delete; - - - /** - * \brief Returns the grow factor for the TagBoxArray. - */ - IntVect borderSize () const noexcept; /** * \brief Calls buffer() on all contained TagBoxes. @@ -202,12 +179,12 @@ public: void buffer (const IntVect& nbuf); /** - * \brief Map tagged cells through a periodic boundary to other grids in - * TagBoxArray cells which were outside domain are set to TagBox::CLEAR. + * \brief This function does two things: it maps tagged cells through periodic boundaries to + * other grids in the TagBoxArray, and it removes duplicate tags. * * \param geom */ - void mapPeriodic (const Geometry& geom); + void mapPeriodicRemoveDuplicates (const Geometry& geom); /** * \brief Set values in bl to val. @@ -241,17 +218,20 @@ public: */ void coarsen (const IntVect& ratio); - /** - * \brief The total number of Tags in all the contained TagBoxes. - */ - Long numTags () const; - /** * \brief Calls collate() on all contained TagBoxes. * * \param TheGlobalCollateSpace */ void collate (Vector& TheGlobalCollateSpace) const; + + // \brief Are there tags in the region defined by bx?
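+ // This query is collective over MPI: the result is Or-reduced across ranks (see ParallelAllReduce::Or in the implementation), so every rank gets the same answer.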
+ bool hasTags (Box const& bx) const; + + void local_collate_cpu (Vector& v) const; +#ifdef AMREX_USE_GPU + void local_collate_gpu (Vector& v) const; +#endif }; } diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 7136d1b849b..48b71791108 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include namespace amrex { @@ -19,257 +21,87 @@ TagBox::TagBox (Arena* ar) noexcept TagBox::TagBox (const Box& bx, int n, Arena* ar) : BaseFab(bx,n,ar) -{ - setVal(TagBox::CLEAR); -} +{} TagBox::TagBox (const Box& bx, int n, bool alloc, bool shared, Arena* ar) : BaseFab(bx,n,alloc,shared,ar) -{ - if (alloc) setVal(TagBox::CLEAR); -} +{} TagBox::TagBox (const TagBox& rhs, MakeType make_type, int scomp, int ncomp) : BaseFab(rhs,make_type,scomp,ncomp) {} void -TagBox::coarsen (const IntVect& ratio) noexcept +TagBox::coarsen (const IntVect& ratio, const Box& cbox) noexcept { BL_ASSERT(nComp() == 1); + Array4 const& farr = this->const_array(); - TagType* fdat = dataPtr(); - IntVect lov = domain.smallEnd(); - IntVect hiv = domain.bigEnd(); - IntVect d_length = domain.size(); - const int* flo = lov.getVect(); - const int* fhi = hiv.getVect(); - const int* flen = d_length.getVect(); + TagBox cfab(cbox, 1, The_Arena()); + Elixir eli = cfab.elixir(); + Array4 const& carr = cfab.array(); - const Box& cbox = amrex::coarsen(domain,ratio); + Box fdomain = domain; + Dim3 r{1,1,1}; + AMREX_D_TERM(r.x = ratio[0];, r.y = ratio[1];, r.z = ratio[2]); - this->nvar = 1; - this->domain = cbox; - - const int* clo = cbox.loVect(); - IntVect cbox_len = cbox.size(); - const int* clen = cbox_len.getVect(); - - Box b1(amrex::refine(cbox,ratio)); - const int* lo = b1.loVect(); - int longlen = b1.longside(); - - Long numpts = domain.numPts(); - Vector cfab(numpts); - TagType* cdat = cfab.dataPtr(); - - Vector t(longlen,TagBox::CLEAR); - - int klo = 0, khi = 0, jlo = 0, jhi = 0, ilo, ihi; - AMREX_D_TERM(ilo=flo[0]; ihi=fhi[0]; , - jlo=flo[1]; jhi=fhi[1]; , - klo=flo[2]; khi=fhi[2];) - -#define IXPROJ(i,r) (((i)+(r)*std::abs(i))/(r) - std::abs(i)) -#define IOFF(j,k,lo,len) AMREX_D_TERM(0, +(j-lo[1])*len[0], +(k-lo[2])*len[0]*len[1]) - - int ratiox = 1, ratioy = 1, ratioz = 1; - AMREX_D_TERM(ratiox = ratio[0];, - ratioy = ratio[1];, - ratioz = ratio[2];) - - for (int k = klo; k <= khi; k++) - { - const int kc = IXPROJ(k,ratioz); - amrex::ignore_unused(kc); - for (int j = jlo; j <= jhi; j++) - { - const int jc = IXPROJ(j,ratioy); - TagType* c = cdat + IOFF(jc,kc,clo,clen); - const TagType* f = fdat + IOFF(j,k,flo,flen); - // - // Copy fine grid row of values into tmp array. - // - for (int i = ilo; i <= ihi; i++) - t[i-lo[0]] = f[i-ilo]; - - for (int off = 0; off < ratiox; off++) - { - for (int ic = 0; ic < clen[0]; ic++) - { - const int i = ic*ratiox + off; - c[ic] = std::max(c[ic],t[i]); - } - } - } - } - -#undef IXPROJ -#undef IOFF - - for (int i = 0; i < numpts; ++i) { - fdat[i] = cdat[i]; - } -} - -void -TagBox::buffer (const IntVect& nbuff, const IntVect& nwid) noexcept -{ - // - // Note: this routine assumes cell with TagBox::SET tag are in - // interior of tagbox (region = grow(domain,-nwid)). 
- // - Box inside(domain); - inside.grow(-nwid); - const int* inlo = inside.loVect(); - const int* inhi = inside.hiVect(); - - int klo = 0, khi = 0, jlo = 0, jhi = 0, ilo, ihi; - AMREX_D_TERM(ilo=inlo[0]; ihi=inhi[0]; , - jlo=inlo[1]; jhi=inhi[1]; , - klo=inlo[2]; khi=inhi[2];) - - int ni = 0, nj = 0, nk = 0; - AMREX_D_TERM(ni=nbuff[0];, nj=nbuff[1];, nk=nbuff[2];) - - IntVect d_length = domain.size(); - const int* len = d_length.getVect(); - const int* lo = domain.loVect(); - TagType* d = dataPtr(); - -#define OFF(i,j,k,lo,len) AMREX_D_TERM(i-lo[0], +(j-lo[1])*len[0] , +(k-lo[2])*len[0]*len[1]) - - for (int k = klo; k <= khi; k++) + AMREX_HOST_DEVICE_FOR_3D(cbox, i, j, k, { - for (int j = jlo; j <= jhi; j++) - { - for (int i = ilo; i <= ihi; i++) - { - TagType* d_check = d + OFF(i,j,k,lo,len); - if (*d_check == TagBox::SET) - { - for (int kk = -nk; kk <= nk; kk++) - { - for (int jj = -nj; jj <= nj; jj++) - { - for (int ii = -ni; ii <= ni; ii++) - { - TagType* dn = d_check+ AMREX_D_TERM(ii, +jj*len[0], +kk*len[0]*len[1]); - if (*dn !=TagBox::SET) - *dn = TagBox::BUF; - } - } + TagType t = TagBox::CLEAR; + for (int koff = 0; koff < r.z; ++koff) { + int kk = k*r.z + koff; + for (int joff = 0; joff < r.y; ++joff) { + int jj = j*r.y + joff; + for (int ioff = 0; ioff < r.x; ++ioff) { + int ii = i*r.x + ioff; + if (fdomain.contains(IntVect(AMREX_D_DECL(ii,jj,kk)))) { + t = t || farr(ii,jj,kk); } } } } - } -#undef OFF -} + carr(i,j,k) = t; + }); -void -TagBox::merge (const TagBox& src) noexcept -{ - // - // Compute intersections. - // - const Box& bx = domain & src.domain; - - if (bx.ok()) - { - const int* dlo = domain.loVect(); - IntVect d_length = domain.size(); - const int* dleng = d_length.getVect(); - const int* slo = src.domain.loVect(); - IntVect src_length = src.domain.size(); - const int* sleng = src_length.getVect(); - const int* lo = bx.loVect(); - const int* hi = bx.hiVect(); - const TagType* ds0 = src.dataPtr(); - TagType* dd0 = dataPtr(); - - int klo = 0, khi = 0, jlo = 0, jhi = 0, ilo, ihi; - AMREX_D_TERM(ilo=lo[0]; ihi=hi[0]; , - jlo=lo[1]; jhi=hi[1]; , - klo=lo[2]; khi=hi[2];) - -#define OFF(i,j,k,lo,len) AMREX_D_TERM(i-lo[0], +(j-lo[1])*len[0] , +(k-lo[2])*len[0]*len[1]) - - for (int k = klo; k <= khi; k++) - { - for (int j = jlo; j <= jhi; j++) - { - for (int i = ilo; i <= ihi; i++) - { - const TagType* ds = ds0 + OFF(i,j,k,slo,sleng); - if (*ds != TagBox::CLEAR) - { - TagType* dd = dd0 + OFF(i,j,k,dlo,dleng); - *dd = TagBox::SET; - } - } - } - } - } -#undef OFF -} - -Long -TagBox::numTags () const noexcept -{ - Long nt = 0L; - Long len = domain.numPts(); - const TagType* d = dataPtr(); - for (Long n = 0; n < len; ++n) +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + Gpu::dtod_memcpy_async(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts()); + } else +#endif { - if (d[n] != TagBox::CLEAR) - ++nt; + std::memcpy(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts()); } - return nt; -} - -Long -TagBox::numTags (const Box& b) const noexcept -{ - TagBox tempTagBox(b,1); - tempTagBox.copy(*this); - return tempTagBox.numTags(); + this->domain = cbox; } -Long -TagBox::collate (Vector& ar, int start) const noexcept +void +TagBox::buffer (const IntVect& a_nbuff) noexcept { - BL_ASSERT(start >= 0); - // - // Starting at given offset of array ar, enter location (IntVect) of - // each tagged cell in tagbox. 
- // - Long count = 0; - IntVect d_length = domain.size(); - const int* len = d_length.getVect(); - const int* lo = domain.loVect(); - const TagType* d = dataPtr(); - int ni = 1, nj = 1, nk = 1; - AMREX_D_TERM(ni = len[0]; , nj = len[1]; , nk = len[2];) - - for (int k = 0; k < nk; k++) + Array4 const& a = this->array(); + Dim3 nbuf = a_nbuff.dim3(); + const auto lo = amrex::lbound(domain); + const auto hi = amrex::ubound(domain); + AMREX_HOST_DEVICE_FOR_3D(domain, i, j, k, { - for (int j = 0; j < nj; j++) - { - for (int i = 0; i < ni; i++) - { - const TagType* dn = d + AMREX_D_TERM(i, +j*len[0], +k*len[0]*len[1]); - if (*dn != TagBox::CLEAR) - { - ar[start++] = IntVect(AMREX_D_DECL(lo[0]+i,lo[1]+j,lo[2]+k)); - count++; - } - } + if (a(i,j,k) == TagBox::CLEAR) { + bool to_buf = false; + int imin = amrex::max(i-nbuf.x, lo.x); + int jmin = amrex::max(j-nbuf.y, lo.y); + int kmin = amrex::max(k-nbuf.z, lo.z); + int imax = amrex::min(i+nbuf.x, hi.x); + int jmax = amrex::min(j+nbuf.y, hi.y); + int kmax = amrex::min(k+nbuf.z, hi.z); + for (int kk = kmin; kk <= kmax && !to_buf; ++kk) { + for (int jj = jmin; jj <= jmax && !to_buf; ++jj) { + for (int ii = imin; ii <= imax && !to_buf; ++ii) { + if (a(ii,jj,kk) == TagBox::SET) to_buf = true; + }}} + if (to_buf) a(i,j,k) = TagBox::BUF; } - } - return count; + }); } +// DEPRECATED Vector TagBox::tags () const noexcept { @@ -287,7 +119,7 @@ TagBox::tags () const noexcept return ar; } - +// DEPRECATED // Set values as specified by the array -- this only tags. // It's an error if ar.length() != domain.numPts(). void @@ -305,6 +137,7 @@ TagBox::tags (const Vector& ar) noexcept } } +// DEPRECATED // Set values as specified by the array -- this tags and untags. // It's an error if ar.length() != domain.numPts(). void @@ -322,6 +155,7 @@ TagBox::tags_and_untags (const Vector& ar) noexcept } } +// DEPRECATED // Since a TagBox is a BaseFab, we can use this utility // function to allocate an integer array to have the same number // of elements as cells in tilebx @@ -364,6 +198,7 @@ TagBox::get_itags(Vector& ar, const Box& tilebx) const noexcept } } +// DEPRECATED // Set values as specified by the array -- this only tags. // only changes values in the tilebx region void @@ -397,6 +232,7 @@ TagBox::tags (const Vector& ar, const Box& tilebx) noexcept } } +// DEPRECATED // Set values as specified by the array -- this tags and untags. 
// only changes values in the tilebx region void @@ -436,7 +272,7 @@ TagBoxArray::TagBoxArray (const BoxArray& ba, : FabArray(ba,dm,1,_ngrow,MFInfo(),DefaultFabFactory()) { - if (SharedMemory()) setVal(TagBox::CLEAR); + setVal(TagBox::CLEAR); } TagBoxArray::TagBoxArray (const BoxArray& ba, @@ -445,145 +281,351 @@ : FabArray(ba,dm,1,_ngrow,MFInfo(),DefaultFabFactory()) { - if (SharedMemory()) setVal(TagBox::CLEAR); -} - -IntVect -TagBoxArray::borderSize () const noexcept -{ - return n_grow; + setVal(TagBox::CLEAR); } void TagBoxArray::buffer (const IntVect& nbuf) { - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu - AMREX_ASSERT(nbuf.allLE(n_grow)); if (nbuf.max() > 0) { #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for (MFIter mfi(*this); mfi.isValid(); ++mfi) - get(mfi).buffer(nbuf, n_grow); + for (MFIter mfi(*this); mfi.isValid(); ++mfi) { + get(mfi).buffer(nbuf); + } } } void -TagBoxArray::mapPeriodic (const Geometry& geom) +TagBoxArray::mapPeriodicRemoveDuplicates (const Geometry& geom) { - if (!geom.isAnyPeriodic()) return; - - BL_PROFILE("TagBoxArray::mapPeriodic()"); + BL_PROFILE("TagBoxArray::mapPRD"); - // This function is called after coarsening. - // So we can assume that n_grow is 0. - BL_ASSERT(n_grow[0] == 0); + if (Gpu::inLaunchRegion()) + { + // There is no atomicAdd for char, so we have to use int. + iMultiFab itag = amrex::cast(*this); + iMultiFab tmp(boxArray(),DistributionMap(),1,nGrowVect()); + tmp.setVal(0); + tmp.ParallelAdd(itag, 0, 0, 1, nGrowVect(), nGrowVect(), geom.periodicity()); + + // We need to keep tags in periodic boundary + const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect()); +#ifdef _OPENMP +#pragma omp parallel +#endif + for (MFIter mfi(tmp); mfi.isValid(); ++mfi) { + Box const& box = mfi.fabbox(); + Array4 const& tag = this->array(mfi); + Array4 const& tmptag = tmp.const_array(mfi); + Array4 const& msk = owner_mask->const_array(mfi); + amrex::ParallelFor(box, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (msk(i,j,k)) { + tag(i,j,k) = static_cast(tmptag(i,j,k)); + } else { + tag(i,j,k) = TagBox::CLEAR; + } + }); + } + } + else + { + TagBoxArray tmp(boxArray(),DistributionMap(),nGrowVect()); // note that tmp is filled w/ CLEAR.
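+ // ParallelAdd accumulates tags from every overlapping grid, including copies shifted through periodic boundaries; the OwnerMask pass below then keeps one owner per cell and clears the rest, which removes the duplicates.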
+ // We need to keep tags in periodic boundary + const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect()); +#ifdef _OPENMP +#pragma omp parallel +#endif + for (MFIter mfi(tmp); mfi.isValid(); ++mfi) { + Box const& box = mfi.fabbox(); + Array4 const& tag = tmp.array(mfi); + Array4 const& msk = owner_mask->const_array(mfi); + AMREX_LOOP_3D(box, i, j, k, + { + if (!msk(i,j,k)) tag(i,j,k) = TagBox::CLEAR; + }); + } - tmp.copy(*this, geom.periodicity(), FabArrayBase::ADD); + std::swap(*this, tmp); + } +} - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu +void +TagBoxArray::local_collate_cpu (Vector& v) const +{ + if (this->local_size() == 0) return; + Vector count(this->local_size()); #ifdef _OPENMP #pragma omp parallel #endif - for (MFIter mfi(*this); mfi.isValid(); ++mfi) + for (MFIter fai(*this); fai.isValid(); ++fai) { - get(mfi).merge(tmp[mfi]); + Array4 const& arr = this->const_array(fai); + Box const& bx = fai.fabbox(); + int c = 0; + AMREX_LOOP_3D(bx,i,j,k, + { + if (arr(i,j,k) != TagBox::CLEAR) ++c; + }); + count[fai.LocalIndex()] = c; } -} -Long -TagBoxArray::numTags () const -{ - Long ntag = 0; + Vector offset(count.size()+1); + offset[0] = 0; + std::partial_sum(count.begin(), count.end(), offset.begin()+1); - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu + v.resize(offset.back()); + + if (v.empty()) return; #ifdef _OPENMP -#pragma omp parallel reduction(+:ntag) +#pragma omp parallel #endif - for (MFIter mfi(*this); mfi.isValid(); ++mfi) + for (MFIter fai(*this); fai.isValid(); ++fai) { - ntag += get(mfi).numTags(); + int li = fai.LocalIndex(); + if (count[li] > 0) { + IntVect* p = v.data() + offset[li]; + Array4 const& arr = this->const_array(fai); + Box const& bx = fai.fabbox(); + AMREX_LOOP_3D(bx,i,j,k, + { + if (arr(i,j,k) != TagBox::CLEAR) { + *p++ = IntVect(AMREX_D_DECL(i,j,k)); + } + }); + } } - - ParallelDescriptor::ReduceLongSum(ntag); - - return ntag; } +#ifdef AMREX_USE_GPU void -TagBoxArray::collate (Vector& TheGlobalCollateSpace) const +TagBoxArray::local_collate_gpu (Vector& v) const { - BL_PROFILE("TagBoxArray::collate()"); + const int nfabs = this->local_size(); + if (nfabs == 0) return; - // Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu + constexpr int block_size = 128; + Vector nblocks(nfabs); + for (MFIter fai(*this); fai.isValid(); ++fai) + { + Box const& bx = fai.fabbox(); + nblocks[fai.LocalIndex()] = (bx.numPts() + block_size-1) / block_size; + } + Vector blockoffset(nblocks.size()+1); + blockoffset[0] = 0; + std::partial_sum(nblocks.begin(), nblocks.end(), blockoffset.begin()+1); + int ntotblocks = blockoffset.back(); - Long count = 0; + PODVector > dv_ntags(ntotblocks); -#ifdef _OPENMP -#pragma omp parallel reduction(+:count) -#endif for (MFIter fai(*this); fai.isValid(); ++fai) { - count += get(fai).numTags(); + const int li = fai.LocalIndex(); + int* ntags = dv_ntags.data() + blockoffset[li]; + const int ncells = fai.fabbox().numPts(); + const char* tags = (*this)[fai].dataPtr(); +#ifdef AMREX_USE_DPCPP + amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size, + Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept + { + int bid = h.item.get_group_linear_id(); + int tid = h.item.get_local_id(0); + int icell = h.item.get_global_id(0); + + int t = 0; + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + t = 1; + } + + t = Gpu::blockReduce + (t, Gpu::warpReduce >(), 0, h); + if (tid == 0) { + ntags[bid] = t; + } + }); +#else + amrex::launch(nblocks[li], block_size, 
Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int icell = blockDim.x*blockIdx.x+threadIdx.x; + + int t = 0; + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + t = 1; + } + + t = Gpu::blockReduce + (t, Gpu::warpReduce >(), 0); + if (tid == 0) { + ntags[bid] = t; + } + }); +#endif } - // - // Local space for holding just those tags we want to gather to the root cpu. - // - Vector TheLocalCollateSpace(count); + PODVector > hv_ntags(ntotblocks); + Gpu::dtoh_memcpy(hv_ntags.data(), dv_ntags.data(), ntotblocks*sizeof(int)); - count = 0; + PODVector > hv_tags_offset(ntotblocks+1); + hv_tags_offset[0] = 0; + std::partial_sum(hv_ntags.begin(), hv_ntags.end(), hv_tags_offset.begin()+1); + int ntotaltags = hv_tags_offset.back(); - // unsafe to do OMP + if (ntotaltags == 0) return; + + PODVector > dv_tags_offset(ntotblocks); + int* dp_tags_offset = dv_tags_offset.data(); + Gpu::htod_memcpy(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int)); +#ifdef AMREX_USE_DPCPP + Gpu::synchronize(); +#endif + + PODVector > dv_tags(ntotaltags); + IntVect* dp_tags = dv_tags.data(); + + int iblock = 0; for (MFIter fai(*this); fai.isValid(); ++fai) { - count += get(fai).collate(TheLocalCollateSpace,count); + const int li = fai.LocalIndex(); + int iblock_begin = iblock; + int iblock_end = iblock + nblocks[li]; + iblock = iblock_end; + int count = 0; + for (int ib = iblock_begin; ib < iblock_end; ++ib) { + count += hv_ntags[ib]; + } + if (count > 0) { + Box const& bx = fai.fabbox(); + const auto lo = amrex::lbound(bx); + const auto len = amrex::length(bx); + const int ncells = bx.numPts(); + const char* tags = (*this)[fai].dataPtr(); +#ifdef AMREX_USE_DPCPP + amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept + { + int bid = h.item.get_group(0); + int tid = h.item.get_local_id(0); + int icell = h.item.get_global_id(0); + + unsigned int* shared_counter = (unsigned int*)h.local; + if (tid == 0) { + *shared_counter = 0; + } + h.item.barrier(sycl::access::fence_space::local_space); + + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + unsigned int itag = Gpu::Atomic::Inc + (shared_counter, 20480u); + IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + p[itag] = IntVect(AMREX_D_DECL(i,j,k)); + } + }); +#else + amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int icell = blockDim.x*blockIdx.x+threadIdx.x; + + Gpu::SharedMemory gsm; + unsigned int * shared_counter = gsm.dataPtr(); + if (tid == 0) { + *shared_counter = 0; + } + __syncthreads(); + + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + unsigned int itag = Gpu::Atomic::Inc(shared_counter, blockDim.x); + IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + p[itag] = IntVect(AMREX_D_DECL(i,j,k)); + } + }); +#endif + } } - if (count > 0) + v.resize(ntotaltags); + Gpu::dtoh_memcpy(v.data(), dp_tags, ntotaltags*sizeof(IntVect)); +} +#endif + +void +TagBoxArray::collate (Vector& TheGlobalCollateSpace) const +{ + 
BL_PROFILE("TagBoxArray::collate()"); + + Vector TheLocalCollateSpace; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + local_collate_gpu(TheLocalCollateSpace); + } else +#endif { - amrex::RemoveDuplicates(TheLocalCollateSpace); - count = TheLocalCollateSpace.size(); + local_collate_cpu(TheLocalCollateSpace); } + + Long count = TheLocalCollateSpace.size(); + // // The total number of tags system wide that must be collated. - // This is really just an estimate of the upper bound due to duplicates. - // While we've removed duplicates per MPI process there's still more systemwide. // Long numtags = count; - ParallelDescriptor::ReduceLongSum(numtags); if (numtags == 0) { TheGlobalCollateSpace.clear(); return; + } else if (numtags > static_cast(std::numeric_limits::max())) { + // xxxxx todo + amrex::Abort("TagBoxArray::collate: Too many tags. Using a larger blocking factor might help. Please file an issue on github"); } +#ifdef BL_USE_MPI // - // This holds all tags after they've been gather'd and unique'ified. + // On I/O proc. this holds all tags after they've been gather'd. + // On other procs. non-mempty signals size is not zero. // - // Each CPU needs an identical copy since they all must go through grid_places() which isn't parallelized. - - TheGlobalCollateSpace.resize(numtags); + if (ParallelDescriptor::IOProcessor()) { + TheGlobalCollateSpace.resize(numtags); + } else { + TheGlobalCollateSpace.resize(1); + } -#ifdef BL_USE_MPI // // Tell root CPU how many tags each CPU will be sending. // const int IOProcNumber = ParallelDescriptor::IOProcessorNumber(); - count *= AMREX_SPACEDIM; // Convert from count of tags to count of integers to expect. - const std::vector& countvec = ParallelDescriptor::Gather(count, IOProcNumber); - - std::vector offset(countvec.size(),0L); - if (ParallelDescriptor::IOProcessor()) - { + const std::vector& countvec = ParallelDescriptor::Gather(static_cast(count), + IOProcNumber); + std::vector offset(countvec.size(),0); + if (ParallelDescriptor::IOProcessor()) { for (int i = 1, N = offset.size(); i < N; i++) { offset[i] = offset[i-1] + countvec[i-1]; } @@ -591,93 +633,136 @@ TagBoxArray::collate (Vector& TheGlobalCollateSpace) const // // Gather all the tags to IOProcNumber into TheGlobalCollateSpace. // - BL_ASSERT(sizeof(IntVect) == AMREX_SPACEDIM * sizeof(int)); - const int* psend = (count > 0) ? TheLocalCollateSpace[0].getVect() : 0; - int* precv = TheGlobalCollateSpace[0].getVect(); - ParallelDescriptor::Gatherv(psend, count, - precv, countvec, offset, IOProcNumber); - - if (ParallelDescriptor::IOProcessor()) - { - amrex::RemoveDuplicates(TheGlobalCollateSpace); - numtags = TheGlobalCollateSpace.size(); - } - - // - // Now broadcast them back to the other processors. - // - ParallelDescriptor::Bcast(&numtags, 1, IOProcNumber); - ParallelDescriptor::Bcast(TheGlobalCollateSpace[0].getVect(), numtags*AMREX_SPACEDIM, IOProcNumber); - TheGlobalCollateSpace.resize(numtags); + const IntVect* psend = (count > 0) ? TheLocalCollateSpace.data() : nullptr; + IntVect* precv = TheGlobalCollateSpace.data(); + ParallelDescriptor::Gatherv(psend, count, precv, countvec, offset, IOProcNumber); #else - // - // Copy TheLocalCollateSpace to TheGlobalCollateSpace. 
 
 void
-TagBoxArray::setVal (const BoxList& bl,
-                     TagBox::TagVal  val)
+TagBoxArray::setVal (const BoxList& bl, TagBox::TagVal val)
 {
     BoxArray ba(bl);
     setVal(ba,val);
 }
 
 void
-TagBoxArray::setVal (const BoxDomain& bd,
-                     TagBox::TagVal   val)
+TagBoxArray::setVal (const BoxDomain& bd, TagBox::TagVal val)
 {
     setVal(bd.boxList(),val);
 }
 
 void
-TagBoxArray::setVal (const BoxArray& ba,
-                     TagBox::TagVal  val)
+TagBoxArray::setVal (const BoxArray& ba, TagBox::TagVal val)
 {
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
-
+    Vector<Array4BoxTag<char> > tags;
+    bool run_on_gpu = Gpu::inLaunchRegion();
#ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (!run_on_gpu)
 #endif
-    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
     {
         std::vector< std::pair<int,Box> > isects;
-
-        ba.intersections(mfi.fabbox(),isects);
-
-        TagBox& tags = get(mfi);
-
-        for (int i = 0, N = isects.size(); i < N; i++)
+        for (MFIter mfi(*this); mfi.isValid(); ++mfi)
         {
-            tags.setVal(val,isects[i].second,0);
+            TagBox& fab = (*this)[mfi];
+            Array4<char> const& arr = this->array(mfi);
+            ba.intersections(mfi.fabbox(), isects);
+            for (const auto& is : isects) {
+                Box const& b = is.second;
+                if (run_on_gpu) {
+                    tags.push_back({arr,b});
+                } else {
+                    fab.setVal(val,b);
+                }
+            }
         }
     }
+
+#ifdef AMREX_USE_GPU
+    amrex::ParallelFor(tags, 1,
+    [=] AMREX_GPU_DEVICE (int i, int j, int k, int /*n*/, Array4<char> const& a) noexcept
+    {
+        a(i,j,k) = val;
+    });
+#endif
 }
 
 void
 TagBoxArray::coarsen (const IntVect & ratio)
 {
-    // If team is used, all team workers need to go through all the fabs, including ones they don't own.
+    // If team is used, all team workers need to go through all the fabs,
+    // including ones they don't own.
     int teamsize = ParallelDescriptor::TeamSize();
     unsigned char flags = (teamsize == 1) ? 0 : MFIter::AllBoxes;
 
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
+    IntVect new_n_grow;
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        new_n_grow[idim] = (n_grow[idim]+ratio[idim]-1)/ratio[idim];
+    }
 
 #if defined(_OPENMP)
-#pragma omp parallel if (teamsize == 1)
+#pragma omp parallel if (teamsize == 1 && Gpu::notInLaunchRegion())
 #endif
     for (MFIter mfi(*this,flags); mfi.isValid(); ++mfi)
     {
-        this->fabPtr(mfi)->coarsen(ratio);
+        Box const& cbox = amrex::grow(amrex::coarsen(mfi.validbox(),ratio),new_n_grow);
+        this->fabPtr(mfi)->coarsen(ratio,cbox);
     }
 
-    boxarray.growcoarsen(n_grow,ratio);
-    updateBDKey();  // because we just modify boxarray in-place.
+    boxarray.coarsen(ratio);
+    n_grow = new_n_grow;
+}
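
// Illustrative aside (not part of the patch): the new_n_grow computation in
// coarsen() is an integer ceiling division -- n fine ghost cells must still
// be covered after coarsening by `ratio`, which takes ceil(n/ratio) coarse
// cells.  A self-contained sketch (coarsen_grow is a hypothetical name):

constexpr int coarsen_grow (int n_grow, int ratio)
{
    return (n_grow + ratio - 1) / ratio;   // ceil(n_grow/ratio) in integer arithmetic
}

static_assert(coarsen_grow(4, 2) == 2, "exact division");
static_assert(coarsen_grow(4, 3) == 2, "4 fine ghost cells need 2 coarse cells");
static_assert(coarsen_grow(1, 4) == 1, "a nonzero ghost region never coarsens away");
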
+
+bool
+TagBoxArray::hasTags (Box const& a_bx) const
+{
+    bool has_tags = false;
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        ReduceOps<ReduceOpLogicalOr> reduce_op;
+        ReduceData<int> reduce_data(reduce_op);
+        using ReduceTuple = typename decltype(reduce_data)::Type;
+
+        for (MFIter mfi(*this); mfi.isValid(); ++mfi)
+        {
+            Box const& b = a_bx & mfi.fabbox();
+            if (b.ok()) {
+                const auto& arr = this->const_array(mfi);
+                reduce_op.eval(b, reduce_data,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
+                {
+                    int tr = arr(i,j,k) != TagBox::CLEAR;
+                    return {tr};
+                });
+            }
+        }
+
+        ReduceTuple hv = reduce_data.value();
+        has_tags = static_cast<bool>(amrex::get<0>(hv));
+    } else
+#endif
+    {
+#ifdef _OPENMP
+#pragma omp parallel reduction(||:has_tags)
+#endif
+        for (MFIter mfi(*this); mfi.isValid(); ++mfi)
+        {
+            Box const& b = a_bx & mfi.fabbox();
+            if (b.ok()) {
+                Array4<char const> const& arr = this->const_array(mfi);
+                AMREX_LOOP_3D(b, i, j, k,
+                {
+                    has_tags = has_tags || (arr(i,j,k) != TagBox::CLEAR);
+                });
+            }
+        }
+    }
 
-    n_grow = IntVect::TheZeroVector();
+    ParallelAllReduce::Or(has_tags, ParallelContext::CommunicatorSub());
+    return has_tags;
 }
 
 }
diff --git a/Src/AmrCore/CMakeLists.txt b/Src/AmrCore/CMakeLists.txt
index 833213eb129..cd7ba6c0589 100644
--- a/Src/AmrCore/CMakeLists.txt
+++ b/Src/AmrCore/CMakeLists.txt
@@ -19,25 +19,25 @@ target_sources(amrex
       AMReX_Interpolater.H
       AMReX_TagBox.H
       AMReX_AmrMesh.H
-      AMReX_FluxReg_${DIM}D_C.H
+      AMReX_FluxReg_${AMReX_SPACEDIM}D_C.H
       AMReX_FluxReg_C.H
       AMReX_Interp_C.H
-      AMReX_Interp_${DIM}D_C.H
+      AMReX_Interp_${AMReX_SPACEDIM}D_C.H
    )
 
-if (ENABLE_FORTRAN)
+if (AMReX_FORTRAN)
    target_sources(amrex
       PRIVATE
       AMReX_FillPatchUtil_F.H
-      AMReX_FillPatchUtil_${DIM}d.F90
+      AMReX_FillPatchUtil_${AMReX_SPACEDIM}d.F90
       AMReX_FLUXREG_F.H
       AMReX_FLUXREG_nd.F90
       AMReX_INTERP_F.H
-      AMReX_INTERP_${DIM}D.F90
+      AMReX_INTERP_${AMReX_SPACEDIM}D.F90
       )
 endif ()
 
 # These files are needed only if Particles are enabled
-if ( ENABLE_PARTICLES )
+if ( AMReX_PARTICLES )
    target_sources( amrex PRIVATE AMReX_AmrParGDB.H AMReX_AmrParticles.H )
 endif ()
diff --git a/Src/AmrTask/AMFIter/AMReX_AmrTask.H b/Src/AmrTask/AMFIter/AMReX_AmrTask.H
deleted file mode 100644
index 8a7ab9b8344..00000000000
--- a/Src/AmrTask/AMFIter/AMReX_AmrTask.H
+++ /dev/null
@@ -1,923 +0,0 @@
-#ifndef AMREX_AmrTask_H_
-#define AMREX_AmrTask_H_
-
-#include
-#include
-#include
-
-#include "AMReX_AbstractTask.H"
-#include "AMReX_TaskGraph.H"
-#include "RTS.H"
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-bool verbose=false;
-
-namespace amrex {
-    // These are all ParmParse'd in.  Set defaults in Initialize()!!!
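
// Illustrative aside (not part of the patch): the "ParmParse'd in" note
// above refers to AMReX's runtime-parameter system.  The usual pattern,
// sketched here with one of the variables below and an assumed default:

#include <AMReX_ParmParse.H>

int plot_nfiles = 64;   // assumed default; real defaults are set in Initialize()

void read_params ()
{
    amrex::ParmParse pp("amr");
    pp.query("plot_nfiles", plot_nfiles); // overridden by "amr.plot_nfiles" in an inputs file, if present
}
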
- extern int plot_nfiles; - extern int mffile_nstreams; - extern int probinit_natonce; - extern bool plot_files_output; - extern int checkpoint_nfiles; - extern int regrid_on_restart; - extern int use_efficient_regrid; - extern int plotfile_on_restart; - extern int checkpoint_on_restart; - extern bool checkpoint_files_output; - extern int compute_new_dt_on_regrid; - extern bool precreateDirectories; - extern bool prereadFAHeaders; - - enum task_states{ - _coarseTimeStep=0, - _post_coarseTimeStep, - _timeStep, - _timeStep_advance, - _post_timeStep, - _final, - }; - - /**** State transition diagrams of tasks on the coarse level (lower part) and fine level (upper part) - * - * _timeStep -> _timeStep_advance -> _post_timestep - * ^\ ^\ | | \\ - * \ \_____________| | \\ - * // \ | \\ (signal) - * // \____________(subcycling)___________________| \\ - * // \\ - * // (fine task) - * ------------------------------------------------------------------------------------------------------------------------------------------- - * // (signal) - * _coarseTimeStep -> _timeStep -------> _timeStep_advance -------------> _post_timestep --------------> _post_coarseTimeStep (coarsest task) - * |^\ ^\ | | - * | \ \_______________| | - * | \ | - * | \ | - * | \ | - * _final \_____________________________________________________________________________________________________________________________________________| - */ - - class AmrTask: public Task - { - LocalConnection l_con; - RemoteConnection r_con; - struct TileArray - { - Vector numLocalTiles; - Vector localTileIndexMap; - Vector tileArray; - } ta; - bool _do_tiling; - int current_tile; - Amr* _amr; - int max_step; - Real stop_time; - - public: - AmrTask(){} - Amr* originalAmr(){return _amr;} - AmrLevel& originalAmrLevel(int lev){return _amr->getLevel(lev);} - - - size_t tagGen(int src, int dest, int amrlevels, int fabs) - { - return (src*fabs + dest%fabs)*amrlevels; - } - - void multifabCopyPush(amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int dstcomp, int srccomp, int nc, int ng, int ngsrc, amrex::FabArrayBase::CPC *cpc) - { - } - - - void FillBoundaryPush(){ - for(int i=0; iSetTag(i); //this is local copy so the recipient will have the matching tag - msg->SetDestRank(ParallelDescriptor::MyProc()); - _fab->copyToMem(l_con.scpy[i].sbx, 0, _mf->nComp(), msg->GetBuffer()); //pack box to the message - _outputs.push(msg);//let the runtime know that the output is available - } - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - for(int i=0; iSetTag(tagGen(r_con.snd[i].ns, r_con.snd[i].nd, 1, _mf->size())); - msg->SetDestRank(r_con.snd[i].pr); - _fab->copyToMem(r_con.snd[i].sbx, 0, _mf->nComp(), msg->GetBuffer()); - _outputs.push(msg); - } - } - - void FillPatchSingleLevelPush (MultiFab& mf, MultiFab *dmf, Real time, - const Vector& smf, const Vector& stime, - int scomp, int dcomp, int ncomp, - const Geometry& geom, PhysBCFunctBase& physbcf) - { - BL_ASSERT(scomp+ncomp <= smf[0]->nComp()); - BL_ASSERT(dcomp+ncomp <= mf.nComp()); - BL_ASSERT(smf.size() == stime.size()); - BL_ASSERT(smf.size() != 0); - - if (smf.size() == 1) - { - multifabCopyPush(*smf[0], _lIdx, dcomp, scomp, ncomp, _mf.nGrow(), 0); - } - else if (smf.size() == 2) - { - const Box& bx = validbox(mf); - if (mf.boxArray() == smf[0]->boxArray()) - { - int s_idx = smf[0]->IndexArray()[_lIdx]; - int d_idx = mf.IndexArray()[_lIdx]; - FArrayBox& fab= validFab(*_mf); - - fab.linInterp(smf[0]->get(s_idx), - scomp, - smf[1]->get(s_idx), - scomp, - stime[0], - stime[1], - time, - bx, - dcomp, 
- ncomp); - FillBoundaryPush(); - }else{ - int s_idx = smf[0]->IndexArray()[_lIdx]; - int d_idx = dmf->IndexArray()[_lIdx]; - FArrayBox& fab= validFab(*dmf); - fab.linInterp(smf[0]->get(s_idx), - scomp, - smf[1]->get(s_idx), - scomp, - stime[0], - stime[1], - time, - bx, - 0, - ncomp); - //multifabCopyPush(dmf, _lIdx, dcomp, 0, ncomp, _mf.nGrow(), 0); - } - } - //physbcf.FillBoundary(mf, dcomp, ncomp, time); - } - - void FillPatchTwoLevelsPush (Real time, - Vector& cmf, const Vector& ct, - Vector& fmf, const Vector& ft, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - PhysBCFunctBase& cbc, PhysBCFunctBase& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector& bcs) - { - - - - } - - - void InitAmrTask (const void* amr, int m_step, Real st_time){ - assert(_amr); - TaskName name= MyName(); - _taskState= name[0]==0?_coarseTimeStep:_timeStep; - _amr= (Amr*)amr; - max_step= m_step; - stop_time= st_time; - cumtime= _amr->cumtime; - dt_level.resize(_amr->finest_level+1); - dt_min.resize(_amr->finest_level+1); - level_steps.resize(_amr->finest_level+1); - level_count.resize(_amr->finest_level+1); - for (int i = 0; i <= _amr->finest_level; i++){ - dt_level[i] = _amr->dtLevel(i); - dt_min[i] = _amr->dt_min[i]; - level_steps[i] = _amr->level_steps[i]; - level_count[i] = _amr->level_count[i]; - } - which_level_being_advanced= _amr->which_level_being_advanced; - sub_cycle= _amr->sub_cycle; - subcycling_iteration =0; - parent_subcycling_iteration=0; - if(name[0]<_amr->finest_level){ - MultiFab& mfSrc = _amr->amr_level[name[0]]->get_new_data(0); - MultiFab& mfDst = _amr->amr_level[name[0]+1]->get_new_data(0); - const BoxArray& fine_BA = mfDst.boxArray(); - BoxArray crse_S_fine_BA = fine_BA; - crse_S_fine_BA.coarsen(_amr->refRatio(1)); - MultiFab *crse_S_fine = new MultiFab(crse_S_fine_BA, mfDst.DistributionMap(), mfDst.nComp(),0); - TheCPC_sendup= (amrex::FabArrayBase::CPC*)&(crse_S_fine->getCPC(IntVect::TheZeroVector(),, - mfSrc, - IntVect::TheZeroVector(), - Periodicity::NonPeriodic())); - } - if(name[0]>0){ - MultiFab& mfSrc1 = _amr->amr_level[name[0]-1]->get_new_data(0); - MultiFab& mfDst1 = _amr->amr_level[name[0]]->get_new_data(0); - const BoxArray& fine_BA = mfDst1.boxArray(); - BoxArray crse_S_fine_BA = fine_BA; - crse_S_fine_BA.coarsen(_amr->refRatio(1)); - MultiFab *crse_S_fine = new MultiFab(crse_S_fine_BA, mfDst1.DistributionMap(), mfDst1.nComp(),0); - TheCPC_pullup= (amrex::FabArrayBase::CPC*)&(crse_S_fine->getCPC(IntVect::TheZeroVector(), - mfSrc1, - IntVect::TheZeroVector(), - Periodicity::NonPeriodic())); - } - } - void CreateLevelTask(int level){ - TaskName name= MyName(); - levelTask= new AmrLevelTask(this, level); - nStates= _amr->amr_level[level]->numStates(); - state.resize(nStates); - for(int i=0; iamr_level[level]->get_state_data(i)); - } - } - - void Signal(TaskName dest, int tag=0){ - Data* data= new Data(_id, dest, 1); - data->SetTag(tag); - _outputs.push(data); - } - - bool DependSignal(TaskName src, int tag=0){ - bool arrived= Depend_on(src, tag); - if(!arrived) return false; - return true; - } - - void DependSignal_reset(TaskName src, int tag=0){ - Data* msg= _neighbors_in.pop_front(src, tag); - msg->Free(); - } - - void Job(){ - TaskName name= MyName(); - if(name[0]==0){//tasks on the coarsest level - switch(_taskState){ - case _coarseTimeStep: - if ( _amr->okToContinue() && - (levelSteps(0) < max_step || max_step < 0) && - (cumtime < stop_time || stop_time < 0.0) ) - - { - coarseTimeStepTask_init(stop_time); - _taskState= 
_timeStep; - }else{ - //_final: - coarseTimeStepTask_end(stop_time); - delete levelTask; - SelfDestroy(); - break; - } - case _timeStep: - timeStepTask_init(0,cumtime,1,1,stop_time); - amr_level= &originalAmrLevel(name[0]); - advanceTask_init(cumtime, dt_level[0], 1, stop_time); - cout<<"Task (Level) " <finest_level >0) - { - const int nloc_cpAsc = TheCPC_sendup->m_LocTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.srcIndex){ - Signal(TaskName(name[0]+1, name[1], tag.dstIndex, name[3]*_amr->n_cycle[1]), 0); - } - } - } - _taskState= _timeStep_advance; - break; - case _timeStep_advance: - if(advanceTask()) - { - dt_new= post_advanceTask(); - _taskState= _post_timeStep; - if (_amr->finest_level >0) - { - for(int d=name[3]*_amr->n_cycle[1]+1; d< (name[3]+1)*_amr->n_cycle[1]; d++){ - const int nloc_cpAsc = TheCPC_sendup->m_LocTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.srcIndex){ - Signal(TaskName(name[0]+1, name[1], tag.dstIndex, d), 0); - } - } - } - } - }else{ - _taskState= _timeStep_advance; - } - break; - case _post_timeStep: - dt_level[0] = dt_new; - level_steps[0]++; - level_count[0]++; - post_timestepTask(1); - //enable next advance iteration if any - if((name[3]+1)%_amr->n_cycle[0]!=0) Signal(TaskName(name[0], name[1],name[2], name[3]+1), 0); - _taskState= _post_coarseTimeStep; - break; - case _post_coarseTimeStep: - //_amr->amr_level[0]->postCoarseTimeStep(cumtime); - cumtime += dt_level[0]; - _taskState= _coarseTimeStep; - break; - default: - cout<<"Error state"<finest_level){ - int lev= name[0]; - switch(_taskState){ - case _timeStep: - cout<<"Task (Level) " <n_cycle[lev], stop_time); - if (lev < _amr->finest_level) - { - const int nloc_cpAsc = TheCPC_sendup->m_LocTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.srcIndex){ - Signal(TaskName(name[0]+1, name[1], tag.dstIndex, name[3]*_amr->n_cycle[lev+1]), 0); - } - } - } - _taskState= _timeStep_advance; - //advanceObj= new advanceTask(cumtime+ (subcycling_iteration-1)*dt_level[lev], dt_level[lev], subcycling_iteration, _amr->n_cycle[lev]); - case _timeStep_advance: - if(advanceTask()) - { - dt_new= post_advanceTask(); - _taskState= _post_timeStep; - if (lev < _amr->finest_level) - { - for(int d=name[3]*_amr->n_cycle[lev+1]+1; d< (name[3]+1)*_amr->n_cycle[lev+1]; d++){ - const int nloc_cpAsc = TheCPC_sendup->m_LocTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.srcIndex){ - Signal(TaskName(name[0]+1, name[1], tag.dstIndex, d), 0); - } - } - } - } - }else{ - _taskState= _timeStep_advance; - //if(name[3]<_amr->n_cycle[lev]-1) Signal(TaskName(name[0], name[1],name[2], name[3]+1), 0); - } - break; - //dt_new= advanceTask(cumtime+ (subcycling_iteration-1)*dt_level[lev], dt_level[lev], subcycling_iteration, _amr->n_cycle[lev]); - //if(advance->state()!=advance->finalState()){ - // delete advanceObj; - // break; //keep doing advance - //} - //_taskState= _post_timeStep; - case _post_timeStep: - cumtime += dt_level[lev]; - dt_level[lev] = dt_new; - level_steps[lev]++; - level_count[lev]++; - post_timestepTask(lev); - if((name[3]+1)%_amr->n_cycle[lev]==0) { - //now we already synced all subcycling iteration before and at lower AMR levels, it's time to enable the parent to move on - const int nloc_cpAsc = TheCPC_pullup->m_LocTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.dstIndex){ - Signal(TaskName(lev-1, name[1], tag.srcIndex, name[3]/_amr->n_cycle[lev]), 0); - } - } - }else{ - //enable next subcycling iteration if any - Signal(TaskName(name[0], 
name[1],name[2], name[3]+1), 0); - } - SelfDestroy(); - delete levelTask; - break; - //_amr->amr_level[lev]->postTimeStep(cumtime); - default: - cout<<"Error state"<0 && name[0]<=_amr->finest_level){ //execute advance - TaskName parentName(name); - int tag=0; - Data* msg= _neighbors_in.pop_front(TaskName(parentName), tag); - char* pos; - Real time= ((Real*)msg)[0]; - Real dt= ((Real*)msg)[1]; - pos= sizeof(Real)*2; - int iteration= ((int*)pos)[0]; - int ncycle= ((int*)pos)[1]; - advanceTask(time, dt, iteration, ncycle); - cout<<"task "<n_cycle[my_level] !=0){//depend on previous subcycling iteration and the parent - if(DependSignal(TaskName(name[0], name[1], name[2], name[3]-1))==false) return false; - //check if the signal from parent arrived - const int nloc_cpAsc = TheCPC_pullup->m_LocTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.dstIndex){ - if(DependSignal(TaskName(my_level-1, name[1], tag.srcIndex, name[3]/_amr->n_cycle[my_level]))==false) return false; - } - } - //all messages arrived, consume them now - DependSignal_reset(TaskName(name[0], name[1], name[2], name[3]-1)); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.dstIndex){ - DependSignal_reset(TaskName(my_level-1, name[1], tag.srcIndex, name[3]/_amr->n_cycle[my_level])); - } - } - return true; - } - //waiting for signals from tasks at the coarser level - const int nloc_cpAsc = TheCPC_pullup->m_LocTags->size(); - const int nsnds_cpAsc = TheCPC_pullup->m_SndTags->size(); - const int nrcvs_cpAsc = TheCPC_pullup->m_RcvTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.dstIndex){ - if(DependSignal(TaskName(my_level-1, name[1], tag.srcIndex, name[3]/_amr->n_cycle[my_level]), 0)==false) return false; - } - } - - //now all signals have arrived - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.dstIndex){ - DependSignal_reset(TaskName(my_level-1, name[1], tag.srcIndex, name[3]/_amr->n_cycle[my_level]), 0); - } - } - return true; - } - case _timeStep_advance: - return advanceTaskDependency(); - case _post_timeStep: - if(my_level == _amr->finest_level) return true; - else{ - //sync with tasks of the last subcycle iteration - const int nloc_cpAsc = TheCPC_sendup->m_LocTags->size(); - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.srcIndex){ - if(DependSignal(TaskName(name[0]+1, name[1], tag.dstIndex, (name[3]+1)*_amr->n_cycle[1]-1), 0)==false) return false; - } - } - - for(int i=0; im_LocTags)[i]; - if(name[2] == tag.srcIndex){ - DependSignal_reset(TaskName(name[0]+1, name[1], tag.dstIndex, (name[3]+1)*_amr->n_cycle[1]-1), 0); - } - } - return true; - } - case _post_coarseTimeStep: - return true; - } - } - - void PostCompletion(){ - } - - //! Return the Fab associated with this task - FArrayBox& validFab(const FabArray &mf){ - return *(mf.m_fabs_v[_lIdx]); - } - //! Return the local index of the Fab associated with this task. It can then be used to locate corresponding Fabs on other multifabs that have the same layout. 
- int localFabIdx(){ - return _lIdx; - } - - void SetFab(FArrayBox* fab){_fab= fab;} - - void SetMF(const FabArray &mf){ - _mf= (FabArray*)&mf; - } - - void SetIdx(int idx){ - _idx=idx; - } - - void SetLocalIdx(int lIdx){_lIdx= lIdx;} - - void enable_tiling(){_do_tiling=true;} - - Box validbox() const{ - return _mf->box(_idx); - } - - Box validbox(const FabArray &mf) const{ - return mf.box(_idx); - } - - void generateTileArray(const IntVect& tileSize){ - IntVect nt_in_fab, tsize, nleft; - int ntiles = 1; - const Box& bx = validbox(); - - for (int d=0; d *_mf; - FArrayBox* _fab; - int _idx; - int _lIdx; - - //! Set the timestep on each level. - void setDtLevelTask (const Vector& dt_lev){ - for (int i = 0; i <= _amr->finest_level; i++) - dt_level[i] = dt_lev[i]; - } - - //! Set the timestep at one level. - void setDtLevelTask (Real dt, int lev){ - dt_level[lev] = dt; - } - - //! Set the dtmin on each level. - void setDtMinTask (const Vector& dt_min_in){ - for (int i = 0; i <= _amr->finest_level; i++) - dt_min[i] = dt_min_in[i]; - } - - /** - * \brief What is "level" in Amr::timeStep? This is only relevant if we are still in Amr::timeStep; - * it is set back to -1 on leaving Amr::timeStep. - */ - int level_being_advancedTask () const { return which_level_being_advanced; } - - int levelSteps (int lev) const { return level_steps[lev]; } - - //! Physical time. - Real cumTimeTask () const { return cumtime; } - - void setCumTimeTask (Real t) {cumtime = t;} - - //! Time step at specified level. - Real dtLevelTask (int level) const { return dt_level[level]; } - - //! Max time step (typically based on physics) at specified level - Real dtMinTask (int level) const { return dt_min[level]; } - - //! Array of time steps at all levels. - const Vector& dtLevelTask () const { return dt_level; } - - void coarseTimeStepTask_init (Real stop_time){ - if (levelSteps(0) > 0) - { - int post_regrid_flag = 0; - _amr->amr_level[0]->computeNewDt(_amr->finest_level, - sub_cycle, - _amr->n_cycle, - _amr->ref_ratio, - _amr->dt_min, - dt_level, - stop_time, - post_regrid_flag); - } - else - { - _amr->amr_level[0]->computeInitialDt(_amr->finest_level, - sub_cycle, - _amr->n_cycle, - _amr->ref_ratio, - dt_level, - stop_time); - } - } - - //postCoarseTimeStepTask(cumtime); - - void coarseTimeStepTask_end(Real stop_time){ - if (verbose > 0) - { - if(isMasterTask()){ - amrex::Print() - << "\nSTEP = " << level_steps[0] - << " TIME = " << cumtime - << " DT = " << dt_level[0] << "\n\n"; - } - } - if (_amr->record_run_info && ParallelDescriptor::IOProcessor()) - { - if(isMasterTask()){ - _amr->runlog << "STEP = " << level_steps[0] - << " TIME = " << cumtime - << " DT = " << dt_level[0] << '\n'; - } - } - if (_amr->record_run_info_terse && ParallelDescriptor::IOProcessor()) - if(isMasterTask()) - _amr->runlog_terse << level_steps[0] << " " << cumtime << " " << dt_level[0] << '\n'; - - int check_test = 0; - - if (_amr->check_per > 0.0) - { - const int num_per_old = (cumtime-_amr->dt_level[0]) / _amr->check_per; - const int num_per_new = (cumtime ) / _amr->check_per; - - if (num_per_old != num_per_new) - { - check_test = 1; - } - } - - int to_stop = 0; - int to_checkpoint = 0; - int to_plot = 0; - if (_amr->message_int > 0 && level_steps[0] % _amr->message_int == 0) { - if(isMasterTask()){ - if (ParallelDescriptor::IOProcessor()) - { - FILE *fp; - if ((fp=fopen("dump_and_continue","r")) != 0) - { - remove("dump_and_continue"); - to_checkpoint = 1; - fclose(fp); - } - else if ((fp=fopen("stop_run","r")) != 0) - { - remove("stop_run"); 
- to_stop = 1; - fclose(fp); - } - else if ((fp=fopen("dump_and_stop","r")) != 0) - { - remove("dump_and_stop"); - to_checkpoint = 1; - to_stop = 1; - fclose(fp); - } - - if ((fp=fopen("plot_and_continue","r")) != 0) - { - remove("plot_and_continue"); - to_plot = 1; - fclose(fp); - } - } - int packed_data[2]; - packed_data[0] = to_stop; - packed_data[1] = to_checkpoint; - ParallelDescriptor::Bcast(packed_data, 2, ParallelDescriptor::IOProcessorNumber()); - to_stop = packed_data[0]; - to_checkpoint = packed_data[1]; - if(to_stop == 1 && to_checkpoint == 0) { // prevent main from writing files - _amr->last_checkpoint = level_steps[0]; - _amr->last_plotfile = level_steps[0]; - } - - if (to_checkpoint && _amr->write_plotfile_with_checkpoint) - to_plot = 1; - - if ((_amr->check_int > 0 && level_steps[0] % _amr->check_int == 0) || check_test == 1 - || to_checkpoint) - { - _amr->checkPoint(); - } - - if (_amr->writePlotNow() || to_plot) - { - _amr->writePlotFile(); - } - - if (_amr->writeSmallPlotNow()) - { - _amr->writeSmallPlotFile(); - } - _amr->bUserStopRequest = to_stop; - if (to_stop) - { - ParallelDescriptor::Barrier("Amr::coarseTimeStep::to_stop"); - if(ParallelDescriptor::IOProcessor()) { - if (to_checkpoint) - { - std::cerr << "Stopped by user w/ checkpoint" << std::endl; - } - else - { - std::cerr << "Stopped by user w/o checkpoint" << std::endl; - } - } - } - } - } - } - - virtual void advanceTask_init (Real time, Real dt, int iteration, int ncycle) = 0; - virtual bool advanceTask ()=0; - virtual bool advanceTaskDependency (){return true;} - virtual Real post_advanceTask()=0; - - void timeStepTask_init (int level, Real time, int iteration, int niter, Real stop_time){ - which_level_being_advanced = level; - int lev_top = std::min(_amr->finest_level, _amr->max_level-1); - - for (int i(level); i <= lev_top; ++i) - { - const int old_finest = _amr->finest_level; - - if (_amr->okToRegrid(i)) - { - _amr->regrid(i,time); - if (compute_new_dt_on_regrid && (i == 0) ) - { - int post_regrid_flag = 1; - _amr->amr_level[0]->computeNewDt(_amr->finest_level, - sub_cycle, - _amr->n_cycle, - _amr->ref_ratio, - dt_min, - dt_level, - stop_time, - post_regrid_flag); - } - for (int k(i); k <= _amr->finest_level; ++k) { - level_count[k] = 0; - } - if (old_finest < _amr->finest_level) - { - // The new levels will not have valid time steps - // and iteration counts. - for (int k(old_finest + 1); k <= _amr->finest_level; ++k) - { - dt_level[k] = dt_level[k-1]/_amr->n_cycle[k]; - } - } - } - if (old_finest > _amr->finest_level) { - lev_top = std::min(_amr->finest_level, _amr->max_level - 1); - } - } - } - - // Real dt_new = advanceTask(time,dt_level[level],iteration,niter); - -#if 0 - void _postTimeStepTask(int level, Real time, int iteration, int niter, Real stop_time){ - dt_min[level] = iteration == 1 ? dt_new : std::min(dt_min[level],dt_new); - level_steps[level]++; - level_count[level]++; - if (verbose > 0) - { - amrex::Print() << "[Level " << level << " step " << level_steps[level] << "] " - << "Advanced " << _amr->amr_level[level]->countCells() << " cells\n"; - } - - // If the level signified that it wants a regrid after the advance has - // occurred, do that now. - if (_amr->amr_level[level]->postStepRegrid()) { - - int old_finest = _amr->finest_level; - - _amr->regrid(level, time); - - if (old_finest < _amr->finest_level) - { - // The new levels will not have valid time steps. 
- for (int k = old_finest + 1; k <= _amr->finest_level; ++k) - { - dt_level[k] = dt_level[k-1] / _amr->n_cycle[k]; - } - } - } - - //signal timeStepTask at the next level - if (level < _amr->finest_level) - { - const int lev_fine = level+1; - - if (_amr->sub_cycle) - { - const int ncycle = _amr->n_cycle[lev_fine]; - - for (int i = 1; i <= ncycle; i++) - timeStepTask(lev_fine,time+(i-1)*dt_level[lev_fine],i,ncycle,stop_time); - } - else - { - timeStepTask(lev_fine,time,1,1,stop_time); - } - } - - //_amr->amr_level[level]->post_timestep(iteration); - - // Set this back to negative so we know whether we are in fact in this routine - which_level_being_advanced = -1; - }//end timeStep -#endif - - virtual void post_timestepTask(int iteration)=0; - private: - amrex::FabArrayBase::CPC *TheCPC_sendup; - amrex::FabArrayBase::CPC *TheCPC_pullup; - amrex::FabArrayBase::CPC *TheCPC_senddown; - amrex::FabArrayBase::CPC *TheCPC_pulldown; - - protected: - int subcycling_iteration; - int parent_subcycling_iteration; - int _taskState; - // The data that need to be privatized - Real cumtime; // Physical time variable. - Real dt_new; - Vector dt_level; // Timestep at this level. - Vector dt_min; - Vector level_count; - Vector level_steps; // Number of time steps at this level. - int which_level_being_advanced; // Only >=0 if we are in Amr::timeStep(level,...) - int sub_cycle; - - //for an AMR level - AmrLevel *amr_level; - AmrLevelTask *levelTask; - int nStates; - Vector state; // Array of state data. - MultiFab& get_old_data (int state_indx) { return state[state_indx].oldDataTask(); } - MultiFab& get_new_data (int state_indx) { return state[state_indx].newDataTask(); } - }; -}//end namespace -#endif /*_Amr_H_*/ diff --git a/Src/AmrTask/AMFIter/AMReX_AsyncMFIter.H b/Src/AmrTask/AMFIter/AMReX_AsyncMFIter.H deleted file mode 100644 index 216d9d0aadd..00000000000 --- a/Src/AmrTask/AMFIter/AMReX_AsyncMFIter.H +++ /dev/null @@ -1,594 +0,0 @@ -#ifndef AMREX_ASYNC_MFITER -#define AMREX_ASYNC_MFITER -//Question? 
email tannguyen@lbl.gov -//Created 09-07-2017 -//Last modification 09-07-2017 - -#include "AMReX_AbstractTask.H" -#include "AMReX_TaskGraph.H" -#include "RTS.H" -#include -#include -#include -#include -#include "AMReX_Connections.H" -#include -#include - -//#ifdef _OPENMP -#include -//#endif - -namespace amrex { - typedef MFIter LocalFabIdx; - class Action :public Task{ - protected: - LocalConnection l_con; - RemoteConnection r_con; - struct TileArray - { - Vector numLocalTiles; - Vector localTileIndexMap; - Vector tileArray; - } ta; - bool _do_tiling; - int current_tile; - - size_t tagGen(int src, int dest, int amrlevels, int fabs) - { - return (src*fabs + dest%fabs)*amrlevels; - } - void FillBoundary_Push(){ - for(int i=0; iSetTag(i); //this is local copy so the recipient will have the matching tag - msg->SetDestRank(ParallelDescriptor::MyProc()); - _fab->copyToMem(l_con.scpy[i].sbx, 0, _mf->nComp(), msg->GetBuffer()); //pack box to the message - _outputs.push(msg);//let the runtime know that the output is available - } - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - for(int i=0; iSetTag(tagGen(r_con.snd[i].ns, r_con.snd[i].nd, 1, _mf->size())); - msg->SetDestRank(r_con.snd[i].pr); - _fab->copyToMem(r_con.snd[i].sbx, 0, _mf->nComp(), msg->GetBuffer()); - _outputs.push(msg); - } - } - void FillBoundary_Pull(){ - for(int i=0; icopyFromMem(l_con.dcpy[i].dbx,0, _mf->nComp(), msg->GetBuffer());//unpack message to box - msg->Free(); - } - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - for(int i=0; isize()); - Data* msg= _neighbors_in.pop_front(TaskName(r_con.rcv[i].ns), tag); - _fab->copyFromMem(r_con.rcv[i].dbx, 0, _mf->nComp(), msg->GetBuffer()); - msg->Free(); - } - } - bool isSatisfied(){ - if(!DependSignal()) return false; //wait for dependent signal if ANY - bool satisfied=true; - for(int i=0; isize())); - if(!satisfied) return false; - } - return true; - } - void extendIters(int extra){_nIters+= extra;} - int _iter, _nIters; - FabArray *_mf; - FArrayBox* _fab; - int _idx; - int _lIdx; - bool _communicateFirstTimeStep; //exchange ghost cells before starting the first time step - bool _communicateUponCompletion; //exchange ghost cells after computing the last time step - public: - LocalConnection& LCon(){return l_con;} - RemoteConnection& RCon(){return r_con;} - Action(){ - _iter=-1; - _nIters=1; - _communicateFirstTimeStep=true; //the default is we exchange ghost cells before the first time step - _communicateUponCompletion=false; - _do_tiling=false; - } - ~Action(){ - free(l_con.scpy); - free(l_con.dcpy); - free(r_con.snd); - free(r_con.rcv); - } - void enable_tiling(){_do_tiling=true;} - vector& getTileArray(){return ta.tileArray;} - void SetFab(FArrayBox* fab){_fab= fab;} - void SetSteps(int nIters){ - assert(nIters>=1); - _nIters= nIters; - } - void SetMF(const FabArray &mf){ - _mf= (FabArray*)&mf; - } - void SetIdx(int idx){ - _idx=idx; - } - void SetLocalIdx(int lIdx){_lIdx= lIdx;} - virtual void Compute(Box)=0; - virtual void Init(){}; - virtual bool DependSignal(){return true;} - - void tileIndices(int &beginIndex, int &endIndex){ -#ifdef _OPENMP - int tid = omp_get_thread_num(); - int nthreads= omp_get_num_threads(); - int ntot= ta.tileArray.size(); - int nr = ntot / nthreads; - int nlft = ntot - nr * nthreads; - if (tid < nlft) { // get nr+1 items - beginIndex += tid * (nr + 1); - endIndex = beginIndex + nr + 1; - } else { // get nr items - beginIndex += tid * nr + nlft; - endIndex = beginIndex + nr; - } -#else - beginIndex=0; - endIndex= 
ta.tileArray.size(); -#endif - } - void Job(){ - if(_communicateFirstTimeStep){ - if(_iter==-1) FillBoundary_Push(); - else if (_iter==0){ - FillBoundary_Pull(); - } - }else if(_iter==-1) _iter++; //go directly to the first compute step - if(_iter>=0 && _iter<_nIters){ //always compute from time step 0 to _nIters-1 - if(_iter>0)FillBoundary_Pull(); //communication at step 0 is already governed by _communicateFirstTimeStep - if(!_do_tiling) Compute(validbox());//execute task at Fab level - else{ -#ifdef _OPENMP -#pragma omp parallel - { -#endif - int beginIndex, endIndex; - tileIndices(beginIndex, endIndex); - for(int tile=beginIndex; tilebox(_idx); - } - //! Return the Fab associated with this task - FArrayBox& validFab(){ - return *(_mf->m_fabs_v[_lIdx]); - } - //! Return the local index of the Fab associated with this task. It can then be used to locate corresponding Fabs on other multifabs that have the same layout. - int localFabIdx(){ - return _lIdx; - } - //! Locate in multifab mf the Fab that has the same coordinate as this task' Fab - FArrayBox& validFab(const FabArray &mf){ - return *(mf.m_fabs_v[_lIdx]); - } - //! Locate in multifab mf the Fab that has local index lIdx - FArrayBox& validFab(const FabArray &mf, int lIdx){ - return *(mf.m_fabs_v[lIdx]); - } - - void generateTileArray(const IntVect& tileSize){ - IntVect nt_in_fab, tsize, nleft; - int ntiles = 1; - const Box& bx = validbox(); - - for (int d=0; dm_fabs_v[_lIdx]->box(); - for (int d=0; d - class MFGraph: public AbstractTaskGraph{ - protected: - string _graphName; - bool _do_tiling; - public: - MFGraph(const FabArray &mf, int nSteps, int rank, int nProcs, Periodicity period, bool do_tiling){ - AbstractTaskGraph::_nProcs= nProcs; - AbstractTaskGraph::_rank= rank; - for(int i=0; i< mf.local_size(); i++){ - TaskName name(mf.IndexArray()[i]); - T *t= new T(); - t->SetSteps(nSteps); - t->SetMF(mf); - t->SetFab(mf.m_fabs_v[i]); - t->SetIdx(mf.IndexArray()[i]); - t->SetName(name); - t->SetLocalIdx(i); - if(do_tiling){ - IntVect ts= FabArrayBase::mfiter_tile_size; - t->generateTileArray(ts); //create tile array associated with this FAB - } - t->Init(); - if(do_tiling) t->enable_tiling(); - AbstractTaskGraph::_initialTasks.push_back(t); - AbstractTaskGraph::_taskPool[name]= t; - } - AbstractTaskGraph::_begin= *(AbstractTaskGraph::_initialTasks.begin()); - AbstractTaskGraph::_end= *(AbstractTaskGraph::_initialTasks.end()); - AbstractTaskGraph::_currIt= AbstractTaskGraph::_initialTasks.begin(); - AbstractTaskGraph::_current= *(AbstractTaskGraph::_currIt); - AbstractTaskGraph::_mode= _Push; - SetupFabConnections(mf, period); - _do_tiling= do_tiling; - } - - MFGraph(const Amr* amr, int max_step, Real stop_time, int rank, int nProcs, bool do_tiling){ - AbstractTaskGraph::_nProcs= nProcs; - AbstractTaskGraph::_rank= rank; - //create an initial graph corresponding to the coarsest AMR level, this graph will evolve with time - int n_intervals=1; - for(int l=0; l<= amr->finest_level; l++){ - n_intervals*= amr->n_cycle[l]; - for(int s=0; s< amr->amr_level[l]->numStates(); s++){ - MultiFab& mf = amr->amr_level[l]->get_new_data(s); - amr->amr_level[l]->get_state_data(s).allocOldData(); - for(int i=0; i< mf.local_size(); i++){ - for(int it=0; it< n_intervals; it++){ - TaskName name(l /*level*/, s/*state*/, mf.IndexArray()[i] /*fab*/, it); - T *t= new T(); - t->SetName(name); - t->SetLocalIdx(i); - t->SetMF(mf); - t->SetFab(mf.m_fabs_v[i]); - t->SetIdx(mf.IndexArray()[i]); - t->InitAmrTask(amr, max_step, stop_time); - t->CreateLevelTask(l); - 
if(i==0) t->SetMaster(); - if(do_tiling){ - IntVect ts= FabArrayBase::mfiter_tile_size; - t->generateTileArray(ts); //create tile array associated with this FAB - } - if(do_tiling) t->enable_tiling(); - AbstractTaskGraph::_initialTasks.push_back(t); - AbstractTaskGraph::_taskPool[name]= t; - SetupFabConnections(mf, amr->amr_level[l]->Geom().periodicity()); - } - } - } - } - AbstractTaskGraph::_begin= *(AbstractTaskGraph::_initialTasks.begin()); - AbstractTaskGraph::_end= *(AbstractTaskGraph::_initialTasks.end()); - AbstractTaskGraph::_currIt= AbstractTaskGraph::_initialTasks.begin(); - AbstractTaskGraph::_current= *(AbstractTaskGraph::_currIt); - AbstractTaskGraph::_mode= _Push; - _do_tiling= do_tiling; - } - - int FindProcessAssociation(TaskName name){ - assert(false); - } - void SetupFabConnections(const FabArray &mf, Periodicity period){ - int np = ParallelDescriptor::NProcs(); - int myProc = ParallelDescriptor::MyProc(); - int numfabs = mf.size(); - bool cross = false; - const FabArrayBase::FB& TheFB = mf.getFB(mf.nGrowVect(),period); - const int n_loc_mf = TheFB.m_LocTags->size(); - const int n_snds_mf = TheFB.m_SndTags->size(); - const int n_rcvs_mf = TheFB.m_RcvTags->size(); - Vector send_cctc; - Vector send_pr; - send_cctc.reserve(n_snds_mf); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheFB.m_SndTags->begin(), - m_End = TheFB.m_SndTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // Not destined to me. - { - send_pr.push_back(m_it->first); - send_cctc.push_back(&(m_it->second)); - } - } - Vector recv_cctc; - Vector recv_pr; - recv_cctc.reserve(n_rcvs_mf); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheFB.m_RcvTags->begin(), - m_End = TheFB.m_RcvTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // I am not the source for this receipt - { - recv_pr.push_back(m_it->first); - recv_cctc.push_back(&(m_it->second)); - } - } - for(int f=0; f::_initialTasks[f])->LCon(); - l_con.nscpy = 0; - l_con.ndcpy = 0; - l_con.firingRuleCnt = 0; - for(int i=0; i 1) - { - for(int f=0; f::_initialTasks[f])->RCon(); - r_con.nrcv = 0; - r_con.nsnd = 0; - r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - r_con.nsnd++; - } - } - for(int i=0; idstIndex) - r_con.nrcv++; - } - } - } - } - int scnt, dcnt; - for(int f=0; f::_initialTasks[f])->LCon(); - l_con.scpy = new LocalCopyDescriptor[l_con.nscpy]; - l_con.dcpy = new LocalCopyDescriptor[l_con.ndcpy]; - for(int i=0; i::_initialTasks[f])->LCon(); - for(int i=0; i::_initialTasks[l_con.scpy[i].nd])->LCon(); - for(int j=0; j::_initialTasks[l_con.dcpy[i].ns])->LCon(); - for(int j=0; j::_initialTasks[f])->RCon(); - r_con.snd = new RemoteCommDescriptor[r_con.nsnd]; - r_con.rcv = new RemoteCommDescriptor[r_con.nrcv]; - nrcv= -1; - for(int i=0; idstIndex) - { - nrcv++; - r_con.rcv[nrcv].ns = it->srcIndex; - r_con.rcv[nrcv].lns = -1; - r_con.rcv[nrcv].nd = it->dstIndex; - r_con.rcv[nrcv].lnd = mf.localindex(it->dstIndex); - r_con.rcv[nrcv].sbx = it->sbox; - r_con.rcv[nrcv].dbx = it->dbox; - r_con.rcv[nrcv].pr = pr; - r_con.rcv[nrcv].cnt = 0; - r_con.rcv[nrcv].sz = it->sbox.numPts() * mf.nComp() * sizeof(double); - } - } - } - nsnd = -1; - for(int i=0; idstIndex ) - { - nsnd++; - r_con.snd[nsnd].ns = it->srcIndex; - r_con.snd[nsnd].lns = mf.localindex(it->srcIndex); - r_con.snd[nsnd].nd = it->dstIndex; - r_con.snd[nsnd].lnd = -1; - r_con.snd[nsnd].sbx = it->sbox; - r_con.snd[nsnd].dbx = it->dbox; - r_con.snd[nsnd].pr = pr; - r_con.snd[nsnd].cnt = 0; - 
r_con.snd[nsnd].sz = it->dbox.numPts() * mf.nComp() * sizeof(double); - } - } - } // for(i -#define max_life 2^32-1 - class AMFIter{ - private: - RTS rts; - MFGraph *graph; - bool _do_tiling; - public: - AMFIter(const FabArray &mf, int nSteps, Periodicity period, bool do_tiling=false){ - _do_tiling= do_tiling; - graph= new MFGraph(mf, nSteps, ParallelDescriptor::MyProc(), ParallelDescriptor::NProcs(), period, do_tiling); - } - - AMFIter(const FabArray &mf, Periodicity period, bool do_tiling=false){ - _do_tiling= do_tiling; - graph= new MFGraph(mf, max_life, ParallelDescriptor::MyProc(), ParallelDescriptor::NProcs(), period, do_tiling); - } - - AMFIter(const Amr* amr, int max_step, Real stop_time, bool do_tiling=false){ - _do_tiling= do_tiling; - graph= new MFGraph(amr, max_step, stop_time, ParallelDescriptor::MyProc(), ParallelDescriptor::NProcs(), do_tiling); - } - - ~AMFIter(){ - delete graph; - } - - void Iterate(){ - rts.Init(ParallelDescriptor::MyProc(), ParallelDescriptor::NProcs()); - rts.Iterate(graph); - rts.Finalize(); - } - }; -#undef max_life -} //end namespace -#endif diff --git a/Src/AmrTask/AMFIter/AMReX_AsyncMFIter.cpp b/Src/AmrTask/AMFIter/AMReX_AsyncMFIter.cpp deleted file mode 100644 index ebdb948f4d1..00000000000 --- a/Src/AmrTask/AMFIter/AMReX_AsyncMFIter.cpp +++ /dev/null @@ -1,4 +0,0 @@ -#include "AMReX_AsyncMFIter.H" - -namespace amrex { -} diff --git a/Src/AmrTask/AMFIter/AMReX_Connections.H b/Src/AmrTask/AMFIter/AMReX_Connections.H deleted file mode 100644 index e9f6426862a..00000000000 --- a/Src/AmrTask/AMFIter/AMReX_Connections.H +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef AMREX_CONNECTIONS_H -#define AMREX_CONNECTION_H - -#include -#include -#include - -namespace amrex { - - struct LocalCopyDescriptor - { - int ns; // ! Source box in layout - int nd; //! Destination box in layout - Box sbx; // ! Sub-box for this copy - Box dbx; // ! Sub-box for this copy - size_t sz; - int sPartner, dPartner; - int dcpyCnt, scpyCnt; - }; - - struct LocalConnection - { - int nscpy; //! Number of cpy chunks - int ndcpy; //! Number of cpy chunks - int firingRuleCnt; - LocalCopyDescriptor *scpy; - LocalCopyDescriptor *dcpy; - }; - - struct RemoteCommDescriptor - { - int ns, lns; // ! Source box in layout - int nd, lnd; //! Destination box in layout - size_t sz; - int pr; - Box sbx; // ! Sub-box for this copy - Box dbx; // ! 
Sub-box for this copy - int cnt; - }; - - struct TransDescriptor - { - int sz, pv, pr; - }; - - struct RemoteConnection - { - int nsnd; - int nrcv; - int nrp, nsp; - int firingRuleCnt; - RemoteCommDescriptor *snd; - RemoteCommDescriptor *rcv; - TransDescriptor *str; - TransDescriptor *rtr; - }; -}//end namespace -#endif - diff --git a/Src/AmrTask/AMFIter/Makefile b/Src/AmrTask/AMFIter/Makefile deleted file mode 100755 index 5eda141c3f8..00000000000 --- a/Src/AmrTask/AMFIter/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -include ../arch.common - -OBJECTS= AMReX_AsyncMFIter.o - -AMFIterLIB= AMReX_AsyncMFIter.a - -all: $(AMFIterLIB) - -$(AMFIterLIB): $(OBJECTS) - ar rv $(AMFIterLIB) $(OBJECTS) - - -INCLUDE += -DBL_USE_MPI -DBL_USE_OMP -DBL_SPACEDIM=3 -DAMREX_SPACEDIM=3 -DBL_FORT_USE_UNDERSCORE -DBL_Linux - -all: $(OBJECTS) - -AMReX_AsyncMFIter.o: AMReX_AsyncMFIter.cpp AMReX_AsyncMFIter.H - $(C++) $(C++FLAGS) -I./ -I../../Base -I../../Amr -I../../AmrCore -I../graph -I$(INCLUDE) -c AMReX_AsyncMFIter.cpp -o AMReX_AsyncMFIter.o - -.PHONY: clean - -clean: - $(RM) $(OBJECTS) - $(RM) *.a - diff --git a/Src/AmrTask/Amr/AMReX_AmrLevelTask.H b/Src/AmrTask/Amr/AMReX_AmrLevelTask.H deleted file mode 100644 index 9a0d6581337..00000000000 --- a/Src/AmrTask/Amr/AMReX_AmrLevelTask.H +++ /dev/null @@ -1,291 +0,0 @@ -#ifndef AMREX_AmrLevelAsync_H_ -#define AMREX_AmrLevelAsync_H_ - -#include "AMReX_AmrLevel.H" - -namespace amrex { - - class AmrLevelAsync: public AmrLevel - { - public: - friend class AsyncFillPatchIterator; - virtual void initPerilla (Real time) = 0; - virtual void finalizePerilla (Real time) = 0; - }; - - class AsyncFillPatchIterator :public MFIter - { - public: - - friend class AmrLevelAsync; - friend class RGIter; - - AsyncFillPatchIterator (AmrLevel& amrlevel, - MultiFab& leveldata, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - int iter, - int tid); - - void initFillPatch(int boxGrow, - int time, - int index, - int scomp, - int ncomp, - int iter); - static void initialSend(amrex::Vector afpi, - amrex::Vector upper_afpi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int tid); - - void PushOnly (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - int tid, - unsigned char pushLevel, - int tf, - bool singleT=false); - - void SendIntraLevel (RGIter& rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - int tid, - bool singleT=false); - - void SendIntraLevel (RGIter* rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - int tid, - bool singleT=false); - - void SendInterLevel (RGIter& rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - int tid, - bool singleT=false); - - void SendInterLevel (RGIter* rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int iter, - int f, - int tid, - bool singleT=false); - - void Receive (RGIter& rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - int tid, - bool singleT=false); - - void Receive (RGIter* rgi, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - int tid, - bool singleT=false); - - void Receive (RGIter& rgi, - MultiFab& dest, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - int tid, - bool singleT=false); - - void Receive (RGIter* rgi, - MultiFab& dest, - int boxGrow, - Real time, - int 
state_indx, - int scomp, - int ncomp, - int f, - int tid, - bool singleT=false); - - void PullOnly (int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - int tid, - bool singleT=false); - - void PullOnly (MultiFab& dest, - int boxGrow, - Real time, - int state_indx, - int scomp, - int ncomp, - int f, - int tid, - bool singleT=false); - void FillFromTwoLevelsPush (Real time, - int index, - int scomp, - int dcomp, - int ncomp, - int f, - int tid, - unsigned char pushLevel, - int tf, - bool singleT); - void FillFromTwoLevelsPull (Real time, - int index, - int scomp, - int dcomp, - int ncomp, - int f, - int tid, - bool singleT); - - ~AsyncFillPatchIterator (); - - FArrayBox& operator() () { return m_fabs[MFIter::index()]; } - - Box UngrownBox () const { return MFIter::validbox(); } - - MultiFab& get_mf() { return m_fabs; } - AsyncFillPatchIterator (); - AsyncFillPatchIterator (const AsyncFillPatchIterator& rhs); - AsyncFillPatchIterator& operator= (const AsyncFillPatchIterator& rhs); - - AmrLevel& m_amrlevel; - MultiFab& m_leveldata; - std::vector< std::pair > m_range; - MultiFab m_fabs; - int m_ncomp; - - public: - bool isProperlyNested; - - Vector smf; - Vector stime; - StateDataPhysBCFunct* physbcf; - Geometry* geom; - - - Vector smf_crse; - Vector stime_crse; - StateDataPhysBCFunct* physbcf_crse; - Geometry* geom_crse; - - Vector smf_fine; - Vector stime_fine; - StateDataPhysBCFunct* physbcf_fine; - Geometry* geom_fine; - - - RegionGraph* destGraph; - RegionGraph* csrcGraph; - RegionGraph* fsrcGraph; - - MultiFab* m_mf_crse_patch; - RegionGraph* m_rg_crse_patch; - const FabArrayBase::FPinfo* m_fpc; - - //PArray raii; - MultiFab* dmf; - MultiFab* dmff; - - void completeRegionGraphs(int tg) - { - //std::cout << "Completing RGs "; - - if(destGraph != NULL) - { - //std::cout << destGraph->graphID << " "; - destGraph->completeRegionGraph(tg); - } - if(csrcGraph != NULL) - { - //std::cout << csrcGraph->graphID << " "; - csrcGraph->completeRegionGraph(tg); - } - if(fsrcGraph != NULL) - { - //std::cout << fsrcGraph->graphID << " "; - fsrcGraph->completeRegionGraph(tg); - } - if(m_rg_crse_patch != NULL) - { - //std::cout << m_rg_crse_patch->graphID << " "; - m_rg_crse_patch->completeRegionGraph(tg); - } - - //std::cout <<" by tg " << tg << std::endl; - } - void Reset(int tg) - { - //std::cout << "Resetting RGs "; - if(destGraph != NULL) - { - //std::cout << destGraph->graphID << " "; - destGraph->Reset(tg); - } - if(csrcGraph != NULL) - { - //std::cout << csrcGraph->graphID << " "; - csrcGraph->Reset(tg); - } - if(fsrcGraph != NULL) - { - //std::cout << fsrcGraph->graphID << " "; - fsrcGraph->Reset(tg); - } - if(m_rg_crse_patch != NULL) - { - //std::cout << m_rg_crse_patch->graphID << " "; - m_rg_crse_patch->Reset(tg); - } - //std::cout <<" by tg " << tg << std::endl; - } - - // Variables for optimization calls of two level push/pulll - - }; - - -#endif diff --git a/Src/AmrTask/Amr/AMReX_AmrLevelTask.cpp b/Src/AmrTask/Amr/AMReX_AmrLevelTask.cpp deleted file mode 100644 index 4f8cda112c4..00000000000 --- a/Src/AmrTask/Amr/AMReX_AmrLevelTask.cpp +++ /dev/null @@ -1,2075 +0,0 @@ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef AMREX_USE_EB -#include -#include -#endif - -namespace amrex { - -#ifdef AMREX_USE_EB -int AmrLevel::m_eb_basic_grow_cells = 5; -int AmrLevel::m_eb_volume_grow_cells = 4; -int AmrLevel::m_eb_full_grow_cells = 2; -EBSupport AmrLevel::m_eb_support_level = 
EBSupport::volume; -#endif - -DescriptorList AmrLevel::desc_lst; -DeriveList AmrLevel::derive_lst; - -void -AmrLevel::postCoarseTimeStep (Real time) -{ - BL_ASSERT(level == 0); - // sync up statedata time - for (int lev = 0; lev <= parent->finestLevel(); ++lev) { - AmrLevel& amrlevel = parent->getLevel(lev); - for (int i = 0; i < amrlevel.state.size(); ++i) { - amrlevel.state[i].syncNewTimeLevel(time); - } - } -} - -void -AmrLevel::set_preferred_boundary_values (MultiFab& S, - int state_index, - int scomp, - int dcomp, - int ncomp, - Real time) const -{} - -DeriveList& -AmrLevel::get_derive_lst () -{ - return derive_lst; -} - -void -AmrLevel::manual_tags_placement (TagBoxArray& tags, - const Vector& bf_lev) -{} - -AmrLevel::AmrLevel () -{ - parent = 0; - level = -1; -} - -AmrLevel::AmrLevel (Amr& papa, - int lev, - const Geometry& level_geom, - const BoxArray& ba, - const DistributionMapping& dm, - Real time) - : - geom(level_geom), - grids(ba), - dmap(dm) -{ - BL_PROFILE("AmrLevel::AmrLevel(dm)"); - level = lev; - parent = &papa; - - fine_ratio = IntVect::TheUnitVector(); fine_ratio.scale(-1); - crse_ratio = IntVect::TheUnitVector(); crse_ratio.scale(-1); - - if (level > 0) - { - crse_ratio = parent->refRatio(level-1); - } - if (level < parent->maxLevel()) - { - fine_ratio = parent->refRatio(level); - } - - state.resize(desc_lst.size()); - -#ifdef AMREX_USE_EB - m_factory.reset(new EBFArrayBoxFactory(geom, ba, dm, - {m_eb_basic_grow_cells, m_eb_volume_grow_cells, m_eb_full_grow_cells}, - m_eb_support_level)); -#else - m_factory.reset(new FArrayBoxFactory()); -#endif - - // Note that this creates a distribution map associated with grids. - for (int i = 0; i < state.size(); i++) - { - state[i].define(geom.Domain(), - grids, - dm, - desc_lst[i], - time, - parent->dtLevel(lev), - *m_factory); - } - - if (parent->useFixedCoarseGrids()) constructAreaNotToTag(); - - post_step_regrid = 0; - - finishConstructor(); -} - -void -AmrLevel::writePlotFile (const std::string& dir, - std::ostream& os, - VisMF::How how) -{ - int i, n; - // - // The list of indices of State to write to plotfile. - // first component of pair is state_type, - // second component of pair is component # within the state_type - // - std::vector > plot_var_map; - for (int typ = 0; typ < desc_lst.size(); typ++) - for (int comp = 0; comp < desc_lst[typ].nComp();comp++) - if (parent->isStatePlotVar(desc_lst[typ].name(comp)) && - desc_lst[typ].getType() == IndexType::TheCellType()) - plot_var_map.push_back(std::pair(typ,comp)); - - int n_data_items = plot_var_map.size(); - - // get the time from the first State_Type - // if the State_Type is ::Interval, this will get t^{n+1/2} instead of t^n - Real cur_time = state[0].curTime(); - - if (level == 0 && ParallelDescriptor::IOProcessor()) - { - // - // The first thing we write out is the plotfile type. 
- // - os << thePlotFileType() << '\n'; - - if (n_data_items == 0) - amrex::Error("Must specify at least one valid data item to plot"); - - os << n_data_items << '\n'; - - // - // Names of variables - // - for (i =0; i < static_cast(plot_var_map.size()); i++) - { - int typ = plot_var_map[i].first; - int comp = plot_var_map[i].second; - os << desc_lst[typ].name(comp) << '\n'; - } - - os << BL_SPACEDIM << '\n'; - os << parent->cumTime() << '\n'; - int f_lev = parent->finestLevel(); - os << f_lev << '\n'; - for (i = 0; i < BL_SPACEDIM; i++) - os << Geom().ProbLo(i) << ' '; - os << '\n'; - for (i = 0; i < BL_SPACEDIM; i++) - os << Geom().ProbHi(i) << ' '; - os << '\n'; - for (i = 0; i < f_lev; i++) - os << parent->refRatio(i)[0] << ' '; - os << '\n'; - for (i = 0; i <= f_lev; i++) - os << parent->Geom(i).Domain() << ' '; - os << '\n'; - for (i = 0; i <= f_lev; i++) - os << parent->levelSteps(i) << ' '; - os << '\n'; - for (i = 0; i <= f_lev; i++) - { - for (int k = 0; k < BL_SPACEDIM; k++) - os << parent->Geom(i).CellSize()[k] << ' '; - os << '\n'; - } - os << (int) Geom().Coord() << '\n'; - os << "0\n"; // Write bndry data. - - } - // Build the directory to hold the MultiFab at this level. - // The name is relative to the directory containing the Header file. - // - static const std::string BaseName = "/Cell"; - char buf[64]; - sprintf(buf, "Level_%d", level); - std::string sLevel = buf; - // - // Now for the full pathname of that directory. - // - std::string FullPath = dir; - if (!FullPath.empty() && FullPath[FullPath.size()-1] != '/') - FullPath += '/'; - FullPath += sLevel; - // - // Only the I/O processor makes the directory if it doesn't already exist. - // - if (ParallelDescriptor::IOProcessor()) - if (!amrex::UtilCreateDirectory(FullPath, 0755)) - amrex::CreateDirectoryFailed(FullPath); - // - // Force other processors to wait till directory is built. - // - ParallelDescriptor::Barrier(); - - if (ParallelDescriptor::IOProcessor()) - { - os << level << ' ' << grids.size() << ' ' << cur_time << '\n'; - os << parent->levelSteps(level) << '\n'; - - for (i = 0; i < grids.size(); ++i) - { - RealBox gridloc = RealBox(grids[i],geom.CellSize(),geom.ProbLo()); - for (n = 0; n < BL_SPACEDIM; n++) - os << gridloc.lo(n) << ' ' << gridloc.hi(n) << '\n'; - } - // - // The full relative pathname of the MultiFabs at this level. - // The name is relative to the Header file containing this name. - // It's the name that gets written into the Header. - // - if (n_data_items > 0) - { - std::string PathNameInHeader = sLevel; - PathNameInHeader += BaseName; - os << PathNameInHeader << '\n'; - } - } - // - // We combine all of the multifabs -- state, derived, etc -- into one - // multifab -- plotMF. - // NOTE: In this tutorial code, there is no derived data - int cnt = 0; - const int nGrow = 0; - MultiFab plotMF(grids,dmap,n_data_items,nGrow,MFInfo(),Factory()); - MultiFab* this_dat = 0; - // - // Cull data from state variables -- use no ghost cells. - // - for (i = 0; i < static_cast(plot_var_map.size()); i++) - { - int typ = plot_var_map[i].first; - int comp = plot_var_map[i].second; - this_dat = &state[typ].newData(); - MultiFab::Copy(plotMF,*this_dat,comp,cnt,1,nGrow); - cnt++; - } - - // - // Use the Full pathname when naming the MultiFab. 
- // - std::string TheFullPath = FullPath; - TheFullPath += BaseName; - VisMF::Write(plotMF,TheFullPath,how,true); -} - - -void -AmrLevel::restart (Amr& papa, - std::istream& is, - bool bReadSpecial) -{ - BL_PROFILE("AmrLevel::restart()"); - parent = &papa; - - is >> level; - is >> geom; - - fine_ratio = IntVect::TheUnitVector(); fine_ratio.scale(-1); - crse_ratio = IntVect::TheUnitVector(); crse_ratio.scale(-1); - - if (level > 0) - { - crse_ratio = parent->refRatio(level-1); - } - if (level < parent->maxLevel()) - { - fine_ratio = parent->refRatio(level); - } - - if (bReadSpecial) - { - amrex::readBoxArray(grids, is, bReadSpecial); - } - else - { - grids.readFrom(is); - } - - int nstate; - is >> nstate; - int ndesc = desc_lst.size(); - - Vector state_in_checkpoint(ndesc, 1); - if (ndesc > nstate) { - set_state_in_checkpoint(state_in_checkpoint); - } else { - BL_ASSERT(nstate == ndesc); - } - - dmap.define(grids); - - parent->SetBoxArray(level, grids); - parent->SetDistributionMap(level, dmap); - -#ifdef AMREX_USE_EB - m_factory.reset(new EBFArrayBoxFactory(geom, grids, dmap, - {m_eb_basic_grow_cells, m_eb_volume_grow_cells, m_eb_full_grow_cells}, - m_eb_support_level)); -#else - m_factory.reset(new FArrayBoxFactory()); -#endif - - state.resize(ndesc); - for (int i = 0; i < ndesc; ++i) - { - if (state_in_checkpoint[i]) { - state[i].restart(is, geom.Domain(), grids, dmap, *m_factory, - desc_lst[i], papa.theRestartFile()); - } - } - - if (parent->useFixedCoarseGrids()) constructAreaNotToTag(); - - post_step_regrid = 0; - - finishConstructor(); -} - -void -AmrLevel::set_state_in_checkpoint (Vector& state_in_checkpoint) -{ - amrex::Error("Class derived AmrLevel has to handle this!"); -} - -void -AmrLevel::finishConstructor () {} - -void -AmrLevel::setTimeLevel (Real time, - Real dt_old, - Real dt_new) -{ - for (int k = 0; k < desc_lst.size(); k++) - { - state[k].setTimeLevel(time,dt_old,dt_new); - } -} - -bool -AmrLevel::isStateVariable (const std::string& name, - int& typ, - int& n) -{ - for (typ = 0; typ < desc_lst.size(); typ++) - { - const StateDescriptor& desc = desc_lst[typ]; - - for (n = 0; n < desc.nComp(); n++) - { - if (desc.name(n) == name) - return true; - } - } - return false; -} - -long -AmrLevel::countCells () const -{ - const int N = grids.size(); - - long cnt = 0; - -#ifdef _OPENMP -#pragma omp parallel for reduction(+:cnt) -#endif - for (int i = 0; i < N; i++) - { - cnt += grids[i].numPts(); - } - - return cnt; -} - -void -AmrLevel::checkPoint (const std::string& dir, - std::ostream& os, - VisMF::How how, - bool dump_old) -{ - BL_PROFILE("AmrLevel::checkPoint()"); - int ndesc = desc_lst.size(), i; - // - // Build directory to hold the MultiFabs in the StateData at this level. - // The directory is relative the the directory containing the Header file. - // - std::string LevelDir, FullPath; - LevelDirectoryNames(dir, LevelDir, FullPath); - if( ! levelDirectoryCreated) { - CreateLevelDirectory(dir); - // ---- Force other processors to wait until directory is built. - ParallelDescriptor::Barrier("AmrLevel::checkPoint::dir"); - } - - if (ParallelDescriptor::IOProcessor()) - { - os << level << '\n' << geom << '\n'; - grids.writeOn(os); - os << ndesc << '\n'; - } - // - // Output state data. - // - - for (i = 0; i < ndesc; i++) - { - // - // Now build the full relative pathname of the StateData. - // The name is relative to the Header file containing this name. - // It's the name that gets written into the Header. 
- // - std::string PathNameInHdr = amrex::Concatenate(LevelDir + "/SD_", i, 1); - std::string FullPathName = amrex::Concatenate(FullPath + "/SD_", i, 1); - - state[i].checkPoint(PathNameInHdr, FullPathName, os, how, dump_old); - } - - levelDirectoryCreated = false; // ---- now that the checkpoint is finished -} - -AmrLevel::~AmrLevel () -{ - parent = 0; -} - -void -AmrLevel::allocOldData () -{ - for (int i = 0; i < desc_lst.size(); i++) - { - state[i].allocOldData(); - } -} - -void -AmrLevel::removeOldData () -{ - for (int i = 0; i < desc_lst.size(); i++) - { - state[i].removeOldData(); - } -} - -void -AmrLevel::reset () -{ - for (int i = 0; i < desc_lst.size(); i++) - { - state[i].reset(); - } -} - -MultiFab& -AmrLevel::get_data (int state_indx, - Real time) -{ - const Real old_time = state[state_indx].prevTime(); - const Real new_time = state[state_indx].curTime(); - const Real eps = 0.001*(new_time - old_time); - - if (time > old_time-eps && time < old_time+eps) - { - return get_old_data(state_indx); - } - else if (time > new_time-eps && time < new_time+eps) - { - return get_new_data(state_indx); - } - - amrex::Error("get_data: invalid time"); - static MultiFab bogus; - return bogus; -} - -const BoxArray& -AmrLevel::getEdgeBoxArray (int dir) const -{ - BL_ASSERT(dir >=0 && dir < BL_SPACEDIM); - if (edge_grids[dir].empty()) { - edge_grids[dir] = grids; - edge_grids[dir].surroundingNodes(dir); - } - return edge_grids[dir]; -} - -const BoxArray& -AmrLevel::getNodalBoxArray () const -{ - if (nodal_grids.empty()) { - nodal_grids = grids; - nodal_grids.surroundingNodes(); - } - return nodal_grids; -} - -void -AmrLevel::setPhysBoundaryValues (FArrayBox& dest, - int state_indx, - Real time, - int dest_comp, - int src_comp, - int num_comp) -{ - state[state_indx].FillBoundary(dest,time,geom.CellSize(), - geom.ProbDomain(),dest_comp,src_comp,num_comp); -} - -FillPatchIteratorHelper::FillPatchIteratorHelper (AmrLevel& amrlevel, - MultiFab& leveldata) - : - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_mfid(m_amrlevel.level+1) -{} - -FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, - MultiFab& leveldata) - : - MFIter(leveldata), - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_ncomp(0) -{} - -FillPatchIteratorHelper::FillPatchIteratorHelper (AmrLevel& amrlevel, - MultiFab& leveldata, - int boxGrow, - Real time, - int index, - int scomp, - int ncomp, - Interpolater* mapper) - : - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_mfid(m_amrlevel.level+1), - m_time(time), - m_growsize(boxGrow), - m_index(index), - m_scomp(scomp), - m_ncomp(ncomp) -{ - Initialize(boxGrow,time,index,scomp,ncomp,mapper); -} - -FillPatchIterator::FillPatchIterator (AmrLevel& amrlevel, - MultiFab& leveldata, - int boxGrow, - Real time, - int idx, - int scomp, - int ncomp) - : - MFIter(leveldata), - m_amrlevel(amrlevel), - m_leveldata(leveldata), - m_ncomp(ncomp) -{ - BL_ASSERT(scomp >= 0); - BL_ASSERT(ncomp >= 1); - BL_ASSERT(AmrLevel::desc_lst[idx].inRange(scomp,ncomp)); - BL_ASSERT(0 <= idx && idx < AmrLevel::desc_lst.size()); - - Initialize(boxGrow,time,idx,scomp,ncomp); - -#ifdef BL_USE_TEAM - ParallelDescriptor::MyTeam().MemoryBarrier(); -#endif -} - -static -bool -NeedToTouchUpPhysCorners (const Geometry& geom) -{ - return geom.isAnyPeriodic() && !geom.isAllPeriodic(); -} - -void -FillPatchIteratorHelper::Initialize (int boxGrow, - Real time, - int idx, - int scomp, - int ncomp, - Interpolater* mapper) -{ - BL_PROFILE("FillPatchIteratorHelper::Initialize()"); - - BL_ASSERT(mapper); - 
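
The time matching in get_data above uses a tolerance of one one-thousandth of the current step. A small self-contained illustration of that test with concrete numbers (the values are made up):

    // Sketch of the tolerance test used by get_data (illustrative values).
    #include <cassert>

    int main ()
    {
        double old_time = 1.0, new_time = 1.1;
        double eps = 0.001*(new_time - old_time);    // 1.0e-4
        double time = 1.09995;                       // within eps of new_time
        assert(!(time > old_time-eps && time < old_time+eps));
        assert(  time > new_time-eps && time < new_time+eps );
        return 0;
    }
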
-    BL_ASSERT(scomp >= 0);
-    BL_ASSERT(ncomp >= 1);
-    BL_ASSERT(AmrLevel::desc_lst[idx].inRange(scomp,ncomp));
-    BL_ASSERT(0 <= idx && idx < AmrLevel::desc_lst.size());
-
-    m_map = mapper;
-    m_time = time;
-    m_growsize = boxGrow;
-    m_index = idx;
-    m_scomp = scomp;
-    m_ncomp = ncomp;
-    m_FixUpCorners = NeedToTouchUpPhysCorners(m_amrlevel.geom);
-
-    const int MyProc = ParallelDescriptor::MyProc();
-    auto& amrLevels = m_amrlevel.parent->getAmrLevels();
-    const AmrLevel& topLevel = *amrLevels[m_amrlevel.level];
-    const Box& topPDomain = topLevel.state[m_index].getDomain();
-    const IndexType& boxType = m_leveldata.boxArray().ixType();
-    const bool extrap = AmrLevel::desc_lst[m_index].extrap();
-    //
-    // Check that the interpolaters are identical.
-    //
-    BL_ASSERT(AmrLevel::desc_lst[m_index].identicalInterps(scomp,ncomp));
-
-    for (int l = 0; l <= m_amrlevel.level; ++l)
-    {
-        amrLevels[l]->state[m_index].RegisterData(m_mfcd, m_mfid[l]);
-    }
-    for (int i = 0, N = m_leveldata.boxArray().size(); i < N; ++i)
-    {
-        //
-        // A couple typedefs we'll use in the next code segment.
-        //
-        typedef std::map<int,Vector<Vector<Box> > >::value_type IntAABoxMapValType;
-
-        typedef std::map<int,Vector<Vector<Vector<FillBoxId> > > >::value_type IntAAAFBIDMapValType;
-
-        if (m_leveldata.DistributionMap()[i] != MyProc) continue;
-        //
-        // Insert with a hint since the indices are ordered lowest to highest.
-        //
-        IntAAAFBIDMapValType v1(i,Vector<Vector<Vector<FillBoxId> > >());
-
-        m_fbid.insert(m_fbid.end(),v1)->second.resize(m_amrlevel.level+1);
-
-        IntAABoxMapValType v2(i,Vector<Vector<Box> >());
-
-        m_fbox.insert(m_fbox.end(),v2)->second.resize(m_amrlevel.level+1);
-        m_cbox.insert(m_cbox.end(),v2)->second.resize(m_amrlevel.level+1);
-
-        m_ba.insert(m_ba.end(),std::map<int,Box>::value_type(i,amrex::grow(m_leveldata.boxArray()[i],m_growsize)));
-    }
-
-    BoxList tempUnfillable(boxType);
-    BoxList unfillableThisLevel(boxType);
-    Vector<Box> unfilledThisLevel;
-    Vector<Box> crse_boxes;
-    Vector<IntVect> pshifts(27);
-
-    for (std::map<int,Box>::const_iterator it = m_ba.begin(), End = m_ba.end();
-         it != End;
-         ++it)
-    {
-        const int bxidx = it->first;
-        const Box& box = it->second;
-
-        unfilledThisLevel.clear();
-        unfilledThisLevel.push_back(box);
-
-        if (!topPDomain.contains(box))
-        {
-            unfilledThisLevel.back() &= topPDomain;
-
-            if (topLevel.geom.isAnyPeriodic())
-            {
-                //
-                // May need to add additional unique pieces of valid region
-                // in order to do periodic copies into ghost cells.
-                //
-                topLevel.geom.periodicShift(topPDomain,box,pshifts);
-
-                for (const auto& iv : pshifts)
-                {
-                    Box shbox = box + iv;
-                    shbox &= topPDomain;
-
-                    if (boxType.nodeCentered())
-                    {
-                        for (int dir = 0; dir < BL_SPACEDIM; dir++)
-                        {
-                            if (iv[dir] > 0)
-                            {
-                                shbox.growHi(dir,-1);
-                            }
-                            else if (iv[dir] < 0)
-                            {
-                                shbox.growLo(dir,-1);
-                            }
-                        }
-                    }
-
-                    if (shbox.ok())
-                    {
-                        BoxList bl = amrex::boxDiff(shbox,box);
-
-                        unfilledThisLevel.insert(unfilledThisLevel.end(), bl.begin(), bl.end());
-                    }
-                }
-            }
-        }
-
-        // cells outside physical boundaries are not included in unfilledThisLevel
-
-        bool Done = false;
-
-        Vector< Vector<Box> >& TheCrseBoxes = m_cbox[bxidx];
-        Vector< Vector<Box> >& TheFineBoxes = m_fbox[bxidx];
-        Vector< Vector< Vector<FillBoxId> > >& TheFBIDs = m_fbid[bxidx];
-
-        for (int l = m_amrlevel.level; l >= 0 && !Done; --l)
-        {
-            unfillableThisLevel.clear();
-
-            AmrLevel& theAmrLevel = *amrLevels[l];
-            StateData& theState = theAmrLevel.state[m_index];
-            const Box& thePDomain = theState.getDomain();
-            const Geometry& theGeom = theAmrLevel.geom;
-            const bool is_periodic = theGeom.isAnyPeriodic();
-            const IntVect& fine_ratio = theAmrLevel.fine_ratio;
-            Vector<Box>& FineBoxes = TheFineBoxes[l];
-            //
-            // These are the boxes on this level contained in thePDomain
-            // that need to be filled in order to directly fill at the
-            // highest level or to interpolate up to the next higher level.
-            //
-            FineBoxes = unfilledThisLevel;
-            //
-            // Now build coarse boxes needed to interpolate to fine.
-            //
-            // If we're periodic and we're not at the finest level, we may
-            // need to get some additional data at this level in order to
-            // properly fill the CoarseBox()d versions of the fineboxes.
-            //
-            crse_boxes.clear();
-
-            for (const auto& fbx : FineBoxes)
-            {
-                crse_boxes.push_back(fbx);
-
-                if (l != m_amrlevel.level)
-                {
-                    const Box& cbox = m_map->CoarseBox(fbx,fine_ratio);
-
-                    crse_boxes.back() = cbox;
-
-                    if (is_periodic && !thePDomain.contains(cbox))
-                    {
-                        theGeom.periodicShift(thePDomain,cbox,pshifts);
-
-                        for (const auto& iv : pshifts)
-                        {
-                            Box shbox = cbox + iv;
-                            shbox &= thePDomain;
-
-                            if (boxType.nodeCentered())
-                            {
-                                for (int dir = 0; dir < BL_SPACEDIM; dir++)
-                                {
-                                    if (iv[dir] > 0)
-                                    {
-                                        shbox.growHi(dir,-1);
-                                    }
-                                    else if (iv[dir] < 0)
-                                    {
-                                        shbox.growLo(dir,-1);
-                                    }
-                                }
-                            }
-
-                            if (shbox.ok())
-                            {
-                                crse_boxes.push_back(shbox);
-                            }
-                        }
-                    }
-                }
-            }
-
-            Vector< Vector<FillBoxId> >& FBIDs = TheFBIDs[l];
-            Vector<Box>& CrseBoxes = TheCrseBoxes[l];
-
-            FBIDs.resize(crse_boxes.size());
-            CrseBoxes.resize(crse_boxes.size());
-            //
-            // Now attempt to get as much coarse data as possible.
- // - for (int i = 0, M = CrseBoxes.size(); i < M; i++) - { - BL_ASSERT(tempUnfillable.isEmpty()); - - CrseBoxes[i] = crse_boxes[i]; - - BL_ASSERT(CrseBoxes[i].intersects(thePDomain)); - - theState.InterpAddBox(m_mfcd, - m_mfid[l], - &tempUnfillable, - FBIDs[i], - CrseBoxes[i], - m_time, - m_scomp, - 0, - m_ncomp, - extrap); - - unfillableThisLevel.catenate(tempUnfillable); - } - - unfillableThisLevel.intersect(thePDomain); - - if (unfillableThisLevel.isEmpty()) - { - Done = true; - } - else - { - unfilledThisLevel.clear(); - - unfilledThisLevel.insert(unfilledThisLevel.end(), - unfillableThisLevel.begin(), - unfillableThisLevel.end()); - } - } - } - - m_mfcd.CollectData(); -} - -void -FillPatchIterator::Initialize (int boxGrow, - Real time, - int idx, - int scomp, - int ncomp) -{ - BL_PROFILE("FillPatchIterator::Initialize"); - - BL_ASSERT(scomp >= 0); - BL_ASSERT(ncomp >= 1); - BL_ASSERT(0 <= idx && idx < AmrLevel::desc_lst.size()); - - const StateDescriptor& desc = AmrLevel::desc_lst[idx]; - - m_ncomp = ncomp; - m_range = desc.sameInterps(scomp,ncomp); - - m_fabs.define(m_leveldata.boxArray(),m_leveldata.DistributionMap(), - m_ncomp,boxGrow,MFInfo(),m_leveldata.Factory()); - - const IndexType& boxType = m_leveldata.boxArray().ixType(); - const int level = m_amrlevel.level; - - for (int i = 0, DComp = 0; i < static_cast(m_range.size()); i++) - { - const int SComp = m_range[i].first; - const int NComp = m_range[i].second; - - if (level == 0) - { - FillFromLevel0(time, idx, SComp, DComp, NComp); - } - else - { - if (level == 1 || - amrex::ProperlyNested(m_amrlevel.crse_ratio, - m_amrlevel.parent->blockingFactor(m_amrlevel.level), - boxGrow, boxType, desc.interp(SComp))) - { - FillFromTwoLevels(time, idx, SComp, DComp, NComp); - } else { - -#ifdef AMREX_USE_EB - amrex::Abort("Grids must be properly nested for EB"); -#endif - - static bool first = true; - if (first) { - first = false; - if (ParallelDescriptor::IOProcessor()) { - IntVect new_blocking_factor = m_amrlevel.parent->blockingFactor(m_amrlevel.level); - new_blocking_factor *= 2; - for (int j = 0; j < 10; ++j) { - if (amrex::ProperlyNested(m_amrlevel.crse_ratio, - new_blocking_factor, - boxGrow, boxType, desc.interp(SComp))) { - break; - } else { - new_blocking_factor *= 2; - } - } - std::cout << "WARNING: Grids are not properly nested. We might have to use\n" - << " two coarse levels to do fillpatch. Consider using\n"; - if (new_blocking_factor < IntVect{AMREX_D_DECL(128,128,128)}) { - std::cout << " amr.blocking_factor=" << new_blocking_factor; - } else { - std::cout << " larger amr.blocking_factor. "; - } - std::cout << std::endl; - } - } - - FillPatchIteratorHelper* fph = 0; - fph = new FillPatchIteratorHelper(m_amrlevel, - m_leveldata, - boxGrow, - time, - idx, - SComp, - NComp, - desc.interp(SComp)); - -#if defined(AMREX_CRSEGRNDOMP) || (!defined(AMREX_XSDK) && defined(CRSEGRNDOMP)) -#ifdef _OPENMP -#pragma omp parallel -#endif -#endif - for (MFIter mfi(m_fabs); mfi.isValid(); ++mfi) - { - fph->fill(m_fabs[mfi],DComp,mfi.index()); - } - - delete fph; - } - } - - DComp += NComp; - } - // - // Call hack to touch up fillPatched data. 
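
The warning branch above looks for a workable blocking factor by repeated doubling, up to ten times, and prints the first value that would make the grids properly nested. The same search in isolation, with the nesting test abstracted into a hypothetical predicate supplied by the caller:

    // Sketch of the doubling search above; 'nests' stands in for the
    // amrex::ProperlyNested call made in the real code.
    #include <functional>

    int suggestBlockingFactor (int bf, const std::function<bool(int)>& nests)
    {
        bf *= 2;
        for (int j = 0; j < 10 && !nests(bf); ++j) {
            bf *= 2;   // keep doubling until the nesting test passes
        }
        return bf;
    }
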
-    //
-    m_amrlevel.set_preferred_boundary_values(m_fabs,
-                                             idx,
-                                             scomp,
-                                             0,
-                                             ncomp,
-                                             time);
-}
-
-void
-FillPatchIterator::FillFromLevel0 (Real time, int idx, int scomp, int dcomp, int ncomp)
-{
-    BL_ASSERT(m_amrlevel.level == 0);
-
-    StateData& statedata = m_amrlevel.state[idx];
-
-    Vector<MultiFab*> smf;
-    Vector<Real> stime;
-    statedata.getData(smf,stime,time);
-
-    const Geometry& geom = m_amrlevel.geom;
-
-    StateDataPhysBCFunct physbcf(statedata,scomp,geom);
-
-    amrex::FillPatchSingleLevel (m_fabs, time, smf, stime, scomp, dcomp, ncomp, geom, physbcf);
-}
-
-void
-FillPatchIterator::FillFromTwoLevels (Real time, int idx, int scomp, int dcomp, int ncomp)
-{
-    int ilev_fine = m_amrlevel.level;
-    int ilev_crse = ilev_fine-1;
-
-    BL_ASSERT(ilev_crse >= 0);
-
-    AmrLevel& fine_level = m_amrlevel;
-    AmrLevel& crse_level = m_amrlevel.parent->getLevel(ilev_crse);
-
-    const Geometry& geom_fine = fine_level.geom;
-    const Geometry& geom_crse = crse_level.geom;
-
-    Vector<MultiFab*> smf_crse;
-    Vector<Real> stime_crse;
-    StateData& statedata_crse = crse_level.state[idx];
-    statedata_crse.getData(smf_crse,stime_crse,time);
-    StateDataPhysBCFunct physbcf_crse(statedata_crse,scomp,geom_crse);
-
-    Vector<MultiFab*> smf_fine;
-    Vector<Real> stime_fine;
-    StateData& statedata_fine = fine_level.state[idx];
-    statedata_fine.getData(smf_fine,stime_fine,time);
-    StateDataPhysBCFunct physbcf_fine(statedata_fine,scomp,geom_fine);
-
-    const StateDescriptor& desc = AmrLevel::desc_lst[idx];
-
-    amrex::FillPatchTwoLevels(m_fabs, time,
-                              smf_crse, stime_crse,
-                              smf_fine, stime_fine,
-                              scomp, dcomp, ncomp,
-                              geom_crse, geom_fine,
-                              physbcf_crse, physbcf_fine,
-                              crse_level.fineRatio(),
-                              desc.interp(scomp), desc.getBCs());
-}
-
-static
-bool
-HasPhysBndry (const Box& b,
-              const Box& dmn,
-              const Geometry& geom)
-{
-    for (int i = 0; i < BL_SPACEDIM; i++)
-    {
-        if (!geom.isPeriodic(i))
-        {
-            if (b.smallEnd(i) < dmn.smallEnd(i) || b.bigEnd(i) > dmn.bigEnd(i))
-            {
-                return true;
-            }
-        }
-    }
-
-    return false;
-}
-
-static
-void
-FixUpPhysCorners (FArrayBox& fab,
-                  AmrLevel& TheLevel,
-                  int state_indx,
-                  Real time,
-                  int scomp,
-                  int dcomp,
-                  int ncomp)
-{
-    StateData& TheState = TheLevel.get_state_data(state_indx);
-    const Geometry& TheGeom = TheLevel.Geom();
-    const Box& ProbDomain = TheState.getDomain();
-
-    if (!HasPhysBndry(fab.box(),ProbDomain,TheGeom)) return;
-
-    FArrayBox tmp;
-
-    Box GrownDomain = ProbDomain;
-
-    for (int dir = 0; dir < BL_SPACEDIM; dir++)
-    {
-        if (!TheGeom.isPeriodic(dir))
-        {
-            const int lo = ProbDomain.smallEnd(dir) - fab.box().smallEnd(dir);
-            const int hi = fab.box().bigEnd(dir) - ProbDomain.bigEnd(dir);
-            if (lo > 0) GrownDomain.growLo(dir,lo);
-            if (hi > 0) GrownDomain.growHi(dir,hi);
-        }
-    }
-
-    for (int dir = 0; dir < BL_SPACEDIM; dir++)
-    {
-        if (!TheGeom.isPeriodic(dir)) continue;
-
-        Box lo_slab = fab.box();
-        Box hi_slab = fab.box();
-        lo_slab.shift(dir, ProbDomain.length(dir));
-        hi_slab.shift(dir,-ProbDomain.length(dir));
-        lo_slab &= GrownDomain;
-        hi_slab &= GrownDomain;
-
-        if (lo_slab.ok())
-        {
-            lo_slab.shift(dir,-ProbDomain.length(dir));
-
-            BL_ASSERT(fab.box().contains(lo_slab));
-            BL_ASSERT(HasPhysBndry(lo_slab,ProbDomain,TheGeom));
-
-            tmp.resize(lo_slab,ncomp);
-            tmp.copy(fab,dcomp,0,ncomp);
-            tmp.shift(dir,ProbDomain.length(dir));
-            TheLevel.setPhysBoundaryValues(tmp,
-                                           state_indx,
-                                           time,
-                                           0,
-                                           scomp,
-                                           ncomp);
-            tmp.shift(dir,-ProbDomain.length(dir));
-            fab.copy(tmp,0,dcomp,ncomp);
-        }
-
-        if (hi_slab.ok())
-        {
-            hi_slab.shift(dir,ProbDomain.length(dir));
-
-            BL_ASSERT(fab.box().contains(hi_slab));
-
BL_ASSERT(HasPhysBndry(hi_slab,ProbDomain,TheGeom)); - - tmp.resize(hi_slab,ncomp); - tmp.copy(fab,dcomp,0,ncomp); - tmp.shift(dir,-ProbDomain.length(dir)); - TheLevel.setPhysBoundaryValues(tmp, - state_indx, - time, - 0, - scomp, - ncomp); - tmp.shift(dir,ProbDomain.length(dir)); - fab.copy(tmp,0,dcomp,ncomp); - } - } -} - -void -FillPatchIteratorHelper::fill (FArrayBox& fab, - int dcomp, - int idx) -{ - BL_PROFILE("FillPatchIteratorHelper::fill()"); - - BL_ASSERT(fab.box() == m_ba[idx]); - BL_ASSERT(fab.nComp() >= dcomp + m_ncomp); - - Vector< Vector > > cfab(m_amrlevel.level+1); - Vector< Vector >& TheCrseBoxes = m_cbox[idx]; - Vector< Vector >& TheFineBoxes = m_fbox[idx]; - Vector< Vector< Vector > >& TheFBIDs = m_fbid[idx]; - const bool extrap = AmrLevel::desc_lst[m_index].extrap(); - auto& amrLevels = m_amrlevel.parent->getAmrLevels(); - // - // Build all coarse fabs from which we'll interpolate and - // fill them with coarse data as best we can. - // - for (int l = 0; l <= m_amrlevel.level; l++) - { - StateData& TheState = amrLevels[l]->state[m_index]; - const Vector& CrseBoxes = TheCrseBoxes[l]; - auto& CrseFabs = cfab[l]; - const Vector< Vector >& FBIDs = TheFBIDs[l]; - const int NC = CrseBoxes.size(); - - CrseFabs.resize(NC); - - for (int i = 0; i < NC; i++) - { - BL_ASSERT(CrseBoxes[i].ok()); - CrseFabs[i].reset(new FArrayBox(CrseBoxes[i],m_ncomp)); - } - - for (int i = 0; i < NC; i++) - { - // - // Set to special value we'll later check - // to ensure we've filled the FABs at the coarse level. - // - TheState.InterpFillFab(m_mfcd, - m_mfid[l], - FBIDs[i], - *CrseFabs[i], - m_time, - 0, - 0, - m_ncomp, - extrap); - } - } - // - // Now work from the bottom up interpolating to next higher level. - // - for (int l = 0; l < m_amrlevel.level; l++) - { - auto& CrseFabs = cfab[l]; - AmrLevel& TheLevel = *amrLevels[l]; - StateData& TheState = TheLevel.state[m_index]; - const Box& ThePDomain = TheState.getDomain(); - const int NC = CrseFabs.size(); - - if (TheLevel.geom.isAnyPeriodic()) - { - // - // Fill CrseFabs with periodic data in preparation for interp(). - // - for (int i = 0; i < NC; i++) - { - FArrayBox& dstfab = *CrseFabs[i]; - - if (ThePDomain.contains(dstfab.box())) continue; - - Vector pshifts(27); - - TheLevel.geom.periodicShift(ThePDomain,dstfab.box(),pshifts); - - for (const auto& iv : pshifts) - { - Box fullsrcbox = dstfab.box() + iv; - fullsrcbox &= ThePDomain; - - for (int j = 0; j < NC; j++) - { - const FArrayBox& srcfab = *CrseFabs[j]; - const Box& srcbox = fullsrcbox & srcfab.box(); - - if (srcbox.ok()) - { - const Box& dstbox = srcbox - iv; - - dstfab.copy(srcfab,srcbox,0,dstbox,0,m_ncomp); - } - } - } - } - } - // - // Set non-periodic BCs in coarse data -- what we interpolate with. - // This MUST come after the periodic fill mumbo-jumbo. - for (int i = 0; i < NC; ++i) - { - if ( ! ThePDomain.contains(CrseFabs[i]->box())) - { - TheLevel.setPhysBoundaryValues(*CrseFabs[i], - m_index, - m_time, - 0, - m_scomp, - m_ncomp); - } - } - - if (m_FixUpCorners) - { - for (int i = 0; i < NC; ++i) - { - FixUpPhysCorners(*CrseFabs[i],TheLevel,m_index,m_time,m_scomp,0,m_ncomp); - } - } - // - // Interpolate up to next level. 
- // - AmrLevel& crseAmrLevel = *amrLevels[l]; - AmrLevel& fineAmrLevel = *amrLevels[l+1]; - const IntVect& fine_ratio = crseAmrLevel.fine_ratio; - const Vector& FineBoxes = TheFineBoxes[l]; - StateData& fState = fineAmrLevel.state[m_index]; - const Box& fDomain = fState.getDomain(); - auto& FinerCrseFabs = cfab[l+1]; - const Vector& theBCs = AmrLevel::desc_lst[m_index].getBCs(); - const int NF = FineBoxes.size(); - - for (int ifine = 0; ifine < NF; ++ifine) - { - Vector bcr(m_ncomp); - FArrayBox finefab(FineBoxes[ifine],m_ncomp); - FArrayBox crsefab(m_map->CoarseBox(finefab.box(),fine_ratio),m_ncomp); - // - // Fill crsefab from m_cbox via copy on intersect. - // - for (int j = 0; j < NC; j++) { - crsefab.copy(*CrseFabs[j]); - } - // - // Get boundary conditions for the fine patch. - // - amrex::setBC(finefab.box(), - fDomain, - m_scomp, - 0, - m_ncomp, - theBCs, - bcr); - // - // Interpolate up to fine patch. - // - m_map->interp(crsefab, - 0, - finefab, - 0, - m_ncomp, - finefab.box(), - fine_ratio, - crseAmrLevel.geom, - fineAmrLevel.geom, - bcr, - m_scomp, - m_index, RunOn::Cpu); - // - // Copy intersect finefab into next level m_cboxes. - // - for (int j = 0, K = FinerCrseFabs.size(); j < K; ++j) { - FinerCrseFabs[j]->copy(finefab); - } - } - - CrseFabs.clear(); - } - // - // Now for the finest level stuff. - // - StateData& FineState = m_amrlevel.state[m_index]; - const Box& FineDomain = FineState.getDomain(); - const Geometry& FineGeom = m_amrlevel.geom; - auto& FinestCrseFabs = cfab[m_amrlevel.level]; - // - // Copy intersect coarse into destination fab. - // - for (int i = 0, N = FinestCrseFabs.size(); i < N; ++i) { - fab.copy(*FinestCrseFabs[i],0,dcomp,m_ncomp); - } - - if (FineGeom.isAnyPeriodic() && !FineDomain.contains(fab.box())) - { - Vector pshifts(27); - - FineGeom.periodicShift(FineDomain,fab.box(),pshifts); - - for (int i = 0, N = FinestCrseFabs.size(); i < N; i++) - { - for (const auto& iv : pshifts) - { - fab.shift(iv); - - Box src_dst = FinestCrseFabs[i]->box() & fab.box(); - src_dst &= FineDomain; - - if (src_dst.ok()) - fab.copy(*FinestCrseFabs[i],src_dst,0,src_dst,dcomp,m_ncomp); - - fab.shift(-iv); - } - } - } - // - // No longer need coarse data at finest level. - // - FinestCrseFabs.clear(); - // - // Final set of non-periodic BCs. - // - if (! FineState.getDomain().contains(fab.box())) - { - m_amrlevel.setPhysBoundaryValues(fab, - m_index, - m_time, - dcomp, - m_scomp, - m_ncomp); - } - - if (m_FixUpCorners) - { - FixUpPhysCorners(fab,m_amrlevel,m_index,m_time,m_scomp,dcomp,m_ncomp); - } -} - -FillPatchIteratorHelper::~FillPatchIteratorHelper () {} - -FillPatchIterator::~FillPatchIterator () {} - -void -AmrLevel::FillCoarsePatch (MultiFab& mf, - int dcomp, - Real time, - int idx, - int scomp, - int ncomp, - int nghost) -{ - BL_PROFILE("AmrLevel::FillCoarsePatch()"); - - // - // Must fill this region on crse level and interpolate. 
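
FillCoarsePatch is what a freshly created fine level typically calls when it has no data of its own yet, e.g. from a derived level's init(). A hedged sketch of such a caller; MyLevel and State_Type are hypothetical application-side names, and the required time-level bookkeeping (setTimeLevel and friends) is omitted:

    // Hypothetical caller: a new fine level fills its state entirely by
    // interpolation from the next coarser level.
    void
    MyLevel::init ()
    {
        Real time = parent->cumTime();
        MultiFab& S_new = get_new_data(State_Type);
        FillCoarsePatch(S_new, 0, time, State_Type, 0, S_new.nComp());
    }
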
-    //
-    BL_ASSERT(level != 0);
-    BL_ASSERT(ncomp <= (mf.nComp()-dcomp));
-    BL_ASSERT(nghost <= mf.nGrow());
-    BL_ASSERT(0 <= idx && idx < desc_lst.size());
-
-    int DComp = dcomp;
-    const StateDescriptor& desc = desc_lst[idx];
-    const Box& pdomain = state[idx].getDomain();
-    const BoxArray& mf_BA = mf.boxArray();
-    const DistributionMapping& mf_DM = mf.DistributionMap();
-    AmrLevel& clev = parent->getLevel(level-1);
-    const Geometry& cgeom = clev.geom;
-
-    Box domain_g = pdomain;
-    for (int i = 0; i < BL_SPACEDIM; ++i) {
-        if (geom.isPeriodic(i)) {
-            domain_g.grow(i,nghost);
-        }
-    }
-
-    std::vector< std::pair<int,int> > ranges = desc.sameInterps(scomp,ncomp);
-
-    BL_ASSERT(desc.inRange(scomp, ncomp));
-
-    for (int i = 0; i < static_cast<int>(ranges.size()); i++)
-    {
-        const int SComp = ranges[i].first;
-        const int NComp = ranges[i].second;
-        Interpolater* mapper = desc.interp(SComp);
-
-        BoxArray crseBA(mf_BA.size());
-
-        for (int j = 0, N = crseBA.size(); j < N; ++j)
-        {
-            BL_ASSERT(mf_BA[j].ixType() == desc.getType());
-            const Box& bx = amrex::grow(mf_BA[j],nghost) & domain_g;
-            crseBA.set(j,mapper->CoarseBox(bx, crse_ratio));
-        }
-
-#ifdef AMREX_USE_EB
-        MultiFab crseMF(crseBA,mf_DM,NComp,0,MFInfo(),
-                        EBFArrayBoxFactory(cgeom, crseBA, mf_DM, {0,0,0}, EBSupport::basic));
-#else
-        MultiFab crseMF(crseBA,mf_DM,NComp,0);
-#endif
-
-        if ( level == 1
-             || amrex::ProperlyNested(crse_ratio, parent->blockingFactor(level),
-                                      nghost, mf_BA.ixType(), mapper) )
-        {
-            StateData& statedata = clev.state[idx];
-
-            Vector<MultiFab*> smf;
-            Vector<Real> stime;
-            statedata.getData(smf,stime,time);
-
-            StateDataPhysBCFunct physbcf(statedata,SComp,cgeom);
-
-            amrex::FillPatchSingleLevel(crseMF,time,smf,stime,SComp,0,NComp,cgeom,physbcf);
-        }
-        else
-        {
-            FillPatch(clev,crseMF,0,time,idx,SComp,NComp,0);
-        }
-
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
-        for (MFIter mfi(mf); mfi.isValid(); ++mfi)
-        {
-            const Box& dbx = amrex::grow(mfi.validbox(),nghost) & domain_g;
-
-            Vector<BCRec> bcr(ncomp);
-
-            amrex::setBC(dbx,pdomain,SComp,0,NComp,desc.getBCs(),bcr);
-
-            mapper->interp(crseMF[mfi],
-                           0,
-                           mf[mfi],
-                           DComp,
-                           NComp,
-                           dbx,
-                           crse_ratio,
-                           cgeom,
-                           geom,
-                           bcr,
-                           SComp,
-                           idx, RunOn::Cpu);
-        }
-
-        StateDataPhysBCFunct physbcf(state[idx],SComp,geom);
-        physbcf.FillBoundary(mf, DComp, NComp, time);
-
-        DComp += NComp;
-    }
-}
-
-std::unique_ptr<MultiFab>
-AmrLevel::derive (const std::string& name,
-                  Real time,
-                  int ngrow)
-{
-    BL_ASSERT(ngrow >= 0);
-
-    std::unique_ptr<MultiFab> mf;
-
-    int index, scomp, ncomp;
-
-    if (isStateVariable(name, index, scomp))
-    {
-        mf.reset(new MultiFab(state[index].boxArray(), dmap, 1, ngrow, MFInfo(), *m_factory));
-        FillPatch(*this,*mf,ngrow,time,index,scomp,1);
-    }
-    else if (const DeriveRec* rec = derive_lst.get(name))
-    {
-        rec->getRange(0, index, scomp, ncomp);
-
-        const BoxArray& srcBA = state[index].boxArray();
-
-        BoxArray dstBA(srcBA);
-        dstBA.convert(rec->deriveType());
-
-        int ngrow_src = ngrow;
-        {
-            Box bx0 = srcBA[0];
-            Box bx1 = rec->boxMap()(bx0);
-            int g = bx0.smallEnd(0) - bx1.smallEnd(0);
-            ngrow_src += g;
-        }
-
-        MultiFab srcMF(srcBA, dmap, rec->numState(), ngrow_src, MFInfo(), *m_factory);
-
-        for (int k = 0, dc = 0; k < rec->numRange(); k++, dc += ncomp)
-        {
-            rec->getRange(k, index, scomp, ncomp);
-            FillPatch(*this,srcMF,ngrow_src,time,index,scomp,ncomp,dc);
-        }
-
-        mf.reset(new MultiFab(dstBA, dmap, rec->numDerive(), ngrow, MFInfo(), *m_factory));
-
-#if defined(AMREX_CRSEGRNDOMP) || (!defined(AMREX_XSDK) && defined(CRSEGRNDOMP))
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
-        for (MFIter mfi(*mf,true); mfi.isValid(); ++mfi)
-        {
-            int grid_no = mfi.index();
-            Real* ddat = (*mf)[mfi].dataPtr();
-            const int* dlo = (*mf)[mfi].loVect();
-            const int* dhi = (*mf)[mfi].hiVect();
-            const Box& gtbx = mfi.growntilebox();
-            const int* lo = gtbx.loVect();
-            const int* hi = gtbx.hiVect();
-            int n_der = rec->numDerive();
-            Real* cdat = srcMF[mfi].dataPtr();
-            const int* clo = srcMF[mfi].loVect();
-            const int* chi = srcMF[mfi].hiVect();
-            int n_state = rec->numState();
-            const int* dom_lo = state[index].getDomain().loVect();
-            const int* dom_hi = state[index].getDomain().hiVect();
-            const Real* dx = geom.CellSize();
-            const int* bcr = rec->getBC();
-            const RealBox temp (gtbx,geom.CellSize(),geom.ProbLo());
-            const Real* xlo = temp.lo();
-            Real dt = parent->dtLevel(level);
-
-            if (rec->derFunc() != static_cast<DeriveFunc>(0)){
-                rec->derFunc()(ddat,ARLIM(dlo),ARLIM(dhi),&n_der,
-                               cdat,ARLIM(clo),ARLIM(chi),&n_state,
-                               lo,hi,dom_lo,dom_hi,dx,xlo,&time,&dt,bcr,
-                               &level,&grid_no);
-            } else if (rec->derFunc3D() != static_cast<DeriveFunc3D>(0)){
-                rec->derFunc3D()(ddat,ARLIM_3D(dlo),ARLIM_3D(dhi),&n_der,
-                                 cdat,ARLIM_3D(clo),ARLIM_3D(chi),&n_state,
-                                 ARLIM_3D(lo),ARLIM_3D(hi),
-                                 ARLIM_3D(dom_lo),ARLIM_3D(dom_hi),
-                                 ZFILL(dx),ZFILL(xlo),
-                                 &time,&dt,
-                                 AMREX_BCREC_3D(bcr),
-                                 &level,&grid_no);
-            } else {
-                amrex::Error("AmrLevel::derive: no function available");
-            }
-        }
-#else
-        for (MFIter mfi(srcMF); mfi.isValid(); ++mfi)
-        {
-            int grid_no = mfi.index();
-            const RealBox gridloc(grids[grid_no],geom.CellSize(),geom.ProbLo());
-            Real* ddat = (*mf)[mfi].dataPtr();
-            const int* dlo = (*mf)[mfi].loVect();
-            const int* dhi = (*mf)[mfi].hiVect();
-            int n_der = rec->numDerive();
-            Real* cdat = srcMF[mfi].dataPtr();
-            const int* clo = srcMF[mfi].loVect();
-            const int* chi = srcMF[mfi].hiVect();
-            int n_state = rec->numState();
-            const int* dom_lo = state[index].getDomain().loVect();
-            const int* dom_hi = state[index].getDomain().hiVect();
-            const Real* dx = geom.CellSize();
-            const int* bcr = rec->getBC();
-            const Real* xlo = gridloc.lo();
-            Real dt = parent->dtLevel(level);
-
-            if (rec->derFunc() != static_cast<DeriveFunc>(0)){
-                rec->derFunc()(ddat,ARLIM(dlo),ARLIM(dhi),&n_der,
-                               cdat,ARLIM(clo),ARLIM(chi),&n_state,
-                               dlo,dhi,dom_lo,dom_hi,dx,xlo,&time,&dt,bcr,
-                               &level,&grid_no);
-            } else if (rec->derFunc3D() != static_cast<DeriveFunc3D>(0)){
-                rec->derFunc3D()(ddat,ARLIM_3D(dlo),ARLIM_3D(dhi),&n_der,
-                                 cdat,ARLIM_3D(clo),ARLIM_3D(chi),&n_state,
-                                 ARLIM_3D(dlo),ARLIM_3D(dhi),
-                                 ARLIM_3D(dom_lo),ARLIM_3D(dom_hi),
-                                 ZFILL(dx),ZFILL(xlo),
-                                 &time,&dt,
-                                 AMREX_BCREC_3D(bcr),
-                                 &level,&grid_no);
-            } else {
-                amrex::Error("AmrLevel::derive: no function available");
-            }
-        }
-#endif
-    }
-    else
-    {
-        //
-        // If we got here, cannot derive given name.
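
Both dispatch branches above call a user-registered derive function; registration happens on the application side via the DeriveList. A hedged sketch of registering a one-component, cell-centered derived quantity; the routine dermagvel and the State_Type/Xvel indices are application-side assumptions, not names defined in this file:

    // Hypothetical registration: "magvel", 1 output component, computed
    // from 3 state components starting at Xvel, on the same box.
    derive_lst.add("magvel", IndexType::TheCellType(), 1,
                   dermagvel, DeriveRec::TheSameBox);
    derive_lst.addComponent("magvel", desc_lst, State_Type, Xvel, 3);
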
-        //
-        std::string msg("AmrLevel::derive(MultiFab*): unknown variable: ");
-        msg += name;
-        amrex::Error(msg.c_str());
-    }
-
-    return mf;
-}
-
-void
-AmrLevel::derive (const std::string& name,
-                  Real time,
-                  MultiFab& mf,
-                  int dcomp)
-{
-    BL_ASSERT(dcomp < mf.nComp());
-
-    const int ngrow = mf.nGrow();
-
-    int index, scomp, ncomp;
-
-    if (isStateVariable(name,index,scomp))
-    {
-        FillPatch(*this,mf,ngrow,time,index,scomp,1);
-    }
-    else if (const DeriveRec* rec = derive_lst.get(name))
-    {
-        rec->getRange(0,index,scomp,ncomp);
-
-        const BoxArray& srcBA = state[index].boxArray();
-
-        int ngrow_src = ngrow;
-        {
-            Box bx0 = srcBA[0];
-            Box bx1 = rec->boxMap()(bx0);
-            int g = bx0.smallEnd(0) - bx1.smallEnd(0);
-            ngrow_src += g;
-        }
-
-        MultiFab srcMF(srcBA,dmap,rec->numState(),ngrow_src, MFInfo(), *m_factory);
-
-        for (int k = 0, dc = 0; k < rec->numRange(); k++, dc += ncomp)
-        {
-            rec->getRange(k,index,scomp,ncomp);
-
-            FillPatch(*this,srcMF,ngrow_src,time,index,scomp,ncomp,dc);
-        }
-
-#if defined(AMREX_CRSEGRNDOMP) || (!defined(AMREX_XSDK) && defined(CRSEGRNDOMP))
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
-        for (MFIter mfi(mf,true); mfi.isValid(); ++mfi)
-        {
-            int idx = mfi.index();
-            Real* ddat = mf[mfi].dataPtr(dcomp);
-            const int* dlo = mf[mfi].loVect();
-            const int* dhi = mf[mfi].hiVect();
-            const Box& gtbx = mfi.growntilebox();
-            const int* lo = gtbx.loVect();
-            const int* hi = gtbx.hiVect();
-            int n_der = rec->numDerive();
-            Real* cdat = srcMF[mfi].dataPtr();
-            const int* clo = srcMF[mfi].loVect();
-            const int* chi = srcMF[mfi].hiVect();
-            int n_state = rec->numState();
-            const int* dom_lo = state[index].getDomain().loVect();
-            const int* dom_hi = state[index].getDomain().hiVect();
-            const Real* dx = geom.CellSize();
-            const int* bcr = rec->getBC();
-            const RealBox& temp = RealBox(gtbx,geom.CellSize(),geom.ProbLo());
-            const Real* xlo = temp.lo();
-            Real dt = parent->dtLevel(level);
-
-            if (rec->derFunc() != static_cast<DeriveFunc>(0)){
-                rec->derFunc()(ddat,ARLIM(dlo),ARLIM(dhi),&n_der,
-                               cdat,ARLIM(clo),ARLIM(chi),&n_state,
-                               lo,hi,dom_lo,dom_hi,dx,xlo,&time,&dt,bcr,
-                               &level,&idx);
-            } else if (rec->derFunc3D() != static_cast<DeriveFunc3D>(0)){
-                rec->derFunc3D()(ddat,ARLIM_3D(dlo),ARLIM_3D(dhi),&n_der,
-                                 cdat,ARLIM_3D(clo),ARLIM_3D(chi),&n_state,
-                                 ARLIM_3D(lo),ARLIM_3D(hi),
-                                 ARLIM_3D(dom_lo),ARLIM_3D(dom_hi),
-                                 ZFILL(dx),ZFILL(xlo),
-                                 &time,&dt,
-                                 AMREX_BCREC_3D(bcr),
-                                 &level,&idx);
-            } else {
-                amrex::Error("AmrLevel::derive: no function available");
-            }
-        }
-#else
-        for (MFIter mfi(srcMF); mfi.isValid(); ++mfi)
-        {
-            int idx = mfi.index();
-            Real* ddat = mf[mfi].dataPtr(dcomp);
-            const int* dlo = mf[mfi].loVect();
-            const int* dhi = mf[mfi].hiVect();
-            int n_der = rec->numDerive();
-            Real* cdat = srcMF[mfi].dataPtr();
-            const int* clo = srcMF[mfi].loVect();
-            const int* chi = srcMF[mfi].hiVect();
-            int n_state = rec->numState();
-            const int* dom_lo = state[index].getDomain().loVect();
-            const int* dom_hi = state[index].getDomain().hiVect();
-            const Real* dx = geom.CellSize();
-            const int* bcr = rec->getBC();
-            const RealBox& temp = RealBox(mf[mfi].box(),geom.CellSize(),geom.ProbLo());
-            const Real* xlo = temp.lo();
-            Real dt = parent->dtLevel(level);
-
-            if (rec->derFunc() != static_cast<DeriveFunc>(0)){
-                rec->derFunc()(ddat,ARLIM(dlo),ARLIM(dhi),&n_der,
-                               cdat,ARLIM(clo),ARLIM(chi),&n_state,
-                               dlo,dhi,dom_lo,dom_hi,dx,xlo,&time,&dt,bcr,
-                               &level,&idx);
-            } else if (rec->derFunc3D() != static_cast<DeriveFunc3D>(0)){
-                rec->derFunc3D()(ddat,ARLIM_3D(dlo),ARLIM_3D(dhi),&n_der,
-                                 cdat,ARLIM_3D(clo),ARLIM_3D(chi),&n_state,
-                                 ARLIM_3D(dlo),ARLIM_3D(dhi),
-                                 ARLIM_3D(dom_lo),ARLIM_3D(dom_hi),
-                                 ZFILL(dx),ZFILL(xlo),
-                                 &time,&dt,
-                                 AMREX_BCREC_3D(bcr),
-                                 &level,&idx);
-            } else {
-                amrex::Error("AmrLevel::derive: no function available");
-            }
-        }
-#endif
-    }
-    else
-    {
-        //
-        // If we got here, cannot derive given name.
-        //
-        std::string msg("AmrLevel::derive(MultiFab*): unknown variable: ");
-        msg += name;
-        amrex::Error(msg.c_str());
-    }
-}
-
-//! Update the distribution maps in StateData based on the size of the map
-void
-AmrLevel::UpdateDistributionMaps ( DistributionMapping& update_dmap )
-{
-    long mapsize = update_dmap.size();
-
-    if (dmap.size() == mapsize)
-    { dmap = update_dmap; }
-
-    for (int i = 0; i < state.size(); ++i)
-    {
-        if (state[i].DistributionMap().size() == mapsize)
-        { state[i].setDistributionMap(update_dmap); }
-    }
-}
-
-
-
-Vector<int>
-AmrLevel::getBCArray (int State_Type,
-                      int gridno,
-                      int strt_comp,
-                      int ncomp)
-{
-    Vector<int> bc(2*BL_SPACEDIM*ncomp);
-
-    BCRec bcr;
-
-    for (int n = 0; n < ncomp; n++)
-    {
-        bcr = state[State_Type].getBC(strt_comp+n,gridno);
-        const int* b_rec = bcr.vect();
-        for (int m = 0; m < 2*BL_SPACEDIM; m++)
-            bc[2*BL_SPACEDIM*n + m] = b_rec[m];
-    }
-
-    return bc;
-}
-
-int
-AmrLevel::okToRegrid ()
-{
-    return true;
-}
-
-void
-AmrLevel::setPlotVariables ()
-{
-    ParmParse pp("amr");
-
-    if (pp.contains("plot_vars"))
-    {
-        std::string nm;
-
-        int nPltVars = pp.countval("plot_vars");
-
-        for (int i = 0; i < nPltVars; i++)
-        {
-            pp.get("plot_vars", nm, i);
-
-            if (nm == "ALL")
-                parent->fillStatePlotVarList();
-            else if (nm == "NONE")
-                parent->clearStatePlotVarList();
-            else
-                parent->addStatePlotVar(nm);
-        }
-    }
-    else
-    {
-        //
-        // The default is to add them all.
-        //
-        parent->fillStatePlotVarList();
-    }
-
-    if (pp.contains("derive_plot_vars"))
-    {
-        std::string nm;
-
-        int nDrvPltVars = pp.countval("derive_plot_vars");
-
-        for (int i = 0; i < nDrvPltVars; i++)
-        {
-            pp.get("derive_plot_vars", nm, i);
-
-            if (nm == "ALL")
-                parent->fillDerivePlotVarList();
-            else if (nm == "NONE")
-                parent->clearDerivePlotVarList();
-            else
-                parent->addDerivePlotVar(nm);
-        }
-    }
-    else
-    {
-        //
-        // The default is to add none of them.
-        //
-        parent->clearDerivePlotVarList();
-    }
-}
-
-void
-AmrLevel::setSmallPlotVariables ()
-{
-    ParmParse pp("amr");
-
-    if (pp.contains("small_plot_vars"))
-    {
-        std::string nm;
-
-        int nPltVars = pp.countval("small_plot_vars");
-
-        for (int i = 0; i < nPltVars; i++)
-        {
-            pp.get("small_plot_vars", nm, i);
-
-            parent->addStateSmallPlotVar(nm);
-        }
-    }
-    else
-    {
-        //
-        // The default is to use none.
-        //
-        parent->clearStateSmallPlotVarList();
-    }
-
-    if (pp.contains("derive_small_plot_vars"))
-    {
-        std::string nm;
-
-        int nDrvPltVars = pp.countval("derive_small_plot_vars");
-
-        for (int i = 0; i < nDrvPltVars; i++)
-        {
-            pp.get("derive_small_plot_vars", nm, i);
-
-            if (nm == "ALL")
-                parent->fillDeriveSmallPlotVarList();
-            else if (nm == "NONE")
-                parent->clearDeriveSmallPlotVarList();
-            else
-                parent->addDeriveSmallPlotVar(nm);
-        }
-    }
-    else
-    {
-        //
-        // The default is to add none of them.
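
setPlotVariables and setSmallPlotVariables above are driven purely by ParmParse; the corresponding inputs-file lines look like the following (the variable names are illustrative, not prescribed):

    amr.plot_vars              = density xmom ymom   # explicit list, or ALL / NONE
    amr.derive_plot_vars       = NONE
    amr.small_plot_vars        = density
    amr.derive_small_plot_vars = ALL
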
-        //
-        parent->clearDeriveSmallPlotVarList();
-    }
-
-}
-
-AmrLevel::TimeLevel
-AmrLevel::which_time (int indx,
-                      Real time) const
-{
-    const Real oldtime = state[indx].prevTime();
-    const Real newtime = state[indx].curTime();
-    const Real haftime = .5 * (oldtime + newtime);
-    const Real qtime = oldtime + 0.25*(newtime-oldtime);
-    const Real tqtime = oldtime + 0.75*(newtime-oldtime);
-    const Real epsilon = 0.001 * (newtime - oldtime);
-
-    BL_ASSERT(time >= oldtime-epsilon && time <= newtime+epsilon);
-
-    if (time >= oldtime-epsilon && time <= oldtime+epsilon)
-    {
-        return AmrOldTime;
-    }
-    else if (time >= newtime-epsilon && time <= newtime+epsilon)
-    {
-        return AmrNewTime;
-    }
-    else if (time >= haftime-epsilon && time <= haftime+epsilon)
-    {
-        return AmrHalfTime;
-    }
-    else if (time >= qtime-epsilon && time <= qtime+epsilon)
-    {
-        return Amr1QtrTime;
-    }
-    else if (time >= tqtime-epsilon && time <= tqtime+epsilon)
-    {
-        return Amr3QtrTime;
-    }
-    return AmrOtherTime;
-}
-
-Real
-AmrLevel::estimateWork ()
-{
-    return 1.0*countCells();
-}
-
-bool
-AmrLevel::writePlotNow ()
-{
-    return false;
-}
-
-bool
-AmrLevel::writeSmallPlotNow ()
-{
-    return false;
-}
-
-const BoxArray& AmrLevel::getAreaNotToTag()
-{
-    return m_AreaNotToTag;
-}
-
-const Box& AmrLevel::getAreaToTag()
-{
-    return m_AreaToTag;
-}
-
-void AmrLevel::setAreaNotToTag(BoxArray& ba)
-{
-    m_AreaNotToTag = ba;
-}
-
-void AmrLevel::constructAreaNotToTag()
-{
-    if (level == 0 || !parent->useFixedCoarseGrids() || parent->useFixedUpToLevel()>level)
-        return;
-
-    // We are restricting the tagging on the finest fixed level
-    if (parent->useFixedUpToLevel()==level)
-    {
-        // We use the next coarser level shrunk by one blockingfactor
-        // as the region in which we allow tagging.
-        // Why level-1? Because we always use the full domain at level 0
-        // and therefore level 0 in initialba is level 1 in the AMR hierarchy, etc.
-        const Vector<BoxArray>& initialba = parent->getInitialBA();
-        Box tagarea(initialba[level-1].minimalBox());
-        tagarea.grow(-parent->blockingFactor(level));
-        m_AreaToTag = tagarea;
-
-        // We disallow tagging in the remaining part of the domain.
-        BoxArray tagba = amrex::boxComplement(parent->Geom(level).Domain(),m_AreaToTag);
-        m_AreaNotToTag = tagba;
-
-        BoxArray bxa(parent->Geom(level).Domain());
-        BL_ASSERT(bxa.contains(m_AreaNotToTag));
-    }
-
-    if (parent->useFixedUpToLevel()<level)
-    {
-        Box tagarea = parent->getLevel(level-1).getAreaToTag();
-        tagarea.refine(parent->refRatio(level-1));
-        tagarea.grow(-parent->blockingFactor(level));
-        m_AreaToTag = tagarea;
-        BoxArray tagba = amrex::boxComplement(parent->Geom(level).Domain(),m_AreaToTag);
-        m_AreaNotToTag = tagba;
-    }
-}
-
-void
-AmrLevel::FillPatch(AmrLevel& amrlevel,
-                    MultiFab& leveldata,
-                    int boxGrow,
-                    Real time,
-                    int index,
-                    int scomp,
-                    int ncomp,
-                    int dcomp)
-{
-    BL_ASSERT(dcomp+ncomp-1 <= leveldata.nComp());
-    BL_ASSERT(boxGrow <= leveldata.nGrow());
-    FillPatchIterator fpi(amrlevel, leveldata, boxGrow, time, index, scomp, ncomp);
-    const MultiFab& mf_fillpatched = fpi.get_mf();
-    MultiFab::Copy(leveldata, mf_fillpatched, 0, dcomp, ncomp, boxGrow);
-}
-
-void
-AmrLevel::LevelDirectoryNames(const std::string &dir,
-                              std::string &LevelDir,
-                              std::string &FullPath)
-{
-    LevelDir = amrex::Concatenate("Level_", level, 1);
-    //
-    // Now for the full pathname of that directory.
-    //
-    FullPath = dir;
-    if( !
FullPath.empty() && FullPath.back() != '/') { - FullPath += '/'; - } - FullPath += LevelDir; -} - -void -AmrLevel::CreateLevelDirectory (const std::string &dir) -{ - // Build directory to hold the MultiFabs in the StateData at this level. - // The directory is relative the the directory containing the Header file. - - std::string LevelDir, FullPath; - LevelDirectoryNames(dir, LevelDir, FullPath); - - if(ParallelDescriptor::IOProcessor()) { - if( ! amrex::UtilCreateDirectory(FullPath, 0755)) { - amrex::CreateDirectoryFailed(FullPath); - } - } - - levelDirectoryCreated = true; -} - -} - diff --git a/Src/AmrTask/Amr/AMReX_AmrTask.cpp b/Src/AmrTask/Amr/AMReX_AmrTask.cpp deleted file mode 100644 index b8c94e9c7fe..00000000000 --- a/Src/AmrTask/Amr/AMReX_AmrTask.cpp +++ /dev/null @@ -1,3056 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _OPENMP -#include -#endif - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef BL_LAZY -#include -#endif - -#ifdef AMREX_MEM_PROFILING -#include -#endif - -#ifdef BL_USE_ARRAYVIEW -#include -#endif - -namespace amrex { - -// -// Static class members. Set defaults in Initialize()!!! -// -std::list Amr::state_plot_vars; -std::list Amr::state_small_plot_vars; -std::list Amr::derive_plot_vars; -std::list Amr::derive_small_plot_vars; -bool Amr::first_plotfile; -bool Amr::first_smallplotfile; -Vector Amr::initial_ba; -Vector Amr::regrid_ba; - -namespace -{ - const std::string CheckPointVersion("CheckPointVersion_1.0"); - - bool initialized = false; -} - -namespace -{ - // - // These are all ParmParse'd in. Set defaults in Initialize()!!! - // - int plot_nfiles; - int mffile_nstreams; - int probinit_natonce; - bool plot_files_output; - int checkpoint_nfiles; - int regrid_on_restart; - int use_efficient_regrid; - int plotfile_on_restart; - int checkpoint_on_restart; - bool checkpoint_files_output; - int compute_new_dt_on_regrid; - bool precreateDirectories; - bool prereadFAHeaders; - VisMF::Header::Version plot_headerversion(VisMF::Header::Version_v1); - VisMF::Header::Version checkpoint_headerversion(VisMF::Header::Version_v1); - -} - -void -Amr::Initialize () -{ - if (initialized) return; - // - // Set all defaults here!!! 
- // - Amr::first_plotfile = true; - Amr::first_smallplotfile = true; - plot_nfiles = 64; - mffile_nstreams = 1; - probinit_natonce = 32; - plot_files_output = true; - checkpoint_nfiles = 64; - regrid_on_restart = 0; - use_efficient_regrid = 0; - plotfile_on_restart = 0; - checkpoint_on_restart = 0; - checkpoint_files_output = true; - compute_new_dt_on_regrid = 0; - precreateDirectories = true; - prereadFAHeaders = true; - plot_headerversion = VisMF::Header::Version_v1; - checkpoint_headerversion = VisMF::Header::Version_v1; - - amrex::ExecOnFinalize(Amr::Finalize); - - initialized = true; -} - -void -Amr::Finalize () -{ - Amr::state_plot_vars.clear(); - Amr::derive_plot_vars.clear(); - Amr::derive_small_plot_vars.clear(); - Amr::regrid_ba.clear(); - Amr::initial_ba.clear(); - - initialized = false; -} - -bool Amr::Plot_Files_Output () { return plot_files_output; } - -std::ostream& -Amr::DataLog (int i) -{ - return *datalog[i]; -} - -int -Amr::NumDataLogs () -{ - return datalog.size(); -} - -bool -Amr::RegridOnRestart () const -{ - return regrid_on_restart; -} - -void -Amr::setDtMin (const Vector& dt_min_in) -{ - for (int i = 0; i <= finest_level; i++) - dt_min[i] = dt_min_in[i]; -} - -Vector >& -Amr::getAmrLevels () -{ - return amr_level; -} - -long -Amr::cellCount (int lev) -{ - return amr_level[lev]->countCells(); -} - -int -Amr::numGrids (int lev) -{ - return amr_level[lev]->numGrids(); -} - -std::unique_ptr -Amr::derive (const std::string& name, - Real time, - int lev, - int ngrow) -{ - return amr_level[lev]->derive(name,time,ngrow); -} - -Amr::Amr () - : - AmrCore() -{ - Initialize(); - InitAmr(); -} - -Amr::Amr (const RealBox* rb, int max_level_in, const Vector& n_cell_in, int coord) - : - AmrCore(rb,max_level_in,n_cell_in,coord) -{ - Initialize(); - InitAmr(); -} - -void -Amr::InitAmr () -{ - BL_PROFILE("Amr::InitAmr()"); - // - // Determine physics class. - // - levelbld = getLevelBld(); - // - // Global function that define state variables. - // - levelbld->variableSetUp(); - // - // Set default values. - // - plot_int = -1; - small_plot_int = -1; - last_plotfile = 0; - last_smallplotfile = -1; - last_checkpoint = 0; - record_run_info = false; - record_grid_info = false; - file_name_digits = 5; - record_run_info_terse = false; - bUserStopRequest = false; - message_int = 10; - - for (int i = 0; i < BL_SPACEDIM; i++) - isPeriodic[i] = false; - - ParmParse pp("amr"); - // - // Check for command line flags. 
- // - pp.query("regrid_on_restart",regrid_on_restart); - pp.query("use_efficient_regrid",use_efficient_regrid); - pp.query("plotfile_on_restart",plotfile_on_restart); - pp.query("checkpoint_on_restart",checkpoint_on_restart); - - pp.query("compute_new_dt_on_regrid",compute_new_dt_on_regrid); - - pp.query("mffile_nstreams", mffile_nstreams); - pp.query("probinit_natonce", probinit_natonce); - - probinit_natonce = std::max(1, std::min(ParallelDescriptor::NProcs(), probinit_natonce)); - - pp.query("file_name_digits", file_name_digits); - - pp.query("initial_grid_file",initial_grids_file); - pp.query("regrid_file" , regrid_grids_file); - - pp.query("message_int", message_int); - - if (pp.contains("run_log")) - { - std::string log_file_name; - pp.get("run_log",log_file_name); - setRecordRunInfo(log_file_name); - } - if (pp.contains("run_log_terse")) - { - std::string log_file_name; - pp.get("run_log_terse",log_file_name); - setRecordRunInfoTerse(log_file_name); - } - if (pp.contains("grid_log")) - { - std::string grid_file_name; - pp.get("grid_log",grid_file_name); - setRecordGridInfo(grid_file_name); - } - - if (pp.contains("data_log")) - { - int num_datalogs = pp.countval("data_log"); - datalog.resize(num_datalogs); - datalogname.resize(num_datalogs); - pp.queryarr("data_log",datalogname,0,num_datalogs); - for (int i = 0; i < num_datalogs; i++) - setRecordDataInfo(i,datalogname[i]); - } - - probin_file = "probin"; // Make "probin" the default - - if (pp.contains("probin_file")) - { - pp.get("probin_file",probin_file); - } - // - // If set, then restart from checkpoint file. - // - pp.query("restart", restart_chkfile); - // - // If set, then restart from plotfile. - // - pp.query("restart_from_plotfile", restart_pltfile); - - int nlev = max_level+1; - dt_level.resize(nlev); - level_steps.resize(nlev); - level_count.resize(nlev); - n_cycle.resize(nlev); - dt_min.resize(nlev); - amr_level.resize(nlev); - // - // Set bogus values. - // - for (int i = 0; i < nlev; i++) - { - dt_level[i] = 1.e200; // Something nonzero so old & new will differ - level_steps[i] = 0; - level_count[i] = 0; - n_cycle[i] = 0; - dt_min[i] = 0.0; - } - - // Make the default regrid_int = 1 for all levels. - if (max_level > 0) - { - regrid_int.resize(max_level); - for (int i = 0; i < max_level; i++) - regrid_int[i] = 1; - } - - // - // Setup plot and checkpoint controls. - // - initPltAndChk(); - - // - // Setup subcycling controls. - // - initSubcycle(); - - // - // Read in the regrid interval if max_level > 0. - // - if (max_level > 0) - { - int numvals = pp.countval("regrid_int"); - if (numvals == 1) - { - // - // Set all values to the single available value. 
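
The numvals == 1 branch below broadcasts one regrid_int value to every level, so with max_level = 3 the two inputs forms are equivalent (values illustrative):

    amr.regrid_int = 2        # single value, applied to levels 0 through 2
    amr.regrid_int = 2 2 2    # the same thing, spelled out per level
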
-            //
-            int the_regrid_int = 0;
-            pp.query("regrid_int",the_regrid_int);
-            for (int i = 0; i < max_level; i++)
-            {
-                regrid_int[i] = the_regrid_int;
-            }
-        }
-        else if (numvals == 0)
-        {
-            amrex::Print(std::cerr) << "Using default regrid_int = 1 at all levels!\n";
-        }
-        else if (numvals < max_level)
-        {
-            amrex::Error("You did not specify enough values of regrid_int");
-        }
-        else
-        {
-            //
-            // Otherwise we expect a vector of max_level values
-            //
-            pp.queryarr("regrid_int",regrid_int,0,max_level);
-        }
-    }
-
-    if (max_level > 0 && !initial_grids_file.empty())
-    {
-#define STRIP while( is.get() != '\n' ) {}
-        std::ifstream is(initial_grids_file.c_str(),std::ios::in);
-
-        if (!is.good())
-            amrex::FileOpenFailed(initial_grids_file);
-
-        int in_finest,ngrid;
-
-        is >> in_finest;
-        STRIP;
-        initial_ba.resize(in_finest);
-
-        use_fixed_upto_level = in_finest;
-        if (in_finest > max_level)
-            amrex::Error("You have fewer levels in your inputs file than in your grids file!");
-
-        for (int lev = 1; lev <= in_finest; lev++)
-        {
-            BoxList bl;
-            is >> ngrid;
-            STRIP;
-            for (int i = 0; i < ngrid; i++)
-            {
-                Box bx;
-                is >> bx;
-                STRIP;
-                bx.refine(ref_ratio[lev-1]);
-                bl.push_back(bx);
-            }
-            initial_ba[lev-1].define(bl);
-        }
-        is.close();
-        amrex::Print() << "Read initial_ba. Size is " << initial_ba.size() << "\n";
-
-#undef STRIP
-    }
-
-    if (max_level > 0 && !regrid_grids_file.empty())
-    {
-#define STRIP while( is.get() != '\n' ) {}
-        std::ifstream is(regrid_grids_file.c_str(),std::ios::in);
-
-        if (!is.good())
-            amrex::FileOpenFailed(regrid_grids_file);
-
-        int in_finest,ngrid;
-
-        is >> in_finest;
-        STRIP;
-        regrid_ba.resize(in_finest);
-        for (int lev = 1; lev <= in_finest; lev++)
-        {
-            BoxList bl;
-            is >> ngrid;
-            STRIP;
-            for (int i = 0; i < ngrid; i++)
-            {
-                Box bx;
-                is >> bx;
-                STRIP;
-                bx.refine(ref_ratio[lev-1]);
-                for (int idim = 0 ; idim < BL_SPACEDIM; ++idim)
-                {
-                    if (bx.length(idim) > max_grid_size[lev][idim])
-                    {
-                        amrex::Print() << "Grid " << bx << " too large" << '\n';
-                        amrex::Error();
-                    }
-                }
-                bl.push_back(bx);
-            }
-            regrid_ba[lev-1].define(bl);
-        }
-        is.close();
-#undef STRIP
-    }
-
-    loadbalance_with_workestimates = 0;
-    pp.query("loadbalance_with_workestimates", loadbalance_with_workestimates);
-
-    loadbalance_level0_int = 2;
-    pp.query("loadbalance_level0_int", loadbalance_level0_int);
-
-    loadbalance_max_fac = 1.5;
-    pp.query("loadbalance_max_fac", loadbalance_max_fac);
-}
-
-bool
-Amr::isStatePlotVar (const std::string& name)
-{
-    for (std::list<std::string>::const_iterator li = state_plot_vars.begin(), End = state_plot_vars.end();
-         li != End;
-         ++li)
-    {
-        if (*li == name) {
-            return true;
-        }
-    }
-    return false;
-}
-
-bool
-Amr::isStateSmallPlotVar (const std::string& name)
-{
-    for (std::list<std::string>::const_iterator li = state_small_plot_vars.begin(), End = state_small_plot_vars.end();
-         li != End;
-         ++li)
-    {
-        if (*li == name)
-            return true;
-    }
-    return false;
-}
-
-void
-Amr::fillStatePlotVarList ()
-{
-    state_plot_vars.clear();
-    const DescriptorList &desc_lst = AmrLevel::get_desc_lst();
-    for (int typ(0); typ < desc_lst.size(); ++typ) {
-        for (int comp(0); comp < desc_lst[typ].nComp(); ++comp) {
-            if (desc_lst[typ].getType() == IndexType::TheCellType()) {
-                state_plot_vars.push_back(desc_lst[typ].name(comp));
-            }
-        }
-    }
-}
-
-void
-Amr::clearStatePlotVarList ()
-{
-    state_plot_vars.clear();
-}
-
-void
-Amr::clearStateSmallPlotVarList ()
-{
-    state_small_plot_vars.clear();
-}
-
-void
-Amr::addStatePlotVar (const std::string& name)
-{
-    if ( !
isStatePlotVar(name)) { - state_plot_vars.push_back(name); - } -} - -void -Amr::addStateSmallPlotVar (const std::string& name) -{ - if (!isStateSmallPlotVar(name)) - state_small_plot_vars.push_back(name); -} - -void -Amr::deleteStatePlotVar (const std::string& name) -{ - if (isStatePlotVar(name)) { - state_plot_vars.remove(name); - } -} - -bool -Amr::isDerivePlotVar (const std::string& name) -{ - for (std::list::const_iterator li = derive_plot_vars.begin(), End = derive_plot_vars.end(); - li != End; - ++li) - { - if (*li == name) { - return true; - } - } - - return false; -} - -bool -Amr::isDeriveSmallPlotVar (const std::string& name) -{ - for (std::list::const_iterator li = derive_small_plot_vars.begin(), End = derive_small_plot_vars.end(); - li != End; - ++li) - { - if (*li == name) { - return true; - } - } - - return false; -} - -void -Amr::fillDerivePlotVarList () -{ - derive_plot_vars.clear(); - DeriveList& derive_lst = AmrLevel::get_derive_lst(); - std::list& dlist = derive_lst.dlist(); - for (std::list::const_iterator it = dlist.begin(), End = dlist.end(); - it != End; - ++it) - { - if (it->deriveType() == IndexType::TheCellType()) - { - derive_plot_vars.push_back(it->name()); - } - } -} - -void -Amr::fillDeriveSmallPlotVarList () -{ - derive_small_plot_vars.clear(); - DeriveList& derive_lst = AmrLevel::get_derive_lst(); - std::list& dlist = derive_lst.dlist(); - for (std::list::const_iterator it = dlist.begin(), End = dlist.end(); - it != End; - ++it) - { - if (it->deriveType() == IndexType::TheCellType()) - { - derive_small_plot_vars.push_back(it->name()); - } - } -} - -void -Amr::clearDerivePlotVarList () -{ - derive_plot_vars.clear(); -} - -void -Amr::clearDeriveSmallPlotVarList () -{ - derive_small_plot_vars.clear(); -} - -void -Amr::addDerivePlotVar (const std::string& name) -{ - if (!isDerivePlotVar(name)) - derive_plot_vars.push_back(name); -} - -void -Amr::addDeriveSmallPlotVar (const std::string& name) -{ - if (!isDeriveSmallPlotVar(name)) - derive_small_plot_vars.push_back(name); -} - -void -Amr::deleteDerivePlotVar (const std::string& name) -{ - if (isDerivePlotVar(name)) - derive_plot_vars.remove(name); -} - -void -Amr::deleteDeriveSmallPlotVar (const std::string& name) -{ - if (isDeriveSmallPlotVar(name)) - derive_small_plot_vars.remove(name); -} - -Amr::~Amr () -{ - levelbld->variableCleanUp(); - - Amr::Finalize(); -} - -void -Amr::setRecordGridInfo (const std::string& filename) -{ - record_grid_info = true; - if (ParallelDescriptor::IOProcessor()) - { - gridlog.open(filename.c_str(),std::ios::out|std::ios::app); - if (!gridlog.good()) - amrex::FileOpenFailed(filename); - } - ParallelDescriptor::Barrier("Amr::setRecordGridInfo"); -} - -void -Amr::setRecordRunInfo (const std::string& filename) -{ - record_run_info = true; - if (ParallelDescriptor::IOProcessor()) - { - runlog.open(filename.c_str(),std::ios::out|std::ios::app); - if (!runlog.good()) - amrex::FileOpenFailed(filename); - } - ParallelDescriptor::Barrier("Amr::setRecordRunInfo"); -} - -void -Amr::setRecordRunInfoTerse (const std::string& filename) -{ - record_run_info_terse = true; - if (ParallelDescriptor::IOProcessor()) - { - runlog_terse.open(filename.c_str(),std::ios::out|std::ios::app); - if (!runlog_terse.good()) - amrex::FileOpenFailed(filename); - } - ParallelDescriptor::Barrier("Amr::setRecordRunInfoTerse"); -} - -void -Amr::setRecordDataInfo (int i, const std::string& filename) -{ - if (ParallelDescriptor::IOProcessor()) - { - datalog[i].reset(new std::fstream); - 
datalog[i]->open(filename.c_str(),std::ios::out|std::ios::app); - if (!datalog[i]->good()) - amrex::FileOpenFailed(filename); - } - ParallelDescriptor::Barrier("Amr::setRecordDataInfo"); -} - -void -Amr::setDtLevel (const Vector& dt_lev) -{ - for (int i = 0; i <= finest_level; i++) - dt_level[i] = dt_lev[i]; -} - -void -Amr::setDtLevel (Real dt, int lev) -{ - dt_level[lev] = dt; -} - -void -Amr::setNCycle (const Vector& ns) -{ - for (int i = 0; i <= finest_level; i++) - n_cycle[i] = ns[i]; -} - -long -Amr::cellCount () -{ - long cnt = 0; - for (int i = 0; i <= finest_level; i++) - cnt += amr_level[i]->countCells(); - return cnt; -} - -int -Amr::numGrids () -{ - int cnt = 0; - for (int i = 0; i <= finest_level; i++) - cnt += amr_level[i]->numGrids(); - return cnt; -} - -int -Amr::okToContinue () -{ - int ok = true; - for (int i = 0; ok && (i <= finest_level); i++) - ok = ok && amr_level[i]->okToContinue(); - if(bUserStopRequest) { - ok = false; - } - return ok; -} - -void -Amr::writePlotFile () -{ - if ( ! Plot_Files_Output()) { - return; - } - - BL_PROFILE_REGION_START("Amr::writePlotFile()"); - BL_PROFILE("Amr::writePlotFile()"); - - VisMF::SetNOutFiles(plot_nfiles); - VisMF::Header::Version currentVersion(VisMF::GetHeaderVersion()); - VisMF::SetHeaderVersion(plot_headerversion); - - if (first_plotfile) { - first_plotfile = false; - amr_level[0]->setPlotVariables(); - } - - Real dPlotFileTime0 = ParallelDescriptor::second(); - - const std::string& pltfile = amrex::Concatenate(plot_file_root,level_steps[0],file_name_digits); - - if (verbose > 0) { - amrex::Print() << "PLOTFILE: file = " << pltfile << '\n'; - } - - if (record_run_info && ParallelDescriptor::IOProcessor()) { - runlog << "PLOTFILE: file = " << pltfile << '\n'; - } - - amrex::StreamRetry sretry(pltfile, abort_on_stream_retry_failure, - stream_max_tries); - - const std::string pltfileTemp(pltfile + ".temp"); - - while(sretry.TryFileOutput()) { - // - // if either the pltfile or pltfileTemp exists, rename them - // to move them out of the way. then create pltfile - // with the temporary name, then rename it back when - // it is finished writing. then stream retry can rename - // it to a bad suffix if there were stream errors. - // - - if(precreateDirectories) { // ---- make all directories at once - amrex::UtilRenameDirectoryToOld(pltfile, false); // dont call barrier - if (verbose > 1) { - amrex::Print() << "IOIOIOIO: precreating directories for " << pltfileTemp << "\n"; - } - amrex::PreBuildDirectorHierarchy(pltfileTemp, "Level_", finest_level + 1, true); // call barrier - } else { - amrex::UtilRenameDirectoryToOld(pltfile, false); // dont call barrier - amrex::UtilCreateCleanDirectory(pltfileTemp, true); // call barrier - } - - std::string HeaderFileName(pltfileTemp + "/Header"); - - VisMF::IO_Buffer io_buffer(VisMF::GetIOBufferSize()); - - std::ofstream HeaderFile; - - HeaderFile.rdbuf()->pubsetbuf(io_buffer.dataPtr(), io_buffer.size()); - - int old_prec(0); - - if (ParallelDescriptor::IOProcessor()) { - // - // Only the IOProcessor() writes to the header file. - // - HeaderFile.open(HeaderFileName.c_str(), std::ios::out | std::ios::trunc | - std::ios::binary); - if ( ! HeaderFile.good()) { - amrex::FileOpenFailed(HeaderFileName); - } - old_prec = HeaderFile.precision(15); - } - - for (int k(0); k <= finest_level; ++k) { - amr_level[k]->writePlotFile(pltfileTemp, HeaderFile); - } - - if (ParallelDescriptor::IOProcessor()) { - HeaderFile.precision(old_prec); - if ( ! 
HeaderFile.good()) { - amrex::Error("Amr::writePlotFile() failed"); - } - } - - last_plotfile = level_steps[0]; - - if (verbose > 0) { - const int IOProc = ParallelDescriptor::IOProcessorNumber(); - Real dPlotFileTime = ParallelDescriptor::second() - dPlotFileTime0; - - ParallelDescriptor::ReduceRealMax(dPlotFileTime,IOProc); - - amrex::Print() << "Write plotfile time = " << dPlotFileTime << " seconds" << "\n\n"; - } - ParallelDescriptor::Barrier("Amr::writePlotFile::end"); - - if(ParallelDescriptor::IOProcessor()) { - std::rename(pltfileTemp.c_str(), pltfile.c_str()); - } - ParallelDescriptor::Barrier("Renaming temporary plotfile."); - // - // the plotfile file now has the regular name - // - - } // end while - - VisMF::SetHeaderVersion(currentVersion); - - BL_PROFILE_REGION_STOP("Amr::writePlotFile()"); -} - -void -Amr::writeSmallPlotFile () -{ - if ( ! Plot_Files_Output()) { - return; - } - - BL_PROFILE_REGION_START("Amr::writeSmallPlotFile()"); - BL_PROFILE("Amr::writeSmallPlotFile()"); - - VisMF::SetNOutFiles(plot_nfiles); - VisMF::Header::Version currentVersion(VisMF::GetHeaderVersion()); - VisMF::SetHeaderVersion(plot_headerversion); - - if (first_smallplotfile) { - first_smallplotfile = false; - amr_level[0]->setSmallPlotVariables(); - } - - // Don't continue if we have no variables to plot. - - if (stateSmallPlotVars().size() == 0) { - return; - } - - Real dPlotFileTime0 = ParallelDescriptor::second(); - - const std::string& pltfile = amrex::Concatenate(small_plot_file_root, - level_steps[0], - file_name_digits); - - if (verbose > 0) { - amrex::Print() << "SMALL PLOTFILE: file = " << pltfile << '\n'; - } - - if (record_run_info && ParallelDescriptor::IOProcessor()) { - runlog << "SMALL PLOTFILE: file = " << pltfile << '\n'; - } - - amrex::StreamRetry sretry(pltfile, abort_on_stream_retry_failure, - stream_max_tries); - - const std::string pltfileTemp(pltfile + ".temp"); - - while(sretry.TryFileOutput()) { - // - // if either the pltfile or pltfileTemp exists, rename them - // to move them out of the way. then create pltfile - // with the temporary name, then rename it back when - // it is finished writing. then stream retry can rename - // it to a bad suffix if there were stream errors. - // - if(precreateDirectories) { // ---- make all directories at once - amrex::UtilRenameDirectoryToOld(pltfile, false); // dont call barrier - amrex::UtilCreateCleanDirectory(pltfileTemp, false); // dont call barrier - for(int i(0); i <= finest_level; ++i) { - amr_level[i]->CreateLevelDirectory(pltfileTemp); - } - ParallelDescriptor::Barrier("Amr::precreate smallplotfile Directories"); - } else { - amrex::UtilRenameDirectoryToOld(pltfile, false); // dont call barrier - amrex::UtilCreateCleanDirectory(pltfileTemp, true); // call barrier - } - - - std::string HeaderFileName(pltfileTemp + "/Header"); - - VisMF::IO_Buffer io_buffer(VisMF::GetIOBufferSize()); - - std::ofstream HeaderFile; - - HeaderFile.rdbuf()->pubsetbuf(io_buffer.dataPtr(), io_buffer.size()); - - int old_prec(0); - - if (ParallelDescriptor::IOProcessor()) { - // - // Only the IOProcessor() writes to the header file. - // - HeaderFile.open(HeaderFileName.c_str(), std::ios::out | std::ios::trunc | - std::ios::binary); - if ( ! HeaderFile.good()) { - amrex::FileOpenFailed(HeaderFileName); - } - old_prec = HeaderFile.precision(15); - } - - for (int k(0); k <= finest_level; ++k) { - amr_level[k]->writeSmallPlotFile(pltfileTemp, HeaderFile); - } - - if (ParallelDescriptor::IOProcessor()) { - HeaderFile.precision(old_prec); - if ( ! 
HeaderFile.good()) { - amrex::Error("Amr::writeSmallPlotFile() failed"); - } - } - - last_smallplotfile = level_steps[0]; - - if (verbose > 0) { - const int IOProc = ParallelDescriptor::IOProcessorNumber(); - Real dPlotFileTime = ParallelDescriptor::second() - dPlotFileTime0; - - ParallelDescriptor::ReduceRealMax(dPlotFileTime,IOProc); - - amrex::Print() << "Write small plotfile time = " << dPlotFileTime << " seconds" << "\n\n"; - } - ParallelDescriptor::Barrier("Amr::writeSmallPlotFile::end"); - - if(ParallelDescriptor::IOProcessor()) { - std::rename(pltfileTemp.c_str(), pltfile.c_str()); - } - ParallelDescriptor::Barrier("Renaming temporary plotfile."); - // - // the plotfile file now has the regular name - // - - } // end while - - VisMF::SetHeaderVersion(currentVersion); - - BL_PROFILE_REGION_STOP("Amr::writeSmallPlotFile()"); -} - -void -Amr::checkInput () -{ - if (max_level < 0) - amrex::Error("checkInput: max_level not set"); - // - // Check that blocking_factor is a power of 2. - // - for (int i = 0; i < max_level; i++) - { - for (int idim = 0; idim < BL_SPACEDIM; ++idim) - { - int k = blocking_factor[i][idim]; - while ( k > 0 && (k%2 == 0) ) - k /= 2; - if (k != 1) - amrex::Error("Amr::checkInput: blocking_factor not power of 2"); - } - } - // - // Check level dependent values. - // - for (int i = 0; i < max_level; i++) - { - if (MaxRefRatio(i) < 2 || MaxRefRatio(i) > 12) - amrex::Error("Amr::checkInput: bad ref_ratios"); - } - const Box& domain = Geom(0).Domain(); - if (!domain.ok()) - amrex::Error("level 0 domain bad or not set"); - // - // Check that domain size is a multiple of blocking_factor[0]. - // - for (int i = 0; i < BL_SPACEDIM; i++) - { - int len = domain.length(i); - if (len%blocking_factor[0][i] != 0) - amrex::Error("domain size not divisible by blocking_factor"); - } - // - // Check that max_grid_size is even. - // - for (int i = 0; i < max_level; i++) - { - for (int idim = 0; idim < BL_SPACEDIM; ++idim) { - if (max_grid_size[i][idim]%2 != 0) { - amrex::Error("max_grid_size is not even"); - } - } - } - - // - // Check that max_grid_size is a multiple of blocking_factor at every level. - // - for (int i = 0; i < max_level; i++) - { - for (int idim = 0; idim < BL_SPACEDIM; ++idim) { - if (max_grid_size[i][idim]%blocking_factor[i][idim] != 0) { - amrex::Error("max_grid_size not divisible by blocking_factor"); - } - } - } - - if( ! Geom(0).ProbDomain().ok()) { - amrex::Error("Amr::checkInput: bad physical problem size"); - } - - if(verbose > 0) { - amrex::Print() << "Successfully read inputs file ... " << '\n'; - } -} - -void -Amr::init (Real strt_time, - Real stop_time) -{ - BL_PROFILE_REGION_START("Amr::init()"); - BL_PROFILE("Amr::init()"); - if( ! restart_chkfile.empty() && restart_chkfile != "init") - { - restart(restart_chkfile); - } - else - { - initialInit(strt_time,stop_time); - checkPoint(); - if(plot_int > 0 || plot_per > 0) { - writePlotFile(); - } - if (small_plot_int > 0 || small_plot_per > 0) - writeSmallPlotFile(); - } - -#ifdef BL_COMM_PROFILING - Vector probDomain(maxLevel()+1); - for(int i(0); i < probDomain.size(); ++i) { - probDomain[i] = Geom(i).Domain(); - } - BL_COMM_PROFILE_INITAMR(finest_level, max_level, ref_ratio, probDomain); -#endif - BL_PROFILE_REGION_STOP("Amr::init()"); -} - -void -Amr::readProbinFile (int& a_init) -{ - BL_PROFILE("Amr::readProbinFile()"); - // - // Populate integer array with name of probin file. 
- // - int probin_file_length = probin_file.length(); - - Vector<int> probin_file_name(probin_file_length); - - for (int i = 0; i < probin_file_length; i++) - probin_file_name[i] = probin_file[i]; - - if (verbose > 0) - amrex::Print() << "Starting to call amrex_probinit ... \n"; - - const int nAtOnce = probinit_natonce; - const int MyProc = ParallelDescriptor::MyProc(); - const int NProcs = ParallelDescriptor::NProcs(); - const int NSets = (NProcs + (nAtOnce - 1)) / nAtOnce; - const int MySet = MyProc/nAtOnce; - - Real piStart = 0, piEnd = 0, piStartAll = ParallelDescriptor::second(); - - for (int iSet = 0; iSet < NSets; ++iSet) - { - if (MySet == iSet) - { - // - // Call the pesky probin reader. - // - piStart = ParallelDescriptor::second(); - -#ifdef AMREX_DIMENSION_AGNOSTIC - - amrex_probinit(&a_init, - probin_file_name.dataPtr(), - &probin_file_length, - AMREX_ZFILL(Geom(0).ProbLo()), - AMREX_ZFILL(Geom(0).ProbHi())); - -#else - - amrex_probinit(&a_init, - probin_file_name.dataPtr(), - &probin_file_length, - Geom(0).ProbLo(), - Geom(0).ProbHi()); - -#endif - - piEnd = ParallelDescriptor::second(); - const int iBuff = 0; - const int wakeUpPID = (MyProc + nAtOnce); - const int tag = (MyProc % nAtOnce); - if (wakeUpPID < NProcs) - ParallelDescriptor::Send(&iBuff, 1, wakeUpPID, tag); - } - if (MySet == (iSet + 1)) - { - // - // Next set waits. - // - int iBuff; - int waitForPID = (MyProc - nAtOnce); - int tag = (MyProc % nAtOnce); - ParallelDescriptor::Recv(&iBuff, 1, waitForPID, tag); - } - } - - if (verbose > 1) - { - const int IOProc = ParallelDescriptor::IOProcessorNumber(); - Real piTotal = piEnd - piStart; - Real piTotalAll = ParallelDescriptor::second() - piStartAll; - - ParallelDescriptor::ReduceRealMax(piTotal, IOProc); - ParallelDescriptor::ReduceRealMax(piTotalAll, IOProc); - - amrex::Print() << "amrex_probinit max time = " << piTotal << '\n' - << "amrex_probinit total time = " << piTotalAll << '\n'; - } - - if (verbose > 0) - amrex::Print() << "Successfully run amrex_probinit\n"; -}
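The set-based loop in readProbinFile() above is a throttling idiom: only nAtOnce ranks touch the probin file at a time, and each rank that finishes wakes its counterpart in the next set. A minimal standalone sketch of the same idiom, assuming raw MPI and a hypothetical read_input() stand-in for the actual file read:

#include <mpi.h>

void read_input();  // hypothetical: whatever actually touches the shared file

// Only nAtOnce ranks read concurrently; rank p waits for rank p-nAtOnce
// to finish, reads, then wakes rank p+nAtOnce.
void throttled_read (int myproc, int nprocs, int nAtOnce)
{
    int token = 0;
    if (myproc >= nAtOnce) {
        // Wait until the matching rank in the previous set is done.
        MPI_Recv(&token, 1, MPI_INT, myproc - nAtOnce, myproc % nAtOnce,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    read_input();
    if (myproc + nAtOnce < nprocs) {
        // Wake the matching rank in the next set.
        MPI_Send(&token, 1, MPI_INT, myproc + nAtOnce, myproc % nAtOnce,
                 MPI_COMM_WORLD);
    }
}

The tag arithmetic (MyProc % nAtOnce) pairs each rank with its successor nAtOnce ranks away, so the wake-up messages of different chains never collide.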
- -void -Amr::initialInit (Real strt_time, - Real stop_time, - const BoxArray* lev0_grids, - const Vector<int>* pmap) -{ - BL_PROFILE("Amr::initialInit()"); - InitializeInit(strt_time, stop_time, lev0_grids, pmap); - - // This is a subtlety, but in the case where we are initializing the data - // from a plotfile, we want to use the time read in from the plotfile as - // the start time instead of using "strt_time". - // The Amr data "cumtime" has been set in InitializeInit; if we are restarting - // from a plotfile, then cumtime must be re-defined in that initialization routine. - // Thus here we pass "cumtime" rather than "strt_time" to FinalizeInit. - FinalizeInit (cumtime, stop_time); -} - -void -Amr::InitializeInit(Real strt_time, - Real stop_time, - const BoxArray* lev0_grids, - const Vector<int>* pmap) -{ - BL_PROFILE("Amr::InitializeInit()"); - BL_COMM_PROFILE_NAMETAG("Amr::InitializeInit TOP"); - if (check_input) checkInput(); - // - // Generate internal values from user-supplied values. - // - finest_level = 0; - // - // Init problem dependent data. - // - int linit = true; - - if (!probin_file.empty()) { - readProbinFile(linit); - } - - cumtime = strt_time; - // - // Define base level grids. Note that if we are restarting from a plotfile, this - // routine will call the level 0 AmrLevel initialization which will overwrite cumtime. 
- // - defBaseLevel(strt_time, lev0_grids, pmap); -} - -void -Amr::FinalizeInit (Real strt_time, - Real stop_time) -{ - BL_PROFILE("Amr::FinalizeInit()"); - // - // Compute dt and set time levels of all grid data. - // - amr_level[0]->computeInitialDt(finest_level, - sub_cycle, - n_cycle, - ref_ratio, - dt_level, - stop_time); - // - // The following was added for multifluid. - // - Real dt0 = dt_level[0]; - dt_min[0] = dt_level[0]; - n_cycle[0] = 1; - - for (int lev = 1; lev <= max_level; lev++) - { - dt0 /= n_cycle[lev]; - dt_level[lev] = dt0; - dt_min[lev] = dt_level[lev]; - } - - if (max_level > 0) - bldFineLevels(strt_time); - - for (int lev = 0; lev <= finest_level; lev++) - amr_level[lev]->setTimeLevel(strt_time,dt_level[lev],dt_level[lev]); - - for (int lev = 0; lev <= finest_level; lev++) - amr_level[lev]->post_regrid(0,finest_level); - - for (int lev = 0; lev <= finest_level; lev++) - { - level_steps[lev] = 0; - level_count[lev] = 0; - } - - // - // Perform any special post_initialization operations. - // - for(int lev(0); lev <= finest_level; ++lev) { - amr_level[lev]->post_init(stop_time); - } - - if (ParallelDescriptor::IOProcessor()) - { - if (verbose > 1) - { - std::cout << "INITIAL GRIDS \n"; - printGridInfo(std::cout,0,finest_level); - } - else if (verbose > 0) - { - std::cout << "INITIAL GRIDS \n"; - printGridSummary(std::cout,0,finest_level); - } - } - - if (record_grid_info && ParallelDescriptor::IOProcessor()) - { - gridlog << "INITIAL GRIDS \n"; - printGridInfo(gridlog,0,finest_level); - } - BL_COMM_PROFILE_NAMETAG("Amr::initialInit BOTTOM"); -} - -void -Amr::restart (const std::string& filename) -{ - BL_PROFILE_REGION_START("Amr::restart()"); - BL_PROFILE("Amr::restart()"); - - which_level_being_advanced = -1; - - Real dRestartTime0 = ParallelDescriptor::second(); - - VisMF::SetMFFileInStreams(mffile_nstreams); - - if (verbose > 0) { - amrex::Print() << "restarting calculation from file: " << filename << "\n"; - } - - if (record_run_info && ParallelDescriptor::IOProcessor()) { - runlog << "RESTART from file = " << filename << '\n'; - } - // - // Init problem dependent data. - // - int linit = false; - - readProbinFile(linit); - // - // Start calculation from given restart file. - // - if (record_run_info && ParallelDescriptor::IOProcessor()) { - runlog << "RESTART from file = " << filename << '\n'; - } - - // ---- preread and broadcast all FabArray headers if this file exists - std::map<std::string, Vector<char> > faHeaderMap; - if(prereadFAHeaders) { - // ---- broadcast the file with the names of the fabarray headers - std::string faHeaderFilesName(filename + "/FabArrayHeaders.txt"); - Vector<char> faHeaderFileChars; - bool bExitOnError(false); // ---- dont exit if this file does not exist - ParallelDescriptor::ReadAndBcastFile(faHeaderFilesName, faHeaderFileChars, - bExitOnError); - if(faHeaderFileChars.size() > 0) { // ---- headers were read - std::string faFileCharPtrString(faHeaderFileChars.dataPtr()); - std::istringstream fais(faFileCharPtrString, std::istringstream::in); - while ( ! fais.eof()) { // ---- read and broadcast each header - std::string faHeaderName; - fais >> faHeaderName; - if( ! 
fais.eof()) { - std::string faHeaderFullName(filename + '/' + faHeaderName + "_H"); - Vector<char> &tempCharArray = faHeaderMap[faHeaderFullName]; - ParallelDescriptor::ReadAndBcastFile(faHeaderFullName, tempCharArray); - if(verbose > 2) { - amrex::Print() - << ":::: faHeaderName faHeaderFullName tempCharArray.size() = " << faHeaderName - << " " << faHeaderFullName << " " << tempCharArray.size() << "\n"; - } - } - } - StateData::SetFAHeaderMapPtr(&faHeaderMap); - } - } - - // - // Open the checkpoint header file for reading. - // - std::string File(filename + "/Header"); - - VisMF::IO_Buffer io_buffer(VisMF::GetIOBufferSize()); - - Vector<char> fileCharPtr; - ParallelDescriptor::ReadAndBcastFile(File, fileCharPtr); - std::string fileCharPtrString(fileCharPtr.dataPtr()); - std::istringstream is(fileCharPtrString, std::istringstream::in); - // - // Read global data. - // - // Attempt to differentiate between old and new CheckPointFiles. - // - int spdim; - bool new_checkpoint_format = false; - std::string first_line; - - std::getline(is,first_line); - - if (first_line == CheckPointVersion) - { - new_checkpoint_format = true; - is >> spdim; - } - else - { - spdim = atoi(first_line.c_str()); - } - - if (spdim != BL_SPACEDIM) - { - std::cerr << "Amr::restart(): bad spacedim = " << spdim << '\n'; - amrex::Abort(); - } - - is >> cumtime; - int mx_lev; - is >> mx_lev; - is >> finest_level; - - Vector<Box> inputs_domain(max_level+1); - for (int lev = 0; lev <= max_level; ++lev) - { - Box bx(Geom(lev).Domain().smallEnd(),Geom(lev).Domain().bigEnd()); - inputs_domain[lev] = bx; - } - - if (max_level >= mx_lev) { - - for (int i(0); i <= mx_lev; ++i) { is >> Geom(i); } - for (int i(0); i < mx_lev; ++i) { is >> ref_ratio[i]; } - for (int i(0); i <= mx_lev; ++i) { is >> dt_level[i]; } - - if (new_checkpoint_format) - { - for (int i(0); i <= mx_lev; ++i) { is >> dt_min[i]; } - } - else - { - for (int i(0); i <= mx_lev; ++i) { dt_min[i] = dt_level[i]; } - } - - Vector<int> n_cycle_in; - n_cycle_in.resize(mx_lev+1); - for (int i(0); i <= mx_lev; ++i) { is >> n_cycle_in[i]; } - bool any_changed = false; - - for (int i(0); i <= mx_lev; ++i) { - if (n_cycle[i] != n_cycle_in[i]) { - any_changed = true; - if (verbose > 0) { - amrex::Print() << "Warning: n_cycle has changed at level " << i << - " from " << n_cycle_in[i] << " to " << n_cycle[i] << "\n"; - } - } - } - - // If we change n_cycle then force a full regrid from level 0 up - if (max_level > 0 && any_changed) - { - level_count[0] = regrid_int[0]; - if (verbose > 0) { - amrex::Print() << "Warning: This forces a full regrid \n"; - } - } - - - for (int i(0); i <= mx_lev; ++i) { is >> level_steps[i]; } - for (int i(0); i <= mx_lev; ++i) { is >> level_count[i]; } - - // - // Set bndry conditions. - // - if (max_level > mx_lev) - { - for (int i(mx_lev + 1); i <= max_level; ++i) - { - dt_level[i] = dt_level[i-1]/n_cycle[i]; - level_steps[i] = n_cycle[i]*level_steps[i-1]; - level_count[i] = 0; - } - - // This is just an error check - if ( ! sub_cycle) - { - for (int i(1); i <= finest_level; ++i) - { - if (dt_level[i] != dt_level[i-1]) { - amrex::Error("restart: must have same dt at all levels if not subcycling"); - } - } - } - } - - if (regrid_on_restart && max_level > 0) - { - if (regrid_int[0] > 0) { - level_count[0] = regrid_int[0]; - } else { - amrex::Error("restart: can't have regrid_on_restart and regrid_int <= 0"); - } - } - - checkInput(); - // - // Read levels. 
- // - for (int lev(0); lev <= finest_level; ++lev) - { - amr_level[lev].reset((*levelbld)()); - amr_level[lev]->restart(*this, is); - this->SetBoxArray(lev, amr_level[lev]->boxArray()); - this->SetDistributionMap(lev, amr_level[lev]->DistributionMap()); - } - // - // Build any additional data structures. - // - for (int lev = 0; lev <= finest_level; lev++) { - amr_level[lev]->post_restart(); - } - - } else { - - if (ParallelDescriptor::IOProcessor()) { - amrex::Warning("Amr::restart(): max_level is lower than before"); - } - - int new_finest_level = std::min(max_level,finest_level); - - finest_level = new_finest_level; - - // These are just used to hold the extra stuff we have to read in. - Geometry geom_dummy; - Real real_dummy; - int int_dummy; - IntVect intvect_dummy; - - for (int i(0) ; i <= max_level; ++i) { is >> Geom(i); } - for (int i(max_level + 1); i <= mx_lev ; ++i) { is >> geom_dummy; } - - for (int i(0) ; i < max_level; ++i) { is >> ref_ratio[i]; } - for (int i(max_level); i < mx_lev ; ++i) { is >> intvect_dummy; } - - for (int i(0) ; i <= max_level; ++i) { is >> dt_level[i]; } - for (int i(max_level + 1); i <= mx_lev ; ++i) { is >> real_dummy; } - - if (new_checkpoint_format) { - for (int i(0) ; i <= max_level; ++i) { is >> dt_min[i]; } - for (int i(max_level + 1); i <= mx_lev ; ++i) { is >> real_dummy; } - } else { - for (int i(0); i <= max_level; ++i) { dt_min[i] = dt_level[i]; } - } - - for (int i(0) ; i <= max_level; ++i) { is >> n_cycle[i]; } - for (int i(max_level + 1); i <= mx_lev ; ++i) { is >> int_dummy; } - - for (int i(0) ; i <= max_level; ++i) { is >> level_steps[i]; } - for (int i(max_level + 1); i <= mx_lev ; ++i) { is >> int_dummy; } - - for (int i(0) ; i <= max_level; ++i) { is >> level_count[i]; } - for (int i(max_level + 1); i <= mx_lev ; ++i) { is >> int_dummy; } - - if (regrid_on_restart && max_level > 0) { - if (regrid_int[0] > 0) { - level_count[0] = regrid_int[0]; - } else { - amrex::Error("restart: can't have regrid_on_restart and regrid_int <= 0"); - } - } - - checkInput(); - - // - // Read levels. - // - for (int lev = 0; lev <= new_finest_level; lev++) - { - amr_level[lev].reset((*levelbld)()); - amr_level[lev]->restart(*this, is); - this->SetBoxArray(lev, amr_level[lev]->boxArray()); - this->SetDistributionMap(lev, amr_level[lev]->DistributionMap()); - } - // - // Build any additional data structures. - // - for (int lev = 0; lev <= new_finest_level; lev++) { - amr_level[lev]->post_restart(); - } - } - - for (int lev = 0; lev <= finest_level; ++lev) - { - Box restart_domain(Geom(lev).Domain()); - if ( ! (inputs_domain[lev] == restart_domain) ) - { - amrex::Print() - << "Problem at level " << lev << '\n' - << "Domain according to inputs file is " << inputs_domain[lev] << '\n' - << "Domain according to checkpoint file is " << restart_domain << '\n' - << "Amr::restart() failed -- box from inputs file does not " - << "equal box from restart file. \n"; - amrex::Abort(); - } - } - - if (verbose > 0) - { - Real dRestartTime = ParallelDescriptor::second() - dRestartTime0; - - ParallelDescriptor::ReduceRealMax(dRestartTime,ParallelDescriptor::IOProcessorNumber()); - - amrex::Print() << "Restart time = " << dRestartTime << " seconds." << '\n'; - } - BL_PROFILE_REGION_STOP("Amr::restart()"); -} - -void -Amr::checkPoint () -{ - if( ! 
checkpoint_files_output) { - return; - } - - BL_PROFILE_REGION_START("Amr::checkPoint()"); - BL_PROFILE("Amr::checkPoint()"); - - VisMF::SetNOutFiles(checkpoint_nfiles); - // - // In checkpoint files always write out FABs in NATIVE format. - // - FABio::Format thePrevFormat = FArrayBox::getFormat(); - - FArrayBox::setFormat(FABio::FAB_NATIVE); - - VisMF::Header::Version currentVersion(VisMF::GetHeaderVersion()); - VisMF::SetHeaderVersion(checkpoint_headerversion); - - Real dCheckPointTime0 = ParallelDescriptor::second(); - - const std::string& ckfile = amrex::Concatenate(check_file_root,level_steps[0],file_name_digits); - - if(verbose > 0) { - amrex::Print() << "CHECKPOINT: file = " << ckfile << "\n"; - } - - if(record_run_info && ParallelDescriptor::IOProcessor()) { - runlog << "CHECKPOINT: file = " << ckfile << '\n'; - } - - - amrex::StreamRetry sretry(ckfile, abort_on_stream_retry_failure, - stream_max_tries); - - const std::string ckfileTemp(ckfile + ".temp"); - - while(sretry.TryFileOutput()) { - - StateData::ClearFabArrayHeaderNames(); - - // - // if either the ckfile or ckfileTemp exists, rename them - // to move them out of the way. then create ckfile - // with the temporary name, then rename it back when - // it is finished writing. then stream retry can rename - // it to a bad suffix if there were stream errors. - // - - if(precreateDirectories) { // ---- make all directories at once - amrex::UtilRenameDirectoryToOld(ckfile, false); // dont call barrier - amrex::UtilCreateCleanDirectory(ckfileTemp, false); // dont call barrier - for(int i(0); i <= finest_level; ++i) { - amr_level[i]->CreateLevelDirectory(ckfileTemp); - } - ParallelDescriptor::Barrier("Amr::precreateDirectories"); - } else { - amrex::UtilRenameDirectoryToOld(ckfile, false); // dont call barrier - amrex::UtilCreateCleanDirectory(ckfileTemp, true); // call barrier - } - - std::string HeaderFileName = ckfileTemp + "/Header"; - - VisMF::IO_Buffer io_buffer(VisMF::GetIOBufferSize()); - - std::ofstream HeaderFile; - - HeaderFile.rdbuf()->pubsetbuf(io_buffer.dataPtr(), io_buffer.size()); - - int old_prec = 0; - - if (ParallelDescriptor::IOProcessor()) - { - // - // Only the IOProcessor() writes to the header file. - // - HeaderFile.open(HeaderFileName.c_str(), std::ios::out | std::ios::trunc | - std::ios::binary); - - if ( ! HeaderFile.good()) { - amrex::FileOpenFailed(HeaderFileName); - } - - old_prec = HeaderFile.precision(17); - - HeaderFile << CheckPointVersion << '\n' - << BL_SPACEDIM << '\n' - << cumtime << '\n' - << max_level << '\n' - << finest_level << '\n'; - // - // Write out problem domain. 
- // - for (int i(0); i <= max_level; ++i) { HeaderFile << Geom(i) << ' '; } - HeaderFile << '\n'; - for (int i(0); i < max_level; ++i) { HeaderFile << ref_ratio[i] << ' '; } - HeaderFile << '\n'; - for (int i(0); i <= max_level; ++i) { HeaderFile << dt_level[i] << ' '; } - HeaderFile << '\n'; - for (int i(0); i <= max_level; ++i) { HeaderFile << dt_min[i] << ' '; } - HeaderFile << '\n'; - for (int i(0); i <= max_level; ++i) { HeaderFile << n_cycle[i] << ' '; } - HeaderFile << '\n'; - for (int i(0); i <= max_level; ++i) { HeaderFile << level_steps[i] << ' '; } - HeaderFile << '\n'; - for (int i(0); i <= max_level; ++i) { HeaderFile << level_count[i] << ' '; } - HeaderFile << '\n'; - } - - for (int i = 0; i <= finest_level; ++i) { - amr_level[i]->checkPoint(ckfileTemp, HeaderFile); - } - - if (ParallelDescriptor::IOProcessor()) { - const Vector<std::string> &FAHeaderNames = StateData::FabArrayHeaderNames(); - if(FAHeaderNames.size() > 0) { - std::string FAHeaderFilesName = ckfileTemp + "/FabArrayHeaders.txt"; - std::ofstream FAHeaderFile(FAHeaderFilesName.c_str(), - std::ios::out | std::ios::trunc | - std::ios::binary); - if ( ! FAHeaderFile.good()) { - amrex::FileOpenFailed(FAHeaderFilesName); - } - - for(int i(0); i < FAHeaderNames.size(); ++i) { - FAHeaderFile << FAHeaderNames[i] << '\n'; - } - } - } - - if(ParallelDescriptor::IOProcessor()) { - HeaderFile.precision(old_prec); - - if( ! HeaderFile.good()) { - amrex::Error("Amr::checkpoint() failed"); - } - } - - last_checkpoint = level_steps[0]; - - if (verbose > 0) - { - Real dCheckPointTime = ParallelDescriptor::second() - dCheckPointTime0; - - ParallelDescriptor::ReduceRealMax(dCheckPointTime, - ParallelDescriptor::IOProcessorNumber()); - - amrex::Print() << "checkPoint() time = " << dCheckPointTime << " secs." << '\n'; - } - ParallelDescriptor::Barrier("Amr::checkPoint::end"); - - if(ParallelDescriptor::IOProcessor()) { - std::rename(ckfileTemp.c_str(), ckfile.c_str()); - } - ParallelDescriptor::Barrier("Renaming temporary checkPoint file."); - - } // end while - - // - // Restore the previous FAB format. - // - FArrayBox::setFormat(thePrevFormat); - - VisMF::SetHeaderVersion(currentVersion); - - BL_PROFILE_REGION_STOP("Amr::checkPoint()"); -} - -void -Amr::RegridOnly (Real time) -{ - BL_ASSERT(regrid_on_restart == 1); - - int lev_top = std::min(finest_level, max_level-1); - - for (int i = 0; i <= lev_top; i++) - regrid(i,time); - - if (plotfile_on_restart) - writePlotFile(); - - if (checkpoint_on_restart) - checkPoint(); - -} - -void -Amr::timeStep (int level, - Real time, - int iteration, - int niter, - Real stop_time) -{ - BL_PROFILE("Amr::timeStep()"); - BL_COMM_PROFILE_NAMETAG("Amr::timeStep TOP"); - - // This is used so that the AmrLevel functions can know which level is being advanced - // when regridding is called with possible lbase > level. - which_level_being_advanced = level; - - // Update so that by default, we don't force a post-step regrid. - amr_level[level]->setPostStepRegrid(0); - - // - // Allow regridding of level 0 calculation on restart. - // - if (max_level == 0 && regrid_on_restart) - { - regrid_level_0_on_restart(); - } - else - { - int lev_top = std::min(finest_level, max_level-1); - - for (int i(level); i <= lev_top; ++i) - { - const int old_finest = finest_level; - - if (okToRegrid(i)) - { - regrid(i,time); - - // - // Compute new dt after regrid if at level 0 and compute_new_dt_on_regrid. 
- // - if ( compute_new_dt_on_regrid && (i == 0) ) - { - int post_regrid_flag = 1; - amr_level[0]->computeNewDt(finest_level, - sub_cycle, - n_cycle, - ref_ratio, - dt_min, - dt_level, - stop_time, - post_regrid_flag); - } - - for (int k(i); k <= finest_level; ++k) { - level_count[k] = 0; - } - - if (old_finest < finest_level) - { - // - // The new levels will not have valid time steps - // and iteration counts. - // - for (int k(old_finest + 1); k <= finest_level; ++k) - { - dt_level[k] = dt_level[k-1]/n_cycle[k]; - } - } - } - if (old_finest > finest_level) { - lev_top = std::min(finest_level, max_level - 1); - } - } - - if (max_level == 0 && loadbalance_level0_int > 0 && loadbalance_with_workestimates) - { - if (level_steps[0] == 1 || level_count[0] >= loadbalance_level0_int) { - LoadBalanceLevel0(time); - level_count[0] = 0; - } - } - } - // - // Check to see if should write plotfile. - // This routine is here so it is done after the restart regrid. - // - if (plotfile_on_restart && ! (restart_chkfile.empty()) ) - { - plotfile_on_restart = 0; - writePlotFile(); - } - // - // Advance grids at this level. - // - if (verbose > 0) - { - amrex::Print() << "[Level " << level << " step " << level_steps[level]+1 << "] " - << "ADVANCE with dt = " << dt_level[level] << "\n"; - } - BL_PROFILE_REGION_START("amr_level.advance"); - Real dt_new = amr_level[level]->advance(time,dt_level[level],iteration,niter); - BL_PROFILE_REGION_STOP("amr_level.advance"); - - dt_min[level] = iteration == 1 ? dt_new : std::min(dt_min[level],dt_new); - - level_steps[level]++; - level_count[level]++; - - if (verbose > 0) - { - amrex::Print() << "[Level " << level << " step " << level_steps[level] << "] " - << "Advanced " << amr_level[level]->countCells() << " cells\n"; - } - - // If the level signified that it wants a regrid after the advance has - // occurred, do that now. - if (amr_level[level]->postStepRegrid()) { - - int old_finest = finest_level; - - regrid(level, time); - - if (old_finest < finest_level) - { - // - // The new levels will not have valid time steps. - // - for (int k = old_finest + 1; k <= finest_level; ++k) - { - dt_level[k] = dt_level[k-1] / n_cycle[k]; - } - } - - } - - // - // Advance grids at higher level. - // - if (level < finest_level) - { - const int lev_fine = level+1; - - if (sub_cycle) - { - const int ncycle = n_cycle[lev_fine]; - - BL_COMM_PROFILE_NAMETAG("Amr::timeStep timeStep subcycle"); - for (int i = 1; i <= ncycle; i++) - timeStep(lev_fine,time+(i-1)*dt_level[lev_fine],i,ncycle,stop_time); - } - else - { - BL_COMM_PROFILE_NAMETAG("Amr::timeStep timeStep nosubcycle"); - timeStep(lev_fine,time,1,1,stop_time); - } - } - - amr_level[level]->post_timestep(iteration); - - // Set this back to negative so we know whether we are in fact in this routine - which_level_being_advanced = -1; -} - -Real -Amr::coarseTimeStepDt (Real stop_time) -{ - coarseTimeStep(stop_time); - return dt_level[0]; -} - -void -Amr::coarseTimeStep (Real stop_time) -{ - BL_PROFILE_REGION_START("Amr::coarseTimeStep()"); - BL_PROFILE("Amr::coarseTimeStep()"); - std::stringstream stepName; - stepName << "timeStep STEP " << level_steps[0]; - - const Real run_strt = ParallelDescriptor::second() ; - - // - // Compute new dt. 
- // - if (levelSteps(0) > 0) - { - int post_regrid_flag = 0; - amr_level[0]->computeNewDt(finest_level, - sub_cycle, - n_cycle, - ref_ratio, - dt_min, - dt_level, - stop_time, - post_regrid_flag); - } - else - { - amr_level[0]->computeInitialDt(finest_level, - sub_cycle, - n_cycle, - ref_ratio, - dt_level, - stop_time); - } - - BL_PROFILE_REGION_START(stepName.str()); - - timeStep(0,cumtime,1,1,stop_time); - - BL_PROFILE_REGION_STOP(stepName.str()); - - cumtime += dt_level[0]; - - amr_level[0]->postCoarseTimeStep(cumtime); - - if (verbose > 0) - { - const int IOProc = ParallelDescriptor::IOProcessorNumber(); - Real run_stop = ParallelDescriptor::second() - run_strt; - const int istep = level_steps[0]; - -#ifdef BL_LAZY - Lazy::QueueReduction( [=] () mutable { -#endif - ParallelDescriptor::ReduceRealMax(run_stop,IOProc); - amrex::Print() << "\n[STEP " << istep << "] Coarse TimeStep time: " << run_stop << '\n'; -#ifdef BL_LAZY - }); -#endif - -#ifndef AMREX_MEM_PROFILING - long min_fab_kilobytes = amrex::TotalBytesAllocatedInFabsHWM()/1024; - long max_fab_kilobytes = min_fab_kilobytes; - -#ifdef BL_LAZY - Lazy::QueueReduction( [=] () mutable { -#endif - ParallelDescriptor::ReduceLongMin(min_fab_kilobytes, IOProc); - ParallelDescriptor::ReduceLongMax(max_fab_kilobytes, IOProc); - - amrex::Print() << "[STEP " << istep << "] FAB kilobyte spread across MPI nodes: [" - << min_fab_kilobytes << " ... " << max_fab_kilobytes << "]\n"; -#ifdef BL_LAZY - amrex::Print() << "\n"; - }); -#endif -#endif - } - -#ifdef AMREX_MEM_PROFILING - { - std::ostringstream ss; - ss << "[STEP " << level_steps[0] << "]"; - MemProfiler::report(ss.str()); - } -#endif - - BL_PROFILE_ADD_STEP(level_steps[0]); - BL_PROFILE_REGION_STOP("Amr::coarseTimeStep()"); - BL_TRACE_PROFILE_FLUSH(); - BL_COMM_PROFILE_NAMETAG(stepName.str()); - BL_COMM_PROFILE_FLUSH(); - - if (verbose > 0) - { - amrex::Print() - << "\nSTEP = " << level_steps[0] - << " TIME = " << cumtime - << " DT = " << dt_level[0] << "\n\n"; - } - if (record_run_info && ParallelDescriptor::IOProcessor()) - { - runlog << "STEP = " << level_steps[0] - << " TIME = " << cumtime - << " DT = " << dt_level[0] << '\n'; - } - if (record_run_info_terse && ParallelDescriptor::IOProcessor()) - runlog_terse << level_steps[0] << " " << cumtime << " " << dt_level[0] << '\n'; - - int check_test = 0; - - if (check_per > 0.0) - { - const int num_per_old = (cumtime-dt_level[0]) / check_per; - const int num_per_new = (cumtime ) / check_per; - - if (num_per_old != num_per_new) - { - check_test = 1; - } - } - - int to_stop = 0; - int to_checkpoint = 0; - int to_plot = 0; - int to_small_plot = 0; - if (message_int > 0 && level_steps[0] % message_int == 0) { - if (ParallelDescriptor::IOProcessor()) - { - FILE *fp; - if ((fp=fopen("dump_and_continue","r")) != 0) - { - remove("dump_and_continue"); - to_checkpoint = 1; - fclose(fp); - } - else if ((fp=fopen("stop_run","r")) != 0) - { - remove("stop_run"); - to_stop = 1; - fclose(fp); - } - else if ((fp=fopen("dump_and_stop","r")) != 0) - { - remove("dump_and_stop"); - to_checkpoint = 1; - to_stop = 1; - fclose(fp); - } - - if ((fp=fopen("plot_and_continue","r")) != 0) - { - remove("plot_and_continue"); - to_plot = 1; - fclose(fp); - } - - if ((fp=fopen("small_plot_and_continue","r")) != 0) - { - remove("small_plot_and_continue"); - to_small_plot = 1; - fclose(fp); - } - } - int packed_data[4]; - packed_data[0] = to_stop; - packed_data[1] = to_checkpoint; - packed_data[2] = to_plot; - packed_data[3] = to_small_plot; - 
ParallelDescriptor::Bcast(packed_data, 4, ParallelDescriptor::IOProcessorNumber()); - to_stop = packed_data[0]; - to_checkpoint = packed_data[1]; - to_plot = packed_data[2]; - to_small_plot = packed_data[3]; - - } - - if(to_stop == 1 && to_checkpoint == 0) { // prevent main from writing files - last_checkpoint = level_steps[0]; - last_plotfile = level_steps[0]; - } - - if (to_checkpoint && write_plotfile_with_checkpoint) - to_plot = 1; - - if ((check_int > 0 && level_steps[0] % check_int == 0) || check_test == 1 - || to_checkpoint) - { - checkPoint(); - } - - - if (writePlotNow() || to_plot) - { - writePlotFile(); - } - - if (writeSmallPlotNow() || to_small_plot) - { - writeSmallPlotFile(); - } - - bUserStopRequest = to_stop; - if (to_stop) - { - ParallelDescriptor::Barrier("Amr::coarseTimeStep::to_stop"); - if(ParallelDescriptor::IOProcessor()) { - if (to_checkpoint) - { - std::cerr << "Stopped by user w/ checkpoint" << std::endl; - } - else - { - std::cerr << "Stopped by user w/o checkpoint" << std::endl; - } - } - } -} - -bool -Amr::writePlotNow() -{ - int plot_test = 0; - if (plot_per > 0.0) - { - const int num_per_old = (cumtime-dt_level[0]) / plot_per; - const int num_per_new = (cumtime ) / plot_per; - - if (num_per_old != num_per_new) - { - plot_test = 1; - } - } - - return ( (plot_int > 0 && level_steps[0] % plot_int == 0) || - plot_test == 1 || - amr_level[0]->writePlotNow()); -} - -bool -Amr::writeSmallPlotNow() -{ - int plot_test = 0; - if (small_plot_per > 0.0) - { - const int num_per_old = (cumtime-dt_level[0]) / small_plot_per; - const int num_per_new = (cumtime ) / small_plot_per; - - if (num_per_old != num_per_new) - { - plot_test = 1; - } - } - - return ( (small_plot_int > 0 && level_steps[0] % small_plot_int == 0) || - plot_test == 1 || - amr_level[0]->writeSmallPlotNow()); -}
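The *_per tests above (and check_per in coarseTimeStep()) implement interval-based triggering: the truncating Real-to-int conversion acts as a floor, and output fires whenever the step just taken crosses an interval boundary. A worked example, assuming plot_per = 0.5 and dt_level[0] = 0.2: advancing cumtime from 0.4 to 0.6 gives num_per_old = int(0.4/0.5) = 0 and num_per_new = int(0.6/0.5) = 1, so a plotfile is written; advancing from 0.6 to 0.8 gives 1 and 1, so none is.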
- -void -Amr::defBaseLevel (Real strt_time, - const BoxArray* lev0_grids, - const Vector<int>* pmap) -{ - BL_PROFILE("Amr::defBaseLevel()"); - // Just initialize this here for the heck of it - which_level_being_advanced = -1; - - // - // Check that base domain has even number of zones in all directions. - // - const Box& domain = Geom(0).Domain(); - const IntVect& d_len = domain.size(); - - for (int idir = 0; idir < BL_SPACEDIM; idir++) - if (d_len[idir]%2 != 0) - amrex::Error("defBaseLevel: must have even number of cells"); - - BoxArray lev0; - - if (lev0_grids != 0 && lev0_grids->size() > 0) - { - BL_ASSERT(pmap != 0); - - BoxArray domain_ba(domain); - if (!domain_ba.contains(*lev0_grids)) - amrex::Error("defBaseLevel: domain does not contain lev0_grids!"); - if (!lev0_grids->contains(domain_ba)) - amrex::Error("defBaseLevel: lev0_grids does not contain domain"); - - lev0 = *lev0_grids; - - if (refine_grid_layout) { - ChopGrids(0,lev0,ParallelDescriptor::NProcs()); - } - } - else - { - lev0 = MakeBaseGrids(); - } - - this->SetBoxArray(0, lev0); - this->SetDistributionMap(0, DistributionMapping(lev0)); - - // - // Now build level 0 grids. - // - amr_level[0].reset((*levelbld)(*this,0,Geom(0),grids[0],dmap[0],strt_time)); - // - // Now init level 0 grids with data. - // - amr_level[0]->initData(); -} - -void -Amr::regrid (int lbase, - Real time, - bool initial) -{ - BL_PROFILE("Amr::regrid()"); - - if (lbase > std::min(finest_level,max_level-1)) return; - - if (verbose > 0) - amrex::Print() << "Now regridding at level lbase = " << lbase << "\n"; - - // - // Compute positions of new grids. 
- // - int new_finest; - Vector<BoxArray> new_grid_places(max_level+1); - Vector<DistributionMapping> new_dmap(max_level+1); - - grid_places(lbase,time,new_finest, new_grid_places); - - bool regrid_level_zero = (!initial) && (lbase == 0) - && ( loadbalance_with_workestimates || (new_grid_places[0] != amr_level[0]->boxArray())); - - const int start = regrid_level_zero ? 0 : lbase+1; - - bool grids_unchanged = finest_level == new_finest; - for (int lev = start, End = std::min(finest_level,new_finest); lev <= End; lev++) { - if (new_grid_places[lev] == amr_level[lev]->boxArray()) { - new_grid_places[lev] = amr_level[lev]->boxArray(); // to avoid duplicates - new_dmap[lev] = amr_level[lev]->DistributionMap(); - } else { - grids_unchanged = false; - } - } - - // - // If use_efficient_regrid flag is set and grids are unchanged, then don't do anything more here. - // - if (use_efficient_regrid == 1 && grids_unchanged ) - { - if (verbose > 0) { - amrex::Print() << "Regridding at level lbase = " << lbase - << " but grids unchanged\n"; - } - return; - } - - // - // Reclaim old-time grid space for all remain levels > lbase. - // - for(int lev = start; lev <= finest_level; ++lev) { - amr_level[lev]->removeOldData(); - } - // - // Reclaim all remaining storage for levels > new_finest. - // - for(int lev = new_finest + 1; lev <= finest_level; ++lev) { - amr_level[lev].reset(); - this->ClearBoxArray(lev); - this->ClearDistributionMap(lev); - } - - finest_level = new_finest; - - // - // Define the new grids from level start up to new_finest. - // - for(int lev = start; lev <= new_finest; ++lev) { - // - // Construct skeleton of new level. - // - - if (loadbalance_with_workestimates && !initial) { - new_dmap[lev] = makeLoadBalanceDistributionMap(lev, time, new_grid_places[lev]); - } - else if (new_dmap[lev].empty()) { - new_dmap[lev].define(new_grid_places[lev]); - } - - AmrLevel* a = (*levelbld)(*this,lev,Geom(lev),new_grid_places[lev], - new_dmap[lev],cumtime); - - if (initial) - { - // - // We're being called on startup from bldFineLevels(). - // NOTE: The initData function may use a filPatch, and so needs to - // be officially inserted into the hierarchy prior to the call. - // - amr_level[lev].reset(a); - this->SetBoxArray(lev, amr_level[lev]->boxArray()); - this->SetDistributionMap(lev, amr_level[lev]->DistributionMap()); - amr_level[lev]->initData(); - } - else if (amr_level[lev]) - { - // - // Init with data from old structure then remove old structure. - // NOTE: The init function may use a filPatch from the old level, - // which therefore needs remain in the hierarchy during the call. - // - a->init(*amr_level[lev]); - amr_level[lev].reset(a); - this->SetBoxArray(lev, amr_level[lev]->boxArray()); - this->SetDistributionMap(lev, amr_level[lev]->DistributionMap()); - } - else - { - a->init(); - amr_level[lev].reset(a); - this->SetBoxArray(lev, amr_level[lev]->boxArray()); - this->SetDistributionMap(lev, amr_level[lev]->DistributionMap()); - } - - } - - - // - // Check at *all* levels whether we need to do anything special now that the grids - // at levels lbase+1 and higher may have changed. - // - for(int lev(0); lev <= new_finest; ++lev) { - amr_level[lev]->post_regrid(lbase,new_finest); - } - - // - // Report creation of new grids. 
- // - - if (record_run_info && ParallelDescriptor::IOProcessor()) - { - runlog << "REGRID: at level lbase = " << lbase << '\n'; - printGridInfo(runlog,start,finest_level); - } - - if (record_grid_info && ParallelDescriptor::IOProcessor()) - { - if (lbase == 0) - gridlog << "STEP = " << level_steps[0] << ' '; - - gridlog << "TIME = " - << time - << " : REGRID with lbase = " - << lbase - << '\n'; - - printGridInfo(gridlog,start,finest_level); - } - - if (verbose > 0 && ParallelDescriptor::IOProcessor()) - { - if (lbase == 0) - std::cout << "STEP = " << level_steps[0] << ' '; - - std::cout << "TIME = " - << time - << " : REGRID with lbase = " - << lbase - << std::endl; - - if (verbose > 1) - { - printGridInfo(std::cout,start,finest_level); - } - else - { - printGridSummary(std::cout,start,finest_level); - } - } -} - -DistributionMapping -Amr::makeLoadBalanceDistributionMap (int lev, Real time, const BoxArray& ba) const -{ - BL_PROFILE("makeLoadBalanceDistributionMap()"); - - amrex::Print() << "Load balance on level " << lev << " at t = " << time << "\n"; - - DistributionMapping newdm; - - const int work_est_type = amr_level[0]->WorkEstType(); - - if (work_est_type < 0) { - amrex::Print() << "\nAMREX WARNING: work estimates type does not exist!\n\n"; - newdm.define(ba); - } - else if (amr_level[lev]) - { - DistributionMapping dmtmp; - if (ba.size() == boxArray(lev).size()) { - dmtmp = DistributionMap(lev); - } else { - dmtmp.define(ba); - } - - MultiFab workest(ba, dmtmp, 1, 0); - AmrLevel::FillPatch(*amr_level[lev], workest, 0, time, work_est_type, 0, 1, 0); - - Real navg = static_cast<Real>(ba.size()) / static_cast<Real>(ParallelDescriptor::NProcs()); - int nmax = std::max(std::round(loadbalance_max_fac*navg), std::ceil(navg)); - - newdm = DistributionMapping::makeKnapSack(workest, nmax); - } - else - { - newdm.define(ba); - } - - return newdm; -}
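makeLoadBalanceDistributionMap() above hands the per-box work estimates to DistributionMapping::makeKnapSack. The sketch below is not the AMReX implementation, only the greedy bin-packing idea behind this kind of balancer, assuming weights[i] is the estimated work of box i:

#include <algorithm>
#include <functional>
#include <numeric>
#include <queue>
#include <utility>
#include <vector>

// Greedy knapsack-style assignment: the heaviest remaining box always
// goes to the currently least-loaded rank. Returns the rank per box.
std::vector<int> greedy_knapsack (const std::vector<double>& weights, int nranks)
{
    std::vector<int> order(weights.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(),
              [&] (int a, int b) { return weights[a] > weights[b]; });

    using Load = std::pair<double,int>;              // (accumulated work, rank)
    std::priority_queue<Load, std::vector<Load>, std::greater<Load>> ranks;
    for (int r = 0; r < nranks; ++r) ranks.emplace(0.0, r);

    std::vector<int> dmap(weights.size());
    for (int i : order) {
        auto [load, r] = ranks.top(); ranks.pop();   // lightest rank so far
        dmap[i] = r;
        ranks.emplace(load + weights[i], r);
    }
    return dmap;
}

AMReX additionally caps the number of boxes per rank; that is the role of the nmax argument computed from loadbalance_max_fac above.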
- -void -Amr::LoadBalanceLevel0 (Real time) -{ - BL_PROFILE("LoadBalanceLevel0()"); - const auto& dm = makeLoadBalanceDistributionMap(0, time, boxArray(0)); - InstallNewDistributionMap(0, dm); - amr_level[0]->post_regrid(0,time); -} - -void -Amr::InstallNewDistributionMap (int lev, const DistributionMapping& newdm) -{ - BL_PROFILE("InstallNewDistributionMap()"); - - AmrLevel* a = (*levelbld)(*this,lev,Geom(lev),boxArray(lev),newdm,cumtime); - a->init(*amr_level[lev]); - amr_level[lev].reset(a); - - this->SetBoxArray(lev, amr_level[lev]->boxArray()); - this->SetDistributionMap(lev, amr_level[lev]->DistributionMap()); -} - -void -Amr::regrid_level_0_on_restart() -{ - regrid_on_restart = 0; - // - // Coarsening before we split the grids ensures that each resulting - // grid will have an even number of cells in each direction. - // - BoxArray lev0(amrex::coarsen(Geom(0).Domain(),2)); - // - // Now split up into list of grids within max_grid_size[0] limit. - // - lev0.maxSize(max_grid_size[0]/2); - // - // Now refine these boxes back to level 0. - // - lev0.refine(2); - - // - // If use_efficient_regrid flag is set, then test to see whether we in fact - // have just changed the level 0 grids. If not, then don't do anything more here. - // - if ( !( (use_efficient_regrid == 1) && (lev0 == amr_level[0]->boxArray()) ) ) - { - // - // Construct skeleton of new level. 
- // - DistributionMapping dm(lev0); - AmrLevel* a = (*levelbld)(*this,0,Geom(0),lev0,dm,cumtime); - - a->init(*amr_level[0]); - amr_level[0].reset(a); - - this->SetBoxArray(0, amr_level[0]->boxArray()); - this->SetDistributionMap(0, amr_level[0]->DistributionMap()); - - amr_level[0]->post_regrid(0,0); - - if (ParallelDescriptor::IOProcessor()) - { - if (verbose > 1) - { - printGridInfo(std::cout,0,finest_level); - } - else if (verbose > 0) - { - printGridSummary(std::cout,0,finest_level); - } - } - - if (record_grid_info && ParallelDescriptor::IOProcessor()) - printGridInfo(gridlog,0,finest_level); - } - else - { - if (verbose > 0) - amrex::Print() << "Regridding at level 0 but grids unchanged \n"; - } -} - -void -Amr::printGridInfo (std::ostream& os, - int min_lev, - int max_lev) -{ - for (int lev = min_lev; lev <= max_lev; lev++) - { - const BoxArray& bs = amr_level[lev]->boxArray(); - int numgrid = bs.size(); - long ncells = amr_level[lev]->countCells(); - double ntot = Geom(lev).Domain().d_numPts(); - Real frac = 100.0_rt*(Real(ncells) / ntot); - const DistributionMapping& map = amr_level[lev]->get_new_data(0).DistributionMap(); - - os << " Level " - << lev - << " " - << numgrid - << " grids " - << ncells - << " cells " - << frac - << " % of domain" - << '\n'; - - - for (int k = 0; k < numgrid; k++) - { - const Box& b = bs[k]; - - os << ' ' << lev << ": " << b << " "; - - for (int i = 0; i < BL_SPACEDIM; i++) - os << b.length(i) << ' '; - - os << ":: " << map[k] << '\n'; - } - } - - os << std::endl; // Make sure we flush! -} - - -void -Amr::grid_places (int lbase, - Real time, - int& new_finest, - Vector<BoxArray>& new_grids) -{ - BL_PROFILE("Amr::grid_places()"); - - const Real strttime = ParallelDescriptor::second(); - - if (lbase == 0) - { - new_grids[0] = MakeBaseGrids(); - } - - if ( time == 0. && !initial_grids_file.empty() && !use_fixed_coarse_grids) - { - new_finest = std::min(max_level,(finest_level+1)); - new_finest = std::min(new_finest,initial_ba.size()); - - for (int lev = 1; lev <= new_finest; lev++) - { - BoxList bl; - int ngrid = initial_ba[lev-1].size(); - for (int i = 0; i < ngrid; i++) - { - Box bx(initial_ba[lev-1][i]); - if (lev > lbase) - bl.push_back(bx); - } - if (lev > lbase) - new_grids[lev].define(bl); - } - return; - } - - // Use grids in initial_grids_file as fixed coarse grids. - if ( ! 
initial_grids_file.empty() && use_fixed_coarse_grids) - { - new_finest = std::min(max_level,(finest_level+1)); - new_finest = std::min(new_finest,initial_ba.size()); - - for (int lev = lbase+1; lev <= new_finest; lev++) - { - BoxList bl; - int ngrid = initial_ba[lev-1].size(); - for (int i = 0; i < ngrid; i++) - { - Box bx(initial_ba[lev-1][i]); - - if (lev > lbase) - bl.push_back(bx); - - } - if (lev > lbase) - new_grids[lev].define(bl); - new_grids[lev].maxSize(max_grid_size[lev]); - } - } - else if ( !regrid_grids_file.empty() ) // Use grids in regrid_grids_file - { - new_finest = std::min(max_level,(finest_level+1)); - new_finest = std::min(new_finest,regrid_ba.size()); - for (int lev = 1; lev <= new_finest; lev++) - { - BoxList bl; - int ngrid = regrid_ba[lev-1].size(); - for (int i = 0; i < ngrid; i++) - { - Box bx(regrid_ba[lev-1][i]); - if (lev > lbase) - bl.push_back(bx); - } - if (lev > lbase) - new_grids[lev].define(bl); - } - return; - } - - MakeNewGrids(lbase, time, new_finest, new_grids); - - if (verbose > 0) - { - Real stoptime = ParallelDescriptor::second() - strttime; - -#ifdef BL_LAZY - Lazy::QueueReduction( [=] () mutable { -#endif - ParallelDescriptor::ReduceRealMax(stoptime,ParallelDescriptor::IOProcessorNumber()); - amrex::Print() << "grid_places() time: " << stoptime << " new finest: " << new_finest<< '\n'; -#ifdef BL_LAZY - }); -#endif - } -} - -void -Amr::ErrorEst (int lev, TagBoxArray& tags, Real time, int ngrow) -{ - amr_level[lev]->errorEst(tags,TagBox::CLEAR,TagBox::SET,time, n_error_buf[lev],ngrow); -} - -BoxArray -Amr::GetAreaNotToTag (int lev) -{ - return BoxArray(amr_level[lev]->getAreaNotToTag()); -} - -void -Amr::ManualTagsPlacement (int lev, TagBoxArray& tags, const Vector<IntVect>& bf_lev) -{ - amr_level[lev]->manual_tags_placement(tags, bf_lev); -} - -void -Amr::bldFineLevels (Real strt_time) -{ - BL_PROFILE("Amr::bldFineLevels()"); - finest_level = 0; - - Vector<BoxArray> new_grids(max_level+1); - // - // Get initial grid placement. - // - do - { - int new_finest; - - grid_places(finest_level,strt_time,new_finest,new_grids); - - if (new_finest <= finest_level) break; - // - // Create a new level and link with others. - // - finest_level = new_finest; - - DistributionMapping new_dm {new_grids[new_finest]}; - - AmrLevel* level = (*levelbld)(*this, - new_finest, - Geom(new_finest), - new_grids[new_finest], - new_dm, - strt_time); - - amr_level[new_finest].reset(level); - this->SetBoxArray(new_finest, new_grids[new_finest]); - this->SetDistributionMap(new_finest, new_dm); - - amr_level[new_finest]->initData(); - } - while (finest_level < max_level); - // - // Iterate grids to ensure fine grids encompass all interesting gunk. - // but only iterate if we did not provide a grids file. - // - if ( regrid_grids_file.empty() || (strt_time == 0.0 && !initial_grids_file.empty()) ) - { - bool grids_the_same; - - const int MaxCnt = 4; - - int count = 0; - - do - { - for (int i = 0; i <= finest_level; i++) { - new_grids[i] = amr_level[i]->boxArray(); - } - - regrid(0,strt_time,true); - - grids_the_same = true; - - for (int i = 0; i <= finest_level && grids_the_same; i++) { - if (!(new_grids[i] == amr_level[i]->boxArray())) { - grids_the_same = false; - } - } - - count++; - } - while (!grids_the_same && count < MaxCnt); - } -} - -void -Amr::initSubcycle () -{ - BL_PROFILE("Amr::initSubcycle()"); - ParmParse pp("amr"); - sub_cycle = true; - if (pp.contains("nosub")) - { - amrex::Print() << "Warning: The nosub flag has been deprecated.\n " - << "... 
please use subcycling_mode to control subcycling.\n"; - int nosub; - pp.query("nosub",nosub); - if (nosub > 0) - sub_cycle = false; - else - amrex::Error("nosub <= 0 not allowed.\n"); - subcycling_mode = "None"; - } - else - { - subcycling_mode = "Auto"; - pp.query("subcycling_mode",subcycling_mode); - } - - if (subcycling_mode == "None") - { - sub_cycle = false; - for (int i = 0; i <= max_level; i++) - { - n_cycle[i] = 1; - } - } - else if (subcycling_mode == "Manual") - { - int cnt = pp.countval("subcycling_iterations"); - - if (cnt == 1) - { - // - // Set all values to the single available value. - // - int cycles = 0; - - pp.get("subcycling_iterations",cycles); - - n_cycle[0] = 1; // coarse level is always 1 cycle - for (int i = 1; i <= max_level; i++) - { - n_cycle[i] = cycles; - } - } - else if (cnt > 1) - { - // - // Otherwise we expect a vector of max_level+1 values. - // - pp.getarr("subcycling_iterations",n_cycle,0,max_level+1); - if (n_cycle[0] != 1) - { - amrex::Error("First entry of subcycling_iterations must be 1"); - } - } - else - { - amrex::Error("Must provide a valid subcycling_iterations if mode is Manual"); - } - for (int i = 1; i <= max_level; i++) - { - if (n_cycle[i] > MaxRefRatio(i-1)) - amrex::Error("subcycling iterations must always be <= ref_ratio"); - if (n_cycle[i] <= 0) - amrex::Error("subcycling iterations must always be > 0"); - } - } - else if (subcycling_mode == "Auto") - { - n_cycle[0] = 1; - for (int i = 1; i <= max_level; i++) - { - n_cycle[i] = MaxRefRatio(i-1); - } - } - else if (subcycling_mode == "Optimal") - { - // if subcycling mode is Optimal, n_cycle is set dynamically. - // We'll initialize it to be Auto subcycling. - n_cycle[0] = 1; - for (int i = 1; i <= max_level; i++) - { - n_cycle[i] = MaxRefRatio(i-1); - } - } - else - { - std::string err_message = "Unrecognized subcycling mode: " + subcycling_mode + "\n"; - amrex::Error(err_message.c_str()); - } -}
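For reference, the modes parsed by initSubcycle() above correspond to inputs like the following hypothetical fragment (standard ParmParse syntax; the values are illustrative only):

amr.subcycling_mode       = Manual
amr.subcycling_iterations = 1 2 2   # level 0 must be 1; level i must be <= ref_ratio at i-1

With subcycling_mode = Auto, n_cycle[i] is instead set to MaxRefRatio(i-1); Optimal starts from the Auto values and retunes them dynamically (see computeOptimalSubcycling() below).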
- -void -Amr::initPltAndChk () -{ - ParmParse pp("amr"); - - pp.query("checkpoint_files_output", checkpoint_files_output); - pp.query("plot_files_output", plot_files_output); - - pp.query("plot_nfiles", plot_nfiles); - pp.query("checkpoint_nfiles", checkpoint_nfiles); - // - // -1 ==> use ParallelDescriptor::NProcs(). 
- // - if (plot_nfiles == -1) plot_nfiles = ParallelDescriptor::NProcs(); - if (checkpoint_nfiles == -1) checkpoint_nfiles = ParallelDescriptor::NProcs(); - - check_file_root = "chk"; - pp.query("check_file",check_file_root); - - check_int = -1; - pp.query("check_int",check_int); - - check_per = -1.0; - pp.query("check_per",check_per); - - if (check_int > 0 && check_per > 0) - { - if (ParallelDescriptor::IOProcessor()) - amrex::Warning("Warning: both amr.check_int and amr.check_per are > 0."); - } - - plot_file_root = "plt"; - pp.query("plot_file",plot_file_root); - - plot_int = -1; - pp.query("plot_int",plot_int); - - plot_per = -1.0; - pp.query("plot_per",plot_per); - - if (plot_int > 0 && plot_per > 0) - { - if (ParallelDescriptor::IOProcessor()) - amrex::Warning("Warning: both amr.plot_int and amr.plot_per are > 0."); - } - - small_plot_file_root = "smallplt"; - pp.query("small_plot_file",small_plot_file_root); - - small_plot_int = -1; - pp.query("small_plot_int",small_plot_int); - - small_plot_per = -1.0; - pp.query("small_plot_per",small_plot_per); - - if (small_plot_int > 0 && small_plot_per > 0) - { - if (ParallelDescriptor::IOProcessor()) - amrex::Warning("Warning: both amr.small_plot_int and amr.small_plot_per are > 0."); - } - - write_plotfile_with_checkpoint = 1; - pp.query("write_plotfile_with_checkpoint",write_plotfile_with_checkpoint); - - stream_max_tries = 4; - pp.query("stream_max_tries",stream_max_tries); - stream_max_tries = std::max(stream_max_tries, 1); - - abort_on_stream_retry_failure = false; - pp.query("abort_on_stream_retry_failure",abort_on_stream_retry_failure); - - pp.query("precreateDirectories", precreateDirectories); - pp.query("prereadFAHeaders", prereadFAHeaders); - - int phvInt(plot_headerversion), chvInt(checkpoint_headerversion); - pp.query("plot_headerversion", phvInt); - if(phvInt != plot_headerversion) { - plot_headerversion = static_cast<VisMF::Header::Version> (phvInt); - } - pp.query("checkpoint_headerversion", chvInt); - if(chvInt != checkpoint_headerversion) { - checkpoint_headerversion = static_cast<VisMF::Header::Version> (chvInt); - } -} - - -bool -Amr::okToRegrid(int level) -{ - if (regrid_int[level] < 0) - return false; - else - return level_count[level] >= regrid_int[level] && amr_level[level]->okToRegrid(); -} - -Real -Amr::computeOptimalSubcycling(int n, int* best, Real* dt_max, Real* est_work, int* cycle_max) -{ - BL_ASSERT(cycle_max[0] == 1); - // internally these represent the total number of steps at a level, - // not the number of cycles - std::vector<int> cycles(n); - Real best_ratio = 1e200; - Real best_dt = 0; - Real ratio; - Real dt; - Real work; - int limit = 1; - // This provides a memory efficient way to test all candidates - for (int i = 1; i < n; i++) - limit *= cycle_max[i]; - for (int candidate = 0; candidate < limit; candidate++) - { - int temp_cand = candidate; - cycles[0] = 1; - dt = dt_max[0]; - work = est_work[0]; - for (int i = 1; i < n; i++) - { - // grab the relevant "digit" and shift over. 
- cycles[i] = (1 + temp_cand%cycle_max[i]) * cycles[i-1]; - temp_cand /= cycle_max[i]; - dt = std::min(dt, cycles[i]*dt_max[i]); - work += cycles[i]*est_work[i]; - } - ratio = work/dt; - if (ratio < best_ratio) - { - for (int i = 0; i < n; i++) - best[i] = cycles[i]; - best_ratio = ratio; - best_dt = dt; - } - } - // - // Now we convert best back to n_cycles format - // - for (int i = n-1; i > 0; i--) - best[i] /= best[i-1]; - return best_dt; -} - -const Vector<BoxArray>& Amr::getInitialBA() -{ - return initial_ba; -} - -#ifdef AMREX_PARTICLES -void -Amr::RedistributeParticles () -{ - amr_level[0]->particle_redistribute(0,true); -} -#endif - -} -
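computeOptimalSubcycling() above enumerates every admissible subcycling pattern by treating the candidate index as a mixed-radix number with one "digit" per level; cycles[i] accumulates the total number of level-i steps per coarse step. A self-contained sketch of just that decoding step:

#include <cstddef>
#include <vector>

// Decode candidate k into per-level total step counts, where level i may
// take 1..cycle_max[i] substeps per step of level i-1.
std::vector<int> decode_candidate (int k, const std::vector<int>& cycle_max)
{
    std::vector<int> cycles(cycle_max.size());
    cycles[0] = 1;                                        // level 0: one step
    for (std::size_t i = 1; i < cycle_max.size(); ++i) {
        cycles[i] = (1 + k % cycle_max[i]) * cycles[i-1]; // grab a digit
        k /= cycle_max[i];                                // shift over
    }
    return cycles;
}

With cycle_max = {1,2,2}, limit is 4 and the candidates decode to the total-step patterns (1,1,1), (1,2,2), (1,1,2) and (1,2,4); the loop above then keeps whichever pattern minimizes estimated work per unit of advanced time.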
diff --git a/Src/AmrTask/Amr/Make.package b/Src/AmrTask/Amr/Make.package deleted file mode 100644 index 38448c44473..00000000000 --- a/Src/AmrTask/Amr/Make.package +++ /dev/null @@ -1,9 +0,0 @@ -AMRLIB_BASE=EXE - -C$(AMRLIB_BASE)_sources += AMReX_AmrAsync.cpp AMReX_AmrLevelAsync.cpp - -C$(AMRLIB_BASE)_headers += AMReX_Amr.H AMReX_AmrLevel.H AMReX_AmrLevelAsync.H AMReX_Derive.H AMReX_LevelBld.H AMReX_StateData.H \ - AMReX_StateDescriptor.H AMReX_PROB_AMR_F.H AMReX_AuxBoundaryData.H AMReX_Extrapolater.H - -VPATH_LOCATIONS += $(AMREX_HOME)/Src/Amr $(AMREX_HOME)/Src/AmrTask/Amr -INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Amr $(AMREX_HOME)/Src/AmrTask/Amr diff --git a/Src/AmrTask/Amr/Makefile b/Src/AmrTask/Amr/Makefile deleted file mode 100755 index 17c8d199fdb..00000000000 --- a/Src/AmrTask/Amr/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -include ../arch.common - -OBJECTS= AMReX_AmrTask.o - -AMRLIB= AMRTask.a - -all: $(AMRLIB) - -$(AMRLIB): $(OBJECTS) - ar rv $(AMRLIB) $(OBJECTS) - - -INCLUDE += -DBL_USE_MPI -DBL_USE_OMP -DBL_SPACEDIM=3 -DAMREX_SPACEDIM=3 -DBL_FORT_USE_UNDERSCORE -DBL_Linux - -AMReX_AmrTask.o: AMReX_AmrTask.cpp - $(C++) $(C++FLAGS) -I./ -I../../Base -I../../Amr -I../../AmrCore -I../../Boundary -I../graph -I$(INCLUDE) -c AMReX_AmrTask.cpp -o AMReX_AmrTask.o - -.PHONY: clean - -clean: - $(RM) $(OBJECTS) - $(RM) *.a - diff --git a/Src/AmrTask/AmrCore/AMReX_FillPatchUtil.H b/Src/AmrTask/AmrCore/AMReX_FillPatchUtil.H deleted file mode 100644 index 767e668257a..00000000000 --- a/Src/AmrTask/AmrCore/AMReX_FillPatchUtil.H +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef AMREX_FillPatchUtil_H_ -#define AMREX_FillPatchUtil_H_ - -#include -#include -#include -#include -#include - -namespace amrex -{ - void FillPatchSingleLevel (MultiFab& mf, Real time, - const Vector<MultiFab*>& smf, const Vector<Real>& stime, - int scomp, int dcomp, int ncomp, - const Geometry& geom, PhysBCFunctBase& physbcf); - - void FillPatchTwoLevels (MultiFab& mf, Real time, - const Vector<MultiFab*>& cmf, const Vector<Real>& ct, - const Vector<MultiFab*>& fmf, const Vector<Real>& ft, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - PhysBCFunctBase& cbc, PhysBCFunctBase& fbc, - const IntVect& ratio, - Interpolater* mapper, const BCRec& bcs); - - void FillPatchTwoLevels (MultiFab& mf, Real time, - const Vector<MultiFab*>& cmf, const Vector<Real>& ct, - const Vector<MultiFab*>& fmf, const Vector<Real>& ft, - int scomp, int dcomp, int ncomp, - const Geometry& cgeom, const Geometry& fgeom, - PhysBCFunctBase& cbc, PhysBCFunctBase& fbc, - const IntVect& ratio, - Interpolater* mapper, const Vector<BCRec>& bcs); -} - -#endif diff --git a/Src/AmrTask/AmrCore/AMReX_FillPatchUtil.cpp b/Src/AmrTask/AmrCore/AMReX_FillPatchUtil.cpp deleted file mode 100644 index 39d3fb74b4c..00000000000 --- a/Src/AmrTask/AmrCore/AMReX_FillPatchUtil.cpp +++ /dev/null @@ -1,176 +0,0 @@ -#include -#include -#include -#include - -#ifdef AMREX_USE_EB -#include -#endif - -#ifdef _OPENMP 
-#include <omp.h> -#endif - -namespace amrex -{ - void FillPatchSingleLevel (MultiFab& mf, Real time, - const Vector<MultiFab*>& smf, const Vector<Real>& stime, - int scomp, int dcomp, int ncomp, - const Geometry& geom, PhysBCFunctBase& physbcf) - { - BL_PROFILE("FillPatchSingleLevel"); - - BL_ASSERT(scomp+ncomp <= smf[0]->nComp()); - BL_ASSERT(dcomp+ncomp <= mf.nComp()); - BL_ASSERT(smf.size() == stime.size()); - BL_ASSERT(smf.size() != 0); - - if (smf.size() == 1) - { - mf.copy(*smf[0], scomp, dcomp, ncomp, 0, mf.nGrow(), geom.periodicity()); - } - else if (smf.size() == 2) - { - BL_ASSERT(smf[0]->boxArray() == smf[1]->boxArray()); - MultiFab raii; - MultiFab * dmf; - int destcomp; - bool sameba; - if (mf.boxArray() == smf[0]->boxArray()) { - dmf = &mf; - destcomp = dcomp; - sameba = true; - } else { - raii.define(smf[0]->boxArray(), smf[0]->DistributionMap(), ncomp, 0, - MFInfo(), smf[0]->Factory()); - - dmf = &raii; - destcomp = 0; - sameba = false; - } - -#ifdef _OPENMP -#pragma omp parallel -#endif - for (MFIter mfi(*dmf,true); mfi.isValid(); ++mfi) - { - const Box& bx = mfi.tilebox(); - (*dmf)[mfi].linInterp((*smf[0])[mfi], - scomp, - (*smf[1])[mfi], - scomp, - stime[0], - stime[1], - time, - bx, - destcomp, - ncomp); - } - - if (sameba) - { - // Note that when sameba is true mf's BoxArray is nonoverlapping. - // So FillBoundary is safe. - mf.FillBoundary(dcomp,ncomp,geom.periodicity()); - } - else - { - int src_ngrow = 0; - int dst_ngrow = mf.nGrow(); - - mf.copy(*dmf, 0, dcomp, ncomp, src_ngrow, dst_ngrow, geom.periodicity()); - } - } - else { - amrex::Abort("FillPatchSingleLevel: high-order interpolation in time not implemented yet"); - } - - physbcf.FillBoundary(mf, dcomp, ncomp, time); - }
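FillPatchSingleLevel() above blends the two source states with FArrayBox::linInterp, i.e. plain linear interpolation in time between stime[0] and stime[1]. Conceptually (a sketch, not the actual fab kernel):

// Linear-in-time blend, per cell: t0 <= t <= t1.
inline double lin_interp (double phi0, double phi1,
                          double t0, double t1, double t)
{
    double w1 = (t - t0) / (t1 - t0);      // weight of the newer state
    return (1.0 - w1) * phi0 + w1 * phi1;  // w0 = (t1 - t)/(t1 - t0)
}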
fpc.ba_crse_patch.empty()) - { - MultiFab mf_crse_patch(fpc.ba_crse_patch, fpc.dm_crse_patch, ncomp, 0, MFInfo(), - *fpc.fact_crse_patch); - - FillPatchSingleLevel(mf_crse_patch, time, cmf, ct, scomp, 0, ncomp, cgeom, cbc); - - int idummy1=0, idummy2=0; - bool cc = fpc.ba_crse_patch.ixType().cellCentered(); - ignore_unused(cc); -#ifdef _OPENMP -#pragma omp parallel if (cc) -#endif - for (MFIter mfi(mf_crse_patch); mfi.isValid(); ++mfi) - { - int li = mfi.LocalIndex(); - int gi = fpc.dst_idxs[li]; - const Box& dbx = fpc.dst_boxes[li]; - - Vector bcr(ncomp); - amrex::setBC(dbx,fdomain,scomp,0,ncomp,bcs,bcr); - - mapper->interp(mf_crse_patch[mfi], - 0, - mf[gi], - dcomp, - ncomp, - dbx, - ratio, - cgeom, - fgeom, - bcr, - idummy1, idummy2, RunOn::Cpu); - } - } - } - - FillPatchSingleLevel(mf, time, fmf, ft, scomp, dcomp, ncomp, fgeom, fbc); - } -}//namespace diff --git a/Src/AmrTask/Makefile b/Src/AmrTask/Makefile deleted file mode 100644 index 5bdc7b13c56..00000000000 --- a/Src/AmrTask/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -include ./arch.common - -SUBDIRS = graph $(RTS_DIR) Amr tutorials/UnitTests/ AMFIter tutorials/MiniApps/HeatEquation - -.PHONY: build $(SUBDIRS) - -build: $(SUBDIRS) - -$(SUBDIRS): - $(MAKE) -C $@ - -.PHONY: all - -all: build - -clean: - $(foreach dir, $(SUBDIRS), $(MAKE) -C $(dir) clean;) diff --git a/Src/AmrTask/arch.common b/Src/AmrTask/arch.common deleted file mode 100644 index 940944a3fc6..00000000000 --- a/Src/AmrTask/arch.common +++ /dev/null @@ -1,4 +0,0 @@ -ROOT_PATH= /home/users/nnguyent/lbl/amrex/Src/AmrTask - -include $(ROOT_PATH)/arch/arch.mpi.generic -#include $(ROOT_PATH)/arch/arch.serial diff --git a/Src/AmrTask/arch/arch.mpi.generic b/Src/AmrTask/arch/arch.mpi.generic deleted file mode 100755 index 6772e25989c..00000000000 --- a/Src/AmrTask/arch/arch.mpi.generic +++ /dev/null @@ -1,64 +0,0 @@ -RM = rm -f -LN = ln -s -ECHO = echo - -C++ = mpicxx -CC = mpicc - -C++LINK = $(C++) -CLINK = $(C++) - -COPTIMIZATION = -O3 - -C++FLAGS += -std=c++11 $(COPTIMIZATION) -fopenmp -openmp #$(DEBUG) - -LDFLAGS += $(C++FLAGS) -LDLIBS = -lpthread - -RTS_DIR = $(ROOT_PATH)/rts_impls/MPI_Generic/ -INCLUDE = $(RTS_DIR) - -SEGSIZE = -DSEGMENT_SIZE=2147483648 - -######################################################################### -# End of the System dependent prefix -######################################################################### - - -######################################################################### -# # -# Suffixes for compiling most normal C++, C files # -# # -######################################################################### - -.SUFFIXES: -.SUFFIXES: .C .cxx .c .cpp .o - -.C.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(C++) $(C++FLAGS) -c $< - @$(ECHO) - -.cxx.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(C++) $(C++FLAGS) -c $< - @$(ECHO) - -.cpp.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(C++) $(C++FLAGS) -c $< - @$(ECHO) - -.c.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(CC) $(C++FLAGS) -c $< - @$(ECHO) - diff --git a/Src/AmrTask/arch/arch.serial b/Src/AmrTask/arch/arch.serial deleted file mode 100755 index ab28167c033..00000000000 --- a/Src/AmrTask/arch/arch.serial +++ /dev/null @@ -1,63 +0,0 @@ -RM = rm -f -LN = ln -s -ECHO = echo - -C++ = g++ -CC = gcc - -C++LINK = $(C++) -CLINK = $(C++) - -COPTIMIZATION = -O3 - -C++FLAGS += -std=c++11 
$(COPTIMIZATION) $(DEBUG) - -LDFLAGS += $(C++FLAGS) -LDLIBS = - -RTS_DIR = $(ROOT_PATH)/rts_impls/Serial/ -INCLUDE = $(RTS_DIR) - - -######################################################################### -# End of the System dependent prefix -######################################################################### - - -######################################################################### -# # -# Suffixes for compiling most normal C++, C files # -# # -######################################################################### - -.SUFFIXES: -.SUFFIXES: .C .cxx .c .cpp .o - -.C.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(C++) $(C++FLAGS) -c $< - @$(ECHO) - -.cxx.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(C++) $(C++FLAGS) -c $< - @$(ECHO) - -.cpp.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(C++) $(C++FLAGS) -c $< - @$(ECHO) - -.c.o: - @$(ECHO) - @$(ECHO) "Compiling Source File --" $< - @$(ECHO) "---------------------" - $(CC) $(C++FLAGS) -c $< - @$(ECHO) - diff --git a/Src/AmrTask/graph/AMReX_AbstractTask.H b/Src/AmrTask/graph/AMReX_AbstractTask.H deleted file mode 100644 index b191781d5b2..00000000000 --- a/Src/AmrTask/graph/AMReX_AbstractTask.H +++ /dev/null @@ -1,274 +0,0 @@ -#ifndef AMREX_ABSTRACT_TASK -#define AMREX_ABSTRACT_TASK -//Question? email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 08-07-2017 - -#include "AMReX_DataTypes.H" -#include -#include -#include -#include -#include -#include -#include -#include -#include "rts_taskimpl.H" -using namespace std; - -namespace amrex{ - - class TaskName{ - protected: - std::vector _ids; - public: - TaskName(){} - TaskName(int id){_ids.push_back(id);} - TaskName(int id0, int id1){_ids.push_back(id0); _ids.push_back(id1);} - TaskName(int id0, int id1, int id2){_ids.push_back(id0); _ids.push_back(id1); _ids.push_back(id2);} - TaskName(int id0, int id1, int id2, int id3){_ids.push_back(id0); _ids.push_back(id1); _ids.push_back(id2);_ids.push_back(id3);} - int& operator[](int i){return _ids[i];} - const int& operator[](int i) const{return _ids[i];} - TaskName& operator=(const TaskName &rhs){ - assert(_ids.size()==0 || _ids.size()== rhs.Dim()); - if(_ids.size()==0) - for(int i=0; i< rhs.Dim(); i++) - _ids.push_back(rhs[i]); - else //already have the same dim - for(int i=0; i< _ids.size(); i++) - _ids[i] = rhs[i]; - return *this; - } - bool operator== (const TaskName &rhs) const{ - if(_ids.size() != rhs.Dim()) return false; - for(int i=0; i< _ids.size(); i++) - if(_ids[i] != rhs[i]) return false; - return true; - } - bool operator!= (const TaskName &rhs) const{ - if(_ids.size() != rhs.Dim()) return true; - for(int i=0; i< _ids.size(); i++) - if(_ids[i] != rhs[i]) return true; - return false; - } - bool operator< (const TaskName &rhs) const{ - if(_ids.size() < rhs.Dim()) return true; - if(_ids.size() > rhs.Dim()) return false; - for(int i= _ids.size()-1; i>=0; i--){ - if(_ids[i] > rhs[i]) return false; - if(_ids[i] < rhs[i]) return true; - } - return false; - } - void SetSize(int dim){ - for(int i=0; i > > _dataMap; - size_t _size; - - public: - //return the total number of inputs/outputs of a task - size_t size(){ - size_t s=0; - std::map > >::iterator it= _dataMap.begin(); - while (it!= _dataMap.end()){ - std::map >::iterator tagIt= ((*it).second).begin(); - while (tagIt!= ((*it).second).end()){ - s+= (*tagIt).second.size(); - tagIt++; - } - it++; - } - return s; - } - bool 
isSatisfied(TaskName name, int tag=0){ - if(_dataMap.find(name) != _dataMap.end()){ - if(_dataMap[name].find(tag) != _dataMap[name].end()){ - return _dataMap[name][tag].size() >0; - } - } - return false; - } - bool empty(){return _size==0;} - void push_back(TaskName name, Data* d, int tag=0){ - _dataMap[name][tag].push(d); - _size++; - } - Data* pop_front(TaskName name, int tag=0){ - if(_dataMap[name][tag].size()==0) return NULL; - Data* d= _dataMap[name][tag].front(); - _dataMap[name][tag].pop(); - _size--; - return d; - } - std::queue< Data* >& GetDependencies(TaskName name, int tag=0){return _dataMap[name][tag];} - }; - - //! The most abstract task - class Task{ - protected: - TaskName _id; - DependencyMap _neighbors_in; - std::queue _outputs; - std::queue _newTasks; - bool _isPersistent; - bool _isMasterTask; - public: - Task():_isPersistent(true),_isMasterTask(false){} - Task(TaskName name):_isPersistent(true),_isMasterTask(false){_id= name;} - //Describe Data Dependency - virtual bool Dependency()=0; - //! What the task is supposed to do - virtual void Job()=0; - //! Once the task finished its computation, any actions should be taken (like create new taks)? - virtual void PostCompletion()=0; - TaskName MyName(){return _id;} - void SetName(TaskName id){ _id=id;} - void SetMaster(){_isMasterTask=true;} - - bool TestDependencies(){return Dependency();} - void RunJob(){Job();} - void RunPostCompletion(){PostCompletion();} - void Pull(TaskName src, char* d, size_t size, int tag=0); - void Push(TaskName dest, char* d, size_t size, int tag=0); - bool Depend_on(TaskName src, int tag=0){ - return _neighbors_in.isSatisfied(src, tag); - } - std::queue& GetOutputs(){return _outputs;} - std::queue& GetNewTasks(){return _newTasks;} - DependencyMap& GetInputs(){return _neighbors_in;} - void KeepTaskAlive(){_isPersistent=true;} - void SelfDestroy(){_isPersistent=false;} - bool isPersistent(){return _isPersistent;} - bool isMasterTask(){return _isMasterTask;} - - template void LocalAtomicAdd(T *addr, T val){ - LocalAtomicAdd_impl(addr, val); - } - template void GlobalAtomicAdd(T *addr, T val){ - GlobalAtomicAdd_impl(addr, val); - } - void barrierTask(){ - //BarrierTask_impl(); - } - void RegisterTask(Task* t){ - _newTasks.push(t); - } - }; - -#if 0 - //! This task is created to do LOCAL jobs (thus it is not migratable), when all data dependencies have been satisfied - class NonMigratableTask :public Task{ - protected: - TaskState st; - //! A task may depend on multiple pieces of data from another task - //! A task may send multiple pieces of data to another task - //DependencyMap neighbors_out; - public: - }; - - - //! This task can be migrated to a remote process (e.g. to be closer to data). - class MigratableTask :public Task{ - protected: - int origin;//original process - public: - virtual void pullData(); - }; - - - - - enum state_t{FRESH /*just created*/, PENDING/*waiting for data*/, READY/*data dependencies satisfied*/, RUNNING /*scheduled to run*/, FINISHED /*no more work to do*/, ERROR}; - class TaskState{ - private: - state_t _st; - - public: - TaskState():_st(PENDING){} - TaskState(state_t st):_st(st){} - void shift(); - void shift(state_t newState); - }; -#endif - - - - -}//end namespace - -#endif diff --git a/Src/AmrTask/graph/AMReX_AbstractTask.cpp b/Src/AmrTask/graph/AMReX_AbstractTask.cpp deleted file mode 100644 index ab5983ce73e..00000000000 --- a/Src/AmrTask/graph/AMReX_AbstractTask.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include -//Question? 
email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-24-2017 - -namespace amrex{ - void Task::Pull(TaskName src, char* d, size_t size, int tag){ - Data* data= _neighbors_in.pop_front(src, tag); - memcpy(d, data->GetBuffer(), size); - data->Free(); - } - void Task::Push(TaskName dest, char* d, size_t size, int tag){ - Data* data= new Data(_id, dest, size); - data->SetTag(tag); - memcpy(data->GetBuffer(), d, size); - _outputs.push(data); - } -} diff --git a/Src/AmrTask/graph/AMReX_Affinity.H b/Src/AmrTask/graph/AMReX_Affinity.H deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/Src/AmrTask/graph/AMReX_Affinity.cpp b/Src/AmrTask/graph/AMReX_Affinity.cpp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/Src/AmrTask/graph/AMReX_DataTypes.H b/Src/AmrTask/graph/AMReX_DataTypes.H deleted file mode 100644 index 2d1ec100158..00000000000 --- a/Src/AmrTask/graph/AMReX_DataTypes.H +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef AMREX_PRIMITIVE_TYPES -#define AMREX_PRIMITIVE_TYPES -//Question? email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-24-2017 - -#include -#include -#include -#include -#include -#include -#include -using namespace std; - -namespace amrex{ - //!This class is similar to IntVect, but it supports template - template - class PointVect{ - private: - int _vect[D]; - public: - class shift_hasher{ - private: - static const unsigned shift=0; - static const unsigned shift_stride=0; - shift_hasher(){ - shift=shift_stride= 8*sizeof(size_t)/D; - } - public: - size_t operator()(const PointVect& vec) const - { - size_t ret=_vect[0]; - for(int i=1; i& set(int i, int val){ - _vect[i]=val; - return *this; - } - PointVect& operator= (const PointVect& rhs) const{ - for(int i=0; i& operator= (const PointVect& rhs){ - for(int i=0; i& operator= (const int val) const{ - for(int i=0; i=0; i++){ - if(_vect[i]>rhs[i]) return false; - if(_vect[i]=0; i++){ - if(_vect[i]>rhs[i]) return false; - if(_vect[i] (const PointVect& rhs) const{ - for(int i=D-1; i>=0; i++){ - if(_vect[i]rhs[i]) return true; - } - return false; - } - bool operator>= (const PointVect& rhs) const{ - for(int i=D-1; i>=0; i++){ - if(_vect[i]rhs[i]) return true; - } - return true; - } - }; - - /* - template - class TaskTable{ - private: - std::unorderedMap, std::list, PointVect::shift_hasher> - public: - }; - */ -} - -#endif diff --git a/Src/AmrTask/graph/AMReX_TaskGraph.H b/Src/AmrTask/graph/AMReX_TaskGraph.H deleted file mode 100644 index 8095754dbe8..00000000000 --- a/Src/AmrTask/graph/AMReX_TaskGraph.H +++ /dev/null @@ -1,339 +0,0 @@ -#ifndef AMREX_ABSTRACT_TASKGRAPH -#define AMREX_ABSTRACT_TASKGRAPH -//Question? 
email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-24-2017 - -#include -#include -#include -#include -#include -#include -#include -#include "AMReX_AbstractTask.H" -using namespace std; - -namespace amrex{ - - template - class BlockMapping{ - private: - PointVect _first, _last, _range, _size; - size_t _linearSize; - size_t _totalSize; - - public: - BlockMapping(PointVect gSize, PointVect myProc, PointVect nProcs){ - for(int i=0; i gSize, int myProc, int nProcs){ - size_t block; - size_t remainder; - size_t totalSize=gSize[0]; - for(int i=1; i=0; i--){ - subSize= subSize/gSize[i]; - _last[i]=last_linear/subSize; - _first[i]=first_linear/subSize; - _range[i]= _last[i]-_first[i] +1; - _size[i]= _range[i]; - last_linear= last_linear%subSize; - first_linear= first_linear%subSize; - } - } - - PointVect first(){return _first;} - PointVect last(){return _last;} - PointVect range(){return _range;} - PointVect size(){return _size;} - size_t linearSize(){return _linearSize;} - size_t totalSize(){return _totalSize;} - }; - - enum RunningMode{ - _Push=0,//The task will not be scheduled until all data dependencies are satified. Data are pushed by other tasks without its consent. - _Pull//The task will be fetched and it will pull dependencies from other tasks - }; - - class SelfAssociate{ - public: - TaskName TaskAssociate(TaskName name){ - return name; - } - }; - - //!This class defines data and task spaces, as well as options to decompose these spaces - template - class AbstractTaskGraph{ - struct TaskComp { - bool operator() (const TaskName& lhs, const TaskName& rhs) const - { - return lhs _initialTasks; - std::map _taskPool; - //! A process can iterate over tasks that it owns. However, how application tasks are mapped to processes will be defined at higher level classes. - Task *_begin, *_end, *_current; - typename std::vector::iterator _currIt; - size_t _nLocalTasks; - RunningMode _mode; - - public: - AbstractTaskGraph(string graphName=""): _graphName(graphName), _rank(0), _nProcs(1){ - _begin=NULL; - _end=NULL; - _current=NULL; - _nLocalTasks=0; - _mode= _Push; - } - void DestroyGraph(){ - for(typename std::vector::iterator it= _initialTasks.begin(); it!= _initialTasks.end(); it++){ - delete (*it); - } - _initialTasks.clear(); - _taskPool.clear(); - } - void DestroyTask(Task* t){ - _taskPool.erase(t->MyName()); - } - int MyProc(){return _rank;} - int ProcCount(){return _nProcs;} - RunningMode GetRunningMode(){return _mode;} - string GetTaskName(){return _graphName;} - void GraphSynchronize(); - std::map &GetTaskPool(){return _taskPool;} - Task* LocateTask(TaskName name){ - if(_taskPool.find(name)!= _taskPool.end()) - return _taskPool[name]; - return NULL; - } - virtual int FindProcessAssociation(TaskName name){ //maps task name to process rank - } - //!First element stored in the process - Task* Begin(){ - return _begin; - } - //!Last element stored in the process - Task* End(){ - return _end; - } - //! The next element - Task* Next(){ - _currIt++; - _current= *_currIt; - return _current; - } - //! The current element - Task* Current(){ - return _current; - } - }; - - - /** - * \brief This class is useful when we need a task graph to partition an irregular geometry. - * Tasks are created from a list of names (for example, box/data tile names), which can have multiple dimensions. - * These tasks are stored in a distributed hash map and can be accessed in constant time. 
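The template parameter lists in this header were eaten when the diff was extracted (every `<...>` span is gone), so the BlockMapping constructors read as bare `PointVect`/`int` signatures with truncated loops. For the 1-D constructor, the surviving `block`/`remainder`/`totalSize` locals suggest the conventional block distribution with the remainder assigned to the lowest ranks. A standalone sketch of that arithmetic under that assumption (the helper name blockRange is hypothetical):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

// [first, last] = linear task range owned by rank myProc when totalSize tasks
// are split over nProcs ranks; the first (totalSize % nProcs) ranks get one extra.
void blockRange(std::size_t totalSize, int myProc, int nProcs,
                std::size_t& first, std::size_t& last)
{
    const std::size_t block     = totalSize / nProcs;
    const std::size_t remainder = totalSize % nProcs;
    const std::size_t rank      = static_cast<std::size_t>(myProc);
    first = rank * block + std::min(rank, remainder);
    last  = first + block - 1 + (rank < remainder ? 1 : 0);
}

int main()
{
    std::size_t first, last;
    for (int r = 0; r < 3; ++r) {             // 10 tasks over 3 ranks: 4,3,3
        blockRange(10, r, 3, first, last);
        std::cout << "rank " << r << ": [" << first << "," << last << "]\n";
    }
}
```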
- * A task can create another task dynamically and set it location (i.e., which process owns the newly created task). - * The default location is that of the the parent task. - */ - template - class ArrayGraph: public AbstractTaskGraph{ - protected: - string _graphName; - BlockMapping *_taskMap; - PointVect _graphSize; - - public: - //! Create a 1D Task Graph - ArrayGraph(string graphName, int graphSize, int rank, int nProcs): _graphName(graphName), _graphSize(graphSize){ - assert(D==1); - AbstractTaskGraph::_nProcs= nProcs; - AbstractTaskGraph::_rank= rank; - _taskMap= new BlockMapping<1>(PointVect<1>(graphSize), PointVect<1>(rank), PointVect<1>(nProcs)); - for(int i=_taskMap->first()[0], idx=0; i<=_taskMap->last()[0]; i++, idx++){ - TaskName name(i); - T *t= new T(); - t->SetName(name); - AbstractTaskGraph::_initialTasks.push_back(t); - AbstractTaskGraph::_taskPool[name]= t; - } - AbstractTaskGraph::_begin= *(AbstractTaskGraph::_initialTasks.begin()); - AbstractTaskGraph::_end= *(AbstractTaskGraph::_initialTasks.end()); - AbstractTaskGraph::_currIt= AbstractTaskGraph::_initialTasks.begin(); - AbstractTaskGraph::_current= *(AbstractTaskGraph::_currIt); - AbstractTaskGraph::_mode= _Push; - } - //! Create a multidimensional graph and LINEARLY map it to processors - ArrayGraph(string graphName, PointVect graphSize, int rank, int nProcs): _graphName(graphName){ - _graphSize= graphSize; - AbstractTaskGraph::_nProcs= nProcs; - AbstractTaskGraph::_rank= rank; - _taskMap= new BlockMapping(graphSize, rank, nProcs); - PointVect p=_taskMap->first(); - for(int i=0; i<_taskMap->linearSize(); i++){ - TaskName name; - name.SetSize(D); - for(int j=0; jSetName(name); - AbstractTaskGraph::_initialTasks.push_back(t); - AbstractTaskGraph::_taskPool[name]= t; - for(int d=0; d::_begin= *(AbstractTaskGraph::_initialTasks.begin()); - AbstractTaskGraph::_end= *(AbstractTaskGraph::_initialTasks.end()); - AbstractTaskGraph::_currIt= AbstractTaskGraph::_initialTasks.begin(); - AbstractTaskGraph::_current= *(AbstractTaskGraph::_currIt); - AbstractTaskGraph::_mode= _Push; - } - int FindProcessAssociation(TaskName name){ - A associate; - TaskName n= associate.TaskAssociate(name); - size_t block_size= _taskMap->totalSize()/AbstractTaskGraph::ProcCount(); - size_t val=n[0]; - size_t stride=1; - for(int d=1; d* GetTaskMap(){return &_taskMap;} - void Destroy(){ - for(typename std::vector::iterator it= AbstractTaskGraph::_initialTasks.begin(); it!= AbstractTaskGraph::_initialTasks.end(); it++){ - delete (*it); - } - AbstractTaskGraph::_initialTasks.clear(); - AbstractTaskGraph::_taskPool.clear(); - delete _taskMap; - } - }; - -#if 0 - //! This class should be used when we want to create a rectangular task space (data and/or time). - template - class CartesianGraph: public AbstractTaskGraph{ - protected: - string _graphName; - BlockMapping* _taskMap; - - public: - //create a multi-dimensional taskGraph using P processes - CartesianGraph(string graphName="", PointVect graphSize, PointVect Prank, PointVect Psize){ - _taskMap= new BlockMapping(graphSize, Prank, Psize); - if(_taskMap.size() !=0){ - TaskName name(_taskMap.first()[0]); - size_t nTasks= _taskMap.size()[0]; - for (int d=1; dfirst()[d], idx=0; i<=_taskMap->last()[d]; i++, idx++){ - T *t= new T(); - t->SetName(name); - AbstractTaskGraph::_initialTasks.push_back(t); - AbstractTaskGraph::_taskPool[name]= t; - } - } - } - - - } - BlockMapping* GetTaskMap(){return &_taskMap;} - }; -#endif - - -#if 0 - - //Data are distributed linearly, though a multi-dimensional key (e.g. 
task name) is required to locate them - template - class LinearMapping::public AbstractMapping{ - private: - std::unorderedMap, std::list, PointVect::shift_hasher> _data; - public: - LinearMapping(std::vector< PointVect > nameVect){ - _begin[0]= nameVect[0]; - _end[0]= nameVect[nameVect.size()-1]; - _size[0]= nameVect.size(); - } - }; - - // - - template - class CyclicMapping: public AbstractMapping{ - private: - - public: - CyclicMapping(PointVect gSize, PointVect myProc, PointVect nProcs){ - for(int i=0; i - class BlockyCylicMapping{ - private: - CyclicMapping(PointVect gSize, PointVect blockSize, PointVect myProc, PointVect nProcs){ - for(int i=0; i -//Question? email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-24-2017 - -namespace amrex{ - - -} diff --git a/Src/AmrTask/graph/Makefile b/Src/AmrTask/graph/Makefile deleted file mode 100755 index 82dddfef0d6..00000000000 --- a/Src/AmrTask/graph/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -include ../arch.common - -GRAPH_LIB= graph.a - -OBJECTS= AMReX_AbstractTask.o AMReX_TaskGraph.o - -all: $(GRAPH_LIB) - -$(GRAPH_LIB): $(OBJECTS) - ar rv $(GRAPH_LIB) $(OBJECTS) - -$(OBJECTS): AMReX_AbstractTask.H AMReX_AbstractTask.cpp AMReX_TaskGraph.H AMReX_TaskGraph.cpp - -AMReX_AbstractTask.o: AMReX_AbstractTask.cpp AMReX_AbstractTask.H - $(C++) $(C++FLAGS) -I. -I$(INCLUDE) -c AMReX_AbstractTask.cpp -o AMReX_AbstractTask.o - -AMReX_TaskGraph.o: AMReX_TaskGraph.cpp AMReX_TaskGraph.H - $(C++) $(C++FLAGS) -I./ -I$(INCLUDE) -c AMReX_TaskGraph.cpp -o AMReX_TaskGraph.o - -.PHONY: clean - -clean: - $(RM) $(OBJECTS) - $(RM) graph.a diff --git a/Src/AmrTask/graph/RTS.H b/Src/AmrTask/graph/RTS.H deleted file mode 100644 index 5e230a86827..00000000000 --- a/Src/AmrTask/graph/RTS.H +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef _RTS -#define _RTS -#include "AMReX_AbstractTask.H" -#include "AMReX_TaskGraph.H" -#include -#include -#include "rts_graphimpl.H" -#include - -using namespace std; - -namespace amrex{ - struct _workerThreadInfo{ - int _tid; //thread id in local group - int _size; //number of threads in the group - }; - - struct _threadInfo{ - bool _isComm; //whether this thread handles communication - int _wtid; //worker thread id (-1 if this thread is decicated to communication) - int _nWts; //number of thread groups - }; - - class RTS{ - private: - int _nWrks; - void RTS_Init(); - int _rank, _nProcs; - - public: - RTS(){ - _nWrks=1; - char* nWrks= getenv("NWORKERS"); - if(nWrks) _nWrks= atoi(nWrks); - } - RTS(int nWrks):_nWrks(nWrks){} - int ProcCount(); - int MyProc(); - int WorkerThreadCount(); - int MyWorkerThread(); - void Init(); //Build the runtime system from scratch - void Init(int rank, int nProcs);//Build the runtime system on pre-existing MPI processes - void Iterate(void *graph); - void Finalize(); - double Time(); - void Barrier(); - template void ReductionSum(T *local, T *global, int length, int root){ - ReductionSum_impl(local, global, length, root); - } - }; -} -#endif diff --git a/Src/AmrTask/make_defaults/Cori b/Src/AmrTask/make_defaults/Cori deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/Src/AmrTask/make_defaults/Edison b/Src/AmrTask/make_defaults/Edison deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/Src/AmrTask/make_defaults/Summit-dev b/Src/AmrTask/make_defaults/Summit-dev deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/Src/AmrTask/rts_impls/MPI_Generic/Makefile b/Src/AmrTask/rts_impls/MPI_Generic/Makefile deleted file mode 100755 index 050bbab1718..00000000000 
--- a/Src/AmrTask/rts_impls/MPI_Generic/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../../arch.common - -RTS_LIB= rts.a - -OBJECTS= rts.o sysInfo.o dl_malloc.o - -all: $(RTS_LIB) - -$(RTS_LIB): $(OBJECTS) - ar rv $(RTS_LIB) $(OBJECTS) - -#$(OBJECTS): rts.C - -rts.o: rts.C - $(C++) $(C++FLAGS) $(SEGSIZE) -DONLY_MSPACES=1 -I. -I../Utils/ -I$(INCLUDE) -I../../graph -c rts.C -o rts.o - -sysInfo.o: ../Utils/sysInfo.C - $(C++) $(C++FLAGS) -I../Utils/ -I$(INCLUDE) -c ../Utils/sysInfo.C -o sysInfo.o - -dl_malloc.o: - $(CC) -DONLY_MSPACES=1 -I../Utils/ -I$(INCLUDE) -O2 -c ../Utils/dl_malloc.c -o dl_malloc.o - -.PHONY: clean - -clean: - $(RM) $(OBJECTS) - $(RM) *.a diff --git a/Src/AmrTask/rts_impls/MPI_Generic/README b/Src/AmrTask/rts_impls/MPI_Generic/README deleted file mode 100644 index 9577b83c562..00000000000 --- a/Src/AmrTask/rts_impls/MPI_Generic/README +++ /dev/null @@ -1,17 +0,0 @@ -This is a runtime version that employs 2-sided MPI (i.e. MPI-1) to implement fundamental routines required to schedule an AMReX task dependency graph. -The runtime comprises a set of MPI processes, each consisting of multiple WORKER threads. -The runtime can be configured to run with 1 process per compute node (i), per NUMA node (ii), or per core (iii). -For cases (i) and (ii), there can be multiple WORKER threads per process (one WORKER thread per NUMA node or per core). -Also, the runtime can dedicate one or a few cores per compute node to handle communication in a responsive fashion. -For case (ii), WORKER thread and Communication HANDLER thread share the same core. - -Multiple worker threads may share a single task queue (for load balancing purpose). -Each worker thread also has a private queue serving as a task buffer, allowing scheduling latency and lock/unlock cost to be reduced. - -Note: one of the primary goals of this runtime implementation is PORTABILITY. - -Thus, there is no special assumption about MPI mode to be made. -For example, the runtime should run correctly whether MPI supports MPI_THREAD_FUNNELED (common scenario) or MPI_THREAD_MULTIPLE (not so common) mode. - -Also, we use Pthreads to implement WORKER threads. -At the application level, the programmer can use OpenMP to parallelize each task. diff --git a/Src/AmrTask/rts_impls/MPI_Generic/mylock.h b/Src/AmrTask/rts_impls/MPI_Generic/mylock.h deleted file mode 100644 index bddb8ed6970..00000000000 --- a/Src/AmrTask/rts_impls/MPI_Generic/mylock.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef MYLOCK -#define MYLOCK - -#include - -class MyLock -{ - private: - pthread_mutex_t _lock; - - public: - MyLock(){ - pthread_mutex_init(&_lock, NULL); - } - ~MyLock(){ - pthread_mutex_destroy(&_lock); - } - void lock() - { - pthread_mutex_lock(&_lock); - } - void unlock() - { - pthread_mutex_unlock(&_lock); - } -}; -#endif diff --git a/Src/AmrTask/rts_impls/MPI_Generic/rts.C b/Src/AmrTask/rts_impls/MPI_Generic/rts.C deleted file mode 100644 index 6dddd3b12d5..00000000000 --- a/Src/AmrTask/rts_impls/MPI_Generic/rts.C +++ /dev/null @@ -1,593 +0,0 @@ -//Question? 
email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 08-14-2017 -#include "AMReX_AbstractTask.H" -#include "AMReX_TaskGraph.H" -#include "RTS.H" -#include -#include -#include -#include -#include "sysInfo.H" -#include "mylock.h" -#include - -#include -#include -using namespace std; -#include - -namespace amrex{ - //we don't use template for task and message queuese since in the future we may implement them in different ways - class _TaskQueue { - private: - std::queue _queue; - MyLock _lock; - bool _NoLoad; //queue is empty and no onflight task - public: - _TaskQueue():_NoLoad(true){} - void push(Task* t){ - _lock.lock(); - _queue.push(t); - _NoLoad=false; - _lock.unlock(); - } - Task* pop(){ - _lock.lock(); - if(_queue.size()>0) { - Task*t = _queue.front(); - _queue.pop(); - _lock.unlock(); - return t; - } - _lock.unlock(); - return NULL; - } - Task* front(){ - _lock.lock(); - if(_queue.size()>0) { - Task*t = _queue.front(); - return t; - } - _lock.unlock(); - return NULL; - } - void SetNoLoad(){ - _lock.lock(); - if(_queue.size()==0)_NoLoad=true; - _lock.unlock(); - } - bool NoLoad(){return _NoLoad;} - size_t size(){ return _queue.size();} - }; - - class _MessageQueue{ - private: - std::queue _queue; - MyLock _lock; - public: - void push(Data* &d){ - _lock.lock(); - _queue.push(d); - _lock.unlock(); - } - Data* pop(){ - _lock.lock(); - if(_queue.size()>0) { - Data*d = _queue.front(); - _queue.pop(); - _lock.unlock(); - return d; - } - _lock.unlock(); - return NULL; - } - size_t size(){ return _queue.size();} - }; - - struct RtsDomain{ - _TaskQueue _WaitingQueue; - _TaskQueue _DataFetchingQueue; //used in Pull model - _TaskQueue _ReadyQueue; - _TaskQueue _RunningQueue; - _TaskQueue _ToCreateTaskQueue; - _TaskQueue _ToDestroyTaskQueue; - _MessageQueue _MsgQueue; - pthread_t *_threads; - _TaskQueue *_TaskBuffers; - int _size; - volatile int _activeSlaves; - MyLock _lock; - RtsDomain(){_threads=NULL; _size=0; _activeSlaves=0;}; - ~RtsDomain(){ - assert(_WaitingQueue.size()==0); - assert(_DataFetchingQueue.size()==0); - assert(_ReadyQueue.size()==0); - assert(_RunningQueue.size()==0); - assert(_ToCreateTaskQueue.size()==0); - assert(_ToDestroyTaskQueue.size()==0); - assert(_MsgQueue.size()==0); - free(_threads); - } - }; - int numa_nodes; - RtsDomain *dom; - int **_stopSignal; - AbstractTaskGraph* graph; - char* _DedicatedScheduler; - std::queue< std::pair > _SendRequests; - std::queue< std::pair > _RecvRequests; - std::queue _recvBuffers; - MyLock _l; -#define MAX_RECV_QUEUE 4 - - int RTS::ProcCount(){ - return _nProcs; - } - - int RTS::MyProc(){ - return _rank; - } - - int RTS::WorkerThreadCount(){ - return _nWrks; - } - - int RTS::MyWorkerThread(){ - return 0; - } - - struct argT { - int numaID; - int tid; - int nThreads; - }; - void run(void* threadInfo){ - argT *args= (argT*)threadInfo; - int numaID= args->numaID; - int tid= args->tid; - int nThreads= args->nThreads; - dom[numaID]._lock.lock(); - if(dom[numaID]._activeSlaves==0){ - _stopSignal[numaID]= new int[nThreads]; - } - _stopSignal[numaID][tid]=0; - dom[numaID]._activeSlaves++; - dom[numaID]._lock.unlock(); - if(dom[numaID]._TaskBuffers[tid].size()==0) dom[numaID]._TaskBuffers[tid].SetNoLoad(); - while(true){ - //if local task queue is empty, pull at most 2 tasks from the global queue - if(dom[numaID]._TaskBuffers[tid].size()==0){ - int nReadyTasks= dom[numaID]._ReadyQueue.size(); - if(nReadyTasks){ - Task* t= dom[numaID]._ReadyQueue.pop(); - if(t) dom[numaID]._TaskBuffers[tid].push(t); - 
if(dom[numaID]._ReadyQueue.size() >= nThreads){ //get one more task - Task* t1= dom[numaID]._ReadyQueue.pop(); - if(t1) dom[numaID]._TaskBuffers[tid].push(t1); - } - } - } - - if(dom[numaID]._TaskBuffers[tid].size()){ - Task* t= dom[numaID]._TaskBuffers[tid].pop(); - if(t){ - t->RunJob(); - t->RunPostCompletion(); - //Flush all outputs - while(t->GetOutputs().size()>0){ - Data* outdata= t->GetOutputs().front(); - t->GetOutputs().pop(); - if(outdata){ - TaskName dst= outdata->GetRecipient(); - int tag= outdata->GetTag(); - if(graph->LocateTask(dst)){ - graph->LocateTask(dst)->GetInputs().push_back(outdata->GetSource(), outdata, tag); - }else dom[numaID]._MsgQueue.push(outdata); - } - } - //process newly created tasks - while(t->GetNewTasks().size()>0){ - Task* nt= t->GetNewTasks().front(); - t->GetNewTasks().pop(); - dom[numaID]._ToCreateTaskQueue.push(nt); - } - //keep or destroy current task - if(t->isPersistent()){ - if(t->Dependency()){ - dom[numaID]._ReadyQueue.push(t); - }else{ - dom[numaID]._WaitingQueue.push(t); - } - }else{ - dom[numaID]._ToDestroyTaskQueue.push(t); - } - if(dom[numaID]._TaskBuffers[tid].size()==0){ - if(dom[numaID]._TaskBuffers[tid].NoLoad()==false) dom[numaID]._TaskBuffers[tid].SetNoLoad(); - } - } - } - if(_stopSignal[numaID][tid]) break; - } - free(args); - dom[numaID]._lock.lock(); - dom[numaID]._activeSlaves--; - if(dom[numaID]._activeSlaves==0){ - free(_stopSignal[numaID]); - } - dom[numaID]._lock.unlock(); - } - - void InitializeMPI(){ - int provided; - MPI_Init_thread(0, 0, MPI_THREAD_FUNNELED, &provided); - if(provided == MPI_THREAD_SINGLE){//with this MPI, process can't spawn threads - cerr << "Spawning threads is not allowed by the MPI implementation" << std::endl;; - } - } - - void RTS::RTS_Init(){ - NodeHardware hw = query_node_hardware(); - - assert(_nWrks>0 && _nWrks <= hw.core_per_numa * hw.numa_per_node); - - bool numaAware=true; - char* env= getenv("ENABLE_NUMA_AWARE"); - numaAware= (env!=NULL); - if(numaAware){ //the process covers multiple NUMA nodes - numa_nodes= hw.numa_per_node; - int worker_per_numa = _nWrks / numa_nodes; - int remainder= _nWrks % numa_nodes; - int r=0; - int base=0; - int localID=-1; - //create a list of persistent threads for each NUMA node - cpu_set_t cpuset; - pthread_attr_t attr; - pthread_attr_init(&attr); - dom= new RtsDomain[numa_nodes]; - _stopSignal= new int*[numa_nodes]; - for(int i=0; inumaID= domNo; - arg->tid= localID; - arg->nThreads= worker_per_numa+ (rnumaID= 0; - arg->tid= j; - arg->nThreads=_nWrks; - pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &mycpuset); - int err = pthread_create(&(dom[0]._threads[j]), &attr, (void*(*)(void*))run, arg); - }else dom[0]._threads[j]= pthread_self();// master thread - dom[0]._size++; - j++; - } - } - } - } - - void RTS::Init(){ - InitializeMPI(); - MPI_Comm_rank(MPI_COMM_WORLD, &_rank); - MPI_Comm_size(MPI_COMM_WORLD, &_nProcs); - RTS_Init(); - } - - void RTS::Init(int rank, int nProcs){ - _rank= rank; - _nProcs= nProcs; - RTS_Init(); - } - - void RTS::Finalize(){ - for(int d=0; d*)taskgraph; - //visit all initial tasks - { - Task* t= graph->Begin(); - int numaID=0; - while(t != graph->End()){ - if(graph->GetRunningMode()== _Push) - { - if(t->Dependency()){//all data have arrived - dom[numaID]._ReadyQueue.push(t); - }else{ - dom[numaID]._WaitingQueue.push(t); - } - }else{//Pull mode - dom[numaID]._DataFetchingQueue.push(t); - } - t = graph->Next(); - numaID= (numaID+1)%numa_nodes; //just use a simple round robin distribution for now - } - } - bool 
keepRunning=true; - //allocate a static buffer for incoming messages - size_t max_buf_size=2<<24; - if(env) max_buf_size= atoi(env); - for(int i=0; i< MAX_RECV_QUEUE; i++){ - char* _recvBuffer= new char[max_buf_size]; - _recvBuffers.push(_recvBuffer); - } - - dom[0]._TaskBuffers[0].SetNoLoad(); - while (keepRunning){ - //Handle communication - { - if(graph->GetRunningMode()== _Push) - { - //Process outgoing messages for all domains - for(int d=0; dGetRecipient(); - if(graph->LocateTask(name)){ - Task* t= graph->LocateTask(name); - t->GetInputs().push_back(msg->GetSource(), msg, msg->GetTag()); - } - else{ //Recipient is either on a remote node or has not been created - int destRank= msg->GetDestRank(); - if(destRank==-1) destRank= graph->FindProcessAssociation(name); //the runtime handles the mapping - if(destRank== MyProc()) dom[d]._MsgQueue.push(msg); //keep in local message queue since recipient task has not been created - else {//remote node - MPI_Request* req= new MPI_Request; - MPI_Isend(msg->SerializeData(), msg->GetSerializedSize(), MPI_CHAR, destRank, 0, MPI_COMM_WORLD, req); - _SendRequests.push(std::pair(req, msg)); - } - } - } - } - } - //prepost receives - if(_RecvRequests.size() < MAX_RECV_QUEUE){ - MPI_Request* req= new MPI_Request; - char* _recvBuffer=NULL; - if(_recvBuffers.size()){ - _recvBuffer= _recvBuffers.front(); - _recvBuffers.pop(); - }else _recvBuffer= new char[max_buf_size]; - MPI_Irecv(_recvBuffer, max_buf_size, MPI_CHAR, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, req); - _RecvRequests.push(std::pair(req, _recvBuffer)); - } - //check send status - int nSendRequests= _SendRequests.size(); - for(int i=0; i p= _SendRequests.front(); - MPI_Request *req= p.first; - _SendRequests.pop(); - MPI_Test(req, &done, MPI_STATUS_IGNORE); - if(done){ - Data* d= p.second; - d->Free(); - free(req); - }else _SendRequests.push(p); - } - //check recv status - int nRecvRequests= _RecvRequests.size(); - for(int i=0; i p= _RecvRequests.front(); - MPI_Request *req= p.first; - _RecvRequests.pop(); - MPI_Test(req, &done, MPI_STATUS_IGNORE); - if(done){ - Data* msg= new Data(p.second); //deserialize - TaskName name= msg->GetRecipient(); - TaskName src= msg->GetSource(); - Task* t= graph->LocateTask(name); - if(t){ - t->GetInputs().push_back(msg->GetSource(), msg, msg->GetTag()); - }else dom[0]._MsgQueue.push(msg); - free(req); - }else _RecvRequests.push(p); - } - }else{ - } - } - //visit waiting tasks in all domains - if(graph->GetRunningMode()== _Push) - { //no else - for(int d=0; dDependency()){ - dom[d]._ReadyQueue.push(t); - }else{ - dom[d]._WaitingQueue.push(t); - } - } - } - } - - if(!_DedicatedScheduler){ - //pull one task directly from global task queue - if(graph->GetRunningMode()== _Push){ - int nReadyTasks= dom[0]._ReadyQueue.size(); - if(nReadyTasks){ - Task* t= dom[0]._ReadyQueue.pop(); - if(t){ - t->RunJob(); - t->RunPostCompletion(); - //Flush all outputs - while(t->GetOutputs().size()>0){ - Data* outdata= t->GetOutputs().front(); - t->GetOutputs().pop(); - if(outdata){ - TaskName dst= outdata->GetRecipient(); - int tag= outdata->GetTag(); - if(graph->LocateTask(dst)){ - graph->LocateTask(dst)->GetInputs().push_back(outdata->GetSource(), outdata, tag); - }else dom[0]._MsgQueue.push(outdata); - } - } - //process newly created tasks for domain 0 - while(t->GetNewTasks().size()>0){ - Task* nt= t->GetNewTasks().front(); - t->GetNewTasks().pop(); - graph->GetTaskPool()[nt->MyName()]=nt; - if(nt->Dependency()){//all data have arrived - dom[0]._ReadyQueue.push(nt); - }else{ - 
dom[0]._WaitingQueue.push(nt); - } - } - //keep or destroy task for domain 0 - if(t->isPersistent()){ - if(t->Dependency()){ - dom[0]._ReadyQueue.push(t); - }else{ - dom[0]._WaitingQueue.push(t); - } - }else{ - //remove task from the task pool and delete it - graph->DestroyTask(t); - } - } - } - } - } - - - //service new task creation and destroy for other workers - for(int d=0; dGetTaskPool()[nt->MyName()]=nt; - if(nt->Dependency()){//all data have arrived - dom[d]._ReadyQueue.push(nt); - }else{ - dom[d]._WaitingQueue.push(nt); - } - } - } - } - for(int d=0; dDestroyTask(ot); - } - } - } - - keepRunning=false; - for(int d=0; dGetTaskPool().size() || dom[d]._ToCreateTaskQueue.size() || dom[d]._ToDestroyTaskQueue.size()) { - keepRunning=true; - break; - } - } - } - }//end while (keepRunning) - //cancel all unused preposted requests - while(_SendRequests.size()){ - MPI_Cancel(_SendRequests.front().first); - free(_SendRequests.front().first); - free(_SendRequests.front().second); - _SendRequests.pop(); - } - - //free recv buffers if any left - while(_recvBuffers.size()){ - free(_recvBuffers.front()); - _recvBuffers.pop(); - } - } - - const double kMicro = 1.0e-6; - double RTS::Time() - { - struct timeval TV; - - const int RC = gettimeofday(&TV, NULL); - if(RC == -1) - { - printf("ERROR: Bad call to gettimeofday\n"); - return(-1); - } - return( ((double)TV.tv_sec) + kMicro * ((double)TV.tv_usec) ); - } - - void RTS::Barrier(){ - //nothing - } - -}//end namespace - diff --git a/Src/AmrTask/rts_impls/MPI_Generic/rts_graphimpl.H b/Src/AmrTask/rts_impls/MPI_Generic/rts_graphimpl.H deleted file mode 100644 index c4e2cab2ef7..00000000000 --- a/Src/AmrTask/rts_impls/MPI_Generic/rts_graphimpl.H +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef COLLECTIVE_IMPL -#define COLLECTIVE_IMPL - -//Question? email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-21-2017 - -#include -#include -using namespace std; -#include -#include -using std::is_same; - -namespace amrex{ - - template - void ReductionSum_impl(T *local, T *global, int length, int root){ - MPI_Datatype datatype; - if(is_same::value) datatype= MPI_DOUBLE; - MPI_Reduce(local, global, length, datatype, MPI_SUM, root, MPI_COMM_WORLD); - } - -}//end namespace - -#endif diff --git a/Src/AmrTask/rts_impls/MPI_Generic/rts_taskimpl.H b/Src/AmrTask/rts_impls/MPI_Generic/rts_taskimpl.H deleted file mode 100644 index a12d1866c35..00000000000 --- a/Src/AmrTask/rts_impls/MPI_Generic/rts_taskimpl.H +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef MYATOMICS -#define MYATOMICS - -//Question? email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-21-2017 - -#include -#include -using namespace std; -#include -#include "mylock.h" - -namespace amrex{ - extern MyLock _l; - template void LocalAtomicAdd_impl(T *addr, T val){ - _l.lock(); - *addr+= val; - _l.unlock(); - } - template void GlobalAtomicAdd_impl(T *addr, T val){ - assert(false);//not defined - } - -}//end namespace - -#endif diff --git a/Src/AmrTask/rts_impls/README b/Src/AmrTask/rts_impls/README deleted file mode 100644 index 7a2ef06c4e4..00000000000 --- a/Src/AmrTask/rts_impls/README +++ /dev/null @@ -1,6 +0,0 @@ -Given an AMReX task graph program, a runtime system operates the program execution. -In particular, the runtime system manages task creation and distribution, schedules tasks, and handles communication among tasks, etc. -Since a task graph representation exhibits partial orderings among tasks, the program can be operated in many ways. 
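One concrete example of how the MPI_Generic runtime above handles communication among tasks: its Iterate loop posts MPI_Isend/MPI_Irecv, then repeatedly pops the oldest request, calls MPI_Test, and re-queues it if incomplete. A minimal self-contained sketch of that poll-and-requeue pattern (illustrative names, not the deleted runtime itself):

```cpp
#include <mpi.h>
#include <queue>
#include <utility>
#include <vector>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // In-flight sends, oldest first, mirroring _SendRequests in rts.C.
    std::queue<std::pair<MPI_Request, std::vector<char>>> inflight;

    std::vector<char> payload(64, static_cast<char>(rank));
    MPI_Request req;
    MPI_Isend(payload.data(), static_cast<int>(payload.size()), MPI_CHAR,
              (rank + 1) % nprocs, 0, MPI_COMM_WORLD, &req);
    inflight.push({req, std::move(payload)}); // buffer must outlive the send

    // Matching receive so the example terminates.
    std::vector<char> rbuf(64);
    MPI_Recv(rbuf.data(), static_cast<int>(rbuf.size()), MPI_CHAR,
             (rank + nprocs - 1) % nprocs, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    // Poll-and-requeue: test the oldest request; if unfinished, put it back.
    while (!inflight.empty()) {
        auto p = std::move(inflight.front());
        inflight.pop();
        int done = 0;
        MPI_Test(&p.first, &done, MPI_STATUS_IGNORE);
        if (!done) inflight.push(std::move(p)); // try again on the next pass
        // when done, the request is MPI_REQUEST_NULL and the buffer dies here
    }
    MPI_Finalize();
}
```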
-Different runtime systems can employ different scheduling policies. -In this directory, we include interfaces to existing runtime systems. -The user can select the one best suited to a particular application and hardware architecture. diff --git a/Src/AmrTask/rts_impls/Serial/Makefile b/Src/AmrTask/rts_impls/Serial/Makefile deleted file mode 100755 index 9d581c7dd58..00000000000 --- a/Src/AmrTask/rts_impls/Serial/Makefile +++ /dev/null @@ -1,22 +0,0 @@ -include ../../arch.common - -OBJECTS= rts.o - -RTS_LIB= rts.a - -OBJECTS= rts.o - -all: $(RTS_LIB) - -$(RTS_LIB): $(OBJECTS) - ar rv $(RTS_LIB) $(OBJECTS) - -$(OBJECTS): rts.C - -rts.o: rts.C - $(C++) $(C++FLAGS) -I. -I$(INCLUDE) -I../../graph -c rts.C -o rts.o - -.PHONY: clean - -clean: - $(RM) $(OBJECTS) diff --git a/Src/AmrTask/rts_impls/Serial/rts.C b/Src/AmrTask/rts_impls/Serial/rts.C deleted file mode 100644 index ddd5604b30e..00000000000 --- a/Src/AmrTask/rts_impls/Serial/rts.C +++ /dev/null @@ -1,177 +0,0 @@ -#include "AMReX_AbstractTask.H" -#include "AMReX_TaskGraph.H" -#include "RTS.H" -//Question? email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-21-2017 - -#include -#include -using namespace std; -#include - -namespace amrex{ - typedef std::queue<Task*> _TaskQueue; - typedef std::queue<Data*> _MessageQueue; - _TaskQueue _WaitingQueue; - _TaskQueue _DataFetchingQueue; //used in Pull model - _TaskQueue _ReadyQueue; - _TaskQueue _RunningQueue; - _MessageQueue _MsgQueue; - - int RTS::ProcCount(){ - return 1; - } - - int RTS::MyProc(){ - return 0; - } - - int RTS::WorkerThreadCount(){ - return 1; - } - - int RTS::MyWorkerThread(){ - return 0; - } - - void RTS::Init(){ - } - - void RTS::Init(int rank, int nProcs){ - _rank=0; - _nProcs=1; - } - - void RTS::Finalize(){ - //Now, no task should be alive. Thus, this routine checks the contents of all task queues.
- assert(_WaitingQueue.size()==0); - assert(_DataFetchingQueue.size()==0); - assert(_ReadyQueue.size()==0); - assert(_RunningQueue.size()==0); - } - - void RTS::Iterate(void* taskgraph){ - AbstractTaskGraph* graph= (AbstractTaskGraph*)taskgraph; - //visit all initial tasks - { - Task* t= graph->Begin(); - while(t != graph->End()){ - if(graph->GetRunningMode()== _Push) - { - if(t->Dependency()){//all data have arrived - _ReadyQueue.push(t); - }else{ - _WaitingQueue.push(t); - } - }else{//Pull mode - _DataFetchingQueue.push(t); - } - t = graph->Next(); - } - } - bool keepRunning=true; - while (keepRunning){ - //Handle communication - { - if(graph->GetRunningMode()== _Push) - { - //Process messages - int nMsgs= _MsgQueue.size(); - for(int i=0; iGetRecipient(); - if(graph->LocateTask(name)){ - Task* t= graph->LocateTask(name); - t->GetInputs().push_back(msg->GetSource(), msg, msg->GetTag()); - } - else _MsgQueue.push(msg); //Recipient has not been created - } - }else{ - while(_DataFetchingQueue.size()){ - Task* t= _DataFetchingQueue.front(); - _DataFetchingQueue.pop(); - t->Dependency();//send active messages to pull data from source tasks - } - } - } - //visit waiting tasks (only in push mode) - if(graph->GetRunningMode()== _Push) - { //no else - int nWaitingTasks= _WaitingQueue.size(); - for(int i=0; iDependency()){ - _ReadyQueue.push(t); - }else{ - _WaitingQueue.push(t); - } - } - } - //Execute ready tasks - { - while(_ReadyQueue.size()){ - Task* t= _ReadyQueue.front(); - _ReadyQueue.pop(); - t->RunJob(); - t->RunPostCompletion(); - //Flush all outputs - while(t->GetOutputs().size()>0){ - Data* outdata= t->GetOutputs().front(); - t->GetOutputs().pop(); - TaskName dst= outdata->GetRecipient(); - int tag= outdata->GetTag(); - if(graph->LocateTask(dst)){ - graph->LocateTask(dst)->GetInputs().push_back(outdata->GetSource(), outdata, tag); - }else _MsgQueue.push(outdata); - } - //process newly created tasks - while(t->GetNewTasks().size()>0){ - Task* nt= t->GetNewTasks().front(); - t->GetNewTasks().pop(); - graph->GetTaskPool()[nt->MyName()]=nt; - if(nt->Dependency()){//all data have arrived - _ReadyQueue.push(nt); - }else{ - _WaitingQueue.push(nt); - } - } - //keep or destroy task - if(t->isPersistent()){ - if(t->Dependency()){ - _ReadyQueue.push(t); - }else{ - _WaitingQueue.push(t); - } - }else{ - //remove task from the task pool and delete it - graph->DestroyTask(t); - } - } - } - keepRunning= _WaitingQueue.size()>0 || _DataFetchingQueue.size()>0|| _ReadyQueue.size()>0|| _RunningQueue.size()>0|| _MsgQueue.size()>0 || graph->GetTaskPool().size()>0; - } - } - - const double kMicro = 1.0e-6; - double RTS::Time() - { - struct timeval TV; - - const int RC = gettimeofday(&TV, NULL); - if(RC == -1) - { - printf("ERROR: Bad call to gettimeofday\n"); - return(-1); - } - return( ((double)TV.tv_sec) + kMicro * ((double)TV.tv_usec) ); - } - - void RTS::Barrier(){ - //nothing - } - -}//end namespace - diff --git a/Src/AmrTask/rts_impls/Serial/rts_graphimpl.H b/Src/AmrTask/rts_impls/Serial/rts_graphimpl.H deleted file mode 100644 index 8563dd81626..00000000000 --- a/Src/AmrTask/rts_impls/Serial/rts_graphimpl.H +++ /dev/null @@ -1,18 +0,0 @@ -//Question? 
email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 07-21-2017 - -#include -#include -using namespace std; -#include - -namespace amrex{ - - template - void ReductionSum_impl(T *local, T *global, int length, int root){ - for(int i=0; i -#include -using namespace std; -#include - -namespace amrex{ - - template void LocalAtomicAdd_impl(T *addr, T val){ - *addr+= val; - } - template void GlobalAtomicAdd_impl(T *addr, T val){ - *addr+= val; - } - -}//end namespace - diff --git a/Src/AmrTask/rts_impls/Utils/dl_malloc.c b/Src/AmrTask/rts_impls/Utils/dl_malloc.c deleted file mode 100644 index 27bce62d3ed..00000000000 --- a/Src/AmrTask/rts_impls/Utils/dl_malloc.c +++ /dev/null @@ -1,6326 +0,0 @@ -/* - This is a version (aka dlmalloc) of malloc/free/realloc written by - Doug Lea and released to the public domain, as explained at - http://creativecommons.org/publicdomain/zero/1.0/ Send questions, - comments, complaints, performance data, etc to dl@cs.oswego.edu - -* Version 2.8.6 Wed Aug 29 06:57:58 2012 Doug Lea - Note: There may be an updated version of this malloc obtainable at - ftp://gee.cs.oswego.edu/pub/misc/malloc.c - Check before installing! - -* Quickstart - - This library is all in one file to simplify the most common usage: - ftp it, compile it (-O3), and link it into another program. All of - the compile-time options default to reasonable values for use on - most platforms. You might later want to step through various - compile-time and dynamic tuning options. - - For convenience, an include file for code using this malloc is at: - ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.6.h - You don't really need this .h file unless you call functions not - defined in your system include files. The .h file contains only the - excerpts from this file needed for using this malloc on ANSI C/C++ - systems, so long as you haven't changed compile-time options about - naming and tuning parameters. If you do, then you can create your - own malloc.h that does include all settings by cutting at the point - indicated below. Note that you may already by default be using a C - library containing a malloc that is based on some version of this - malloc (for example in linux). You might still want to use the one - in this file to customize settings or to avoid overheads associated - with library versions. - -* Vital statistics: - - Supported pointer/size_t representation: 4 or 8 bytes - size_t MUST be an unsigned type of the same width as - pointers. (If you are using an ancient system that declares - size_t as a signed type, or need it to be a different width - than pointers, you can use a previous release of this malloc - (e.g. 2.7.2) supporting these.) - - Alignment: 8 bytes (minimum) - This suffices for nearly all current machines and C compilers. - However, you can define MALLOC_ALIGNMENT to be wider than this - if necessary (up to 128bytes), at the expense of using more space. - - Minimum overhead per allocated chunk: 4 or 8 bytes (if 4byte sizes) - 8 or 16 bytes (if 8byte sizes) - Each malloced chunk has a hidden word of overhead holding size - and status information, and additional cross-check word - if FOOTERS is defined. - - Minimum allocated size: 4-byte ptrs: 16 bytes (including overhead) - 8-byte ptrs: 32 bytes (including overhead) - - Even a request for zero bytes (i.e., malloc(0)) returns a - pointer to something of the minimum allocatable size. 
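The two serial implementation headers above are mangled by the same angle-bracket stripping: the reduction loop survives only as "for(int i=0; i". A plausible reconstruction of the three single-rank implementations, assuming the serial "sum" simply copies the local buffer (consistent with one process owning all the data):

```cpp
namespace amrex {
    // Serial ReductionSum: with one rank, the global sum is just the local data.
    template <typename T>
    void ReductionSum_impl(T* local, T* global, int length, int /*root*/) {
        for (int i = 0; i < length; i++) global[i] = local[i];
    }
    // Serial atomic adds: no concurrency, so plain additions suffice.
    template <typename T> void LocalAtomicAdd_impl(T* addr, T val)  { *addr += val; }
    template <typename T> void GlobalAtomicAdd_impl(T* addr, T val) { *addr += val; }
}

int main()
{
    double local[2] = {1.5, 2.5}, global[2];
    amrex::ReductionSum_impl(local, global, 2, 0);
    amrex::LocalAtomicAdd_impl(&global[1], 1.0);   // global[1] == 3.5
    return global[1] == 3.5 ? 0 : 1;
}
```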
- The maximum overhead wastage (i.e., number of extra bytes - allocated than were requested in malloc) is less than or equal - to the minimum size, except for requests >= mmap_threshold that - are serviced via mmap(), where the worst case wastage is about - 32 bytes plus the remainder from a system page (the minimal - mmap unit); typically 4096 or 8192 bytes. - - Security: static-safe; optionally more or less - The "security" of malloc refers to the ability of malicious - code to accentuate the effects of errors (for example, freeing - space that is not currently malloc'ed or overwriting past the - ends of chunks) in code that calls malloc. This malloc - guarantees not to modify any memory locations below the base of - heap, i.e., static variables, even in the presence of usage - errors. The routines additionally detect most improper frees - and reallocs. All this holds as long as the static bookkeeping - for malloc itself is not corrupted by some other means. This - is only one aspect of security -- these checks do not, and - cannot, detect all possible programming errors. - - If FOOTERS is defined nonzero, then each allocated chunk - carries an additional check word to verify that it was malloced - from its space. These check words are the same within each - execution of a program using malloc, but differ across - executions, so externally crafted fake chunks cannot be - freed. This improves security by rejecting frees/reallocs that - could corrupt heap memory, in addition to the checks preventing - writes to statics that are always on. This may further improve - security at the expense of time and space overhead. (Note that - FOOTERS may also be worth using with MSPACES.) - - By default detected errors cause the program to abort (calling - "abort()"). You can override this to instead proceed past - errors by defining PROCEED_ON_ERROR. In this case, a bad free - has no effect, and a malloc that encounters a bad address - caused by user overwrites will ignore the bad address by - dropping pointers and indices to all known memory. This may - be appropriate for programs that should continue if at all - possible in the face of programming errors, although they may - run out of memory because dropped memory is never reclaimed. - - If you don't like either of these options, you can define - CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything - else. And if if you are sure that your program using malloc has - no errors or vulnerabilities, you can define INSECURE to 1, - which might (or might not) provide a small performance improvement. - - It is also possible to limit the maximum total allocatable - space, using malloc_set_footprint_limit. This is not - designed as a security feature in itself (calls to set limits - are not screened or privileged), but may be useful as one - aspect of a secure implementation. - - Thread-safety: NOT thread-safe unless USE_LOCKS defined non-zero - When USE_LOCKS is defined, each public call to malloc, free, - etc is surrounded with a lock. By default, this uses a plain - pthread mutex, win32 critical section, or a spin-lock if if - available for the platform and not disabled by setting - USE_SPIN_LOCKS=0. However, if USE_RECURSIVE_LOCKS is defined, - recursive versions are used instead (which are not required for - base functionality but may be needed in layered extensions). - Using a global lock is not especially fast, and can be a major - bottleneck. 
It is designed only to provide minimal protection - in concurrent environments, and to provide a basis for - extensions. If you are using malloc in a concurrent program, - consider instead using nedmalloc - (http://www.nedprod.com/programs/portable/nedmalloc/) or - ptmalloc (See http://www.malloc.de), which are derived from - versions of this malloc. - - System requirements: Any combination of MORECORE and/or MMAP/MUNMAP - This malloc can use unix sbrk or any emulation (invoked using - the CALL_MORECORE macro) and/or mmap/munmap or any emulation - (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system - memory. On most unix systems, it tends to work best if both - MORECORE and MMAP are enabled. On Win32, it uses emulations - based on VirtualAlloc. It also uses common C library functions - like memset. - - Compliance: I believe it is compliant with the Single Unix Specification - (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably - others as well. - -* Overview of algorithms - - This is not the fastest, most space-conserving, most portable, or - most tunable malloc ever written. However it is among the fastest - while also being among the most space-conserving, portable and - tunable. Consistent balance across these factors results in a good - general-purpose allocator for malloc-intensive programs. - - In most ways, this malloc is a best-fit allocator. Generally, it - chooses the best-fitting existing chunk for a request, with ties - broken in approximately least-recently-used order. (This strategy - normally maintains low fragmentation.) However, for requests less - than 256bytes, it deviates from best-fit when there is not an - exactly fitting available chunk by preferring to use space adjacent - to that used for the previous small request, as well as by breaking - ties in approximately most-recently-used order. (These enhance - locality of series of small allocations.) And for very large requests - (>= 256Kb by default), it relies on system memory mapping - facilities, if supported. (This helps avoid carrying around and - possibly fragmenting memory used only for large chunks.) - - All operations (except malloc_stats and mallinfo) have execution - times that are bounded by a constant factor of the number of bits in - a size_t, not counting any clearing in calloc or copying in realloc, - or actions surrounding MORECORE and MMAP that have times - proportional to the number of non-contiguous regions returned by - system allocation routines, which is often just 1. In real-time - applications, you can optionally suppress segment traversals using - NO_SEGMENT_TRAVERSAL, which assures bounded execution even when - system allocators return non-contiguous spaces, at the typical - expense of carrying around more memory and increased fragmentation. - - The implementation is not very modular and seriously overuses - macros. Perhaps someday all C compilers will do as good a job - inlining modular code as can now be done by brute-force expansion, - but now, enough of them seem not to. - - Some compilers issue a lot of warnings about code that is - dead/unreachable only on some platforms, and also about intentional - uses of negation on unsigned types. All known cases of each can be - ignored. - - For a longer but out of date high-level description, see - http://gee.cs.oswego.edu/dl/html/malloc.html - -* MSPACES - If MSPACES is defined, then in addition to malloc, free, etc., - this file also defines mspace_malloc, mspace_free, etc. 
These - are versions of malloc routines that take an "mspace" argument - obtained using create_mspace, to control all internal bookkeeping. - If ONLY_MSPACES is defined, only these versions are compiled. - So if you would like to use this allocator for only some allocations, - and your system malloc for others, you can compile with - ONLY_MSPACES and then do something like... - static mspace mymspace = create_mspace(0,0); // for example - #define mymalloc(bytes) mspace_malloc(mymspace, bytes) - - (Note: If you only need one instance of an mspace, you can instead - use "USE_DL_PREFIX" to relabel the global malloc.) - - You can similarly create thread-local allocators by storing - mspaces as thread-locals. For example: - static __thread mspace tlms = 0; - void* tlmalloc(size_t bytes) { - if (tlms == 0) tlms = create_mspace(0, 0); - return mspace_malloc(tlms, bytes); - } - void tlfree(void* mem) { mspace_free(tlms, mem); } - - Unless FOOTERS is defined, each mspace is completely independent. - You cannot allocate from one and free to another (although - conformance is only weakly checked, so usage errors are not always - caught). If FOOTERS is defined, then each chunk carries around a tag - indicating its originating mspace, and frees are directed to their - originating spaces. Normally, this requires use of locks. - - ------------------------- Compile-time options --------------------------- - -Be careful in setting #define values for numerical constants of type -size_t. On some systems, literal values are not automatically extended -to size_t precision unless they are explicitly casted. You can also -use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below. - -WIN32 default: defined if _WIN32 defined - Defining WIN32 sets up defaults for MS environment and compilers. - Otherwise defaults are for unix. Beware that there seem to be some - cases where this malloc might not be a pure drop-in replacement for - Win32 malloc: Random-looking failures from Win32 GDI API's (eg; - SetDIBits()) may be due to bugs in some video driver implementations - when pixel buffers are malloc()ed, and the region spans more than - one VirtualAlloc()ed region. Because dlmalloc uses a small (64Kb) - default granularity, pixel buffers may straddle virtual allocation - regions more often than when using the Microsoft allocator. You can - avoid this by using VirtualAlloc() and VirtualFree() for all pixel - buffers rather than using malloc(). If this is not possible, - recompile this malloc with a larger DEFAULT_GRANULARITY. Note: - in cases where MSC and gcc (cygwin) are known to differ on WIN32, - conditions use _MSC_VER to distinguish them. - -DLMALLOC_EXPORT default: extern - Defines how public APIs are declared. If you want to export via a - Windows DLL, you might define this as - #define DLMALLOC_EXPORT extern __declspec(dllexport) - If you want a POSIX ELF shared object, you might use - #define DLMALLOC_EXPORT extern __attribute__((visibility("default"))) - -MALLOC_ALIGNMENT default: (size_t)(2 * sizeof(void *)) - Controls the minimum alignment for malloc'ed chunks. It must be a - power of two and at least 8, even on machines for which smaller - alignments would suffice. It may be defined as larger than this - though. Note however that code and data structures are optimized for - the case of 8-byte alignment. - -MSPACES default: 0 (false) - If true, compile in support for independent allocation spaces. - This is only supported if HAVE_MMAP is true. 
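To make the MSPACES description above concrete: a minimal round trip with the documented mspace API, using the companion header the comment points to (malloc-2.8.6.h). This is a sketch assuming the library was built with mspace support, e.g. with -DONLY_MSPACES=1 as the MPI_Generic Makefile in this diff does:

```cpp
#include <cstddef>
#include "malloc-2.8.6.h"  // or hand-declare the mspace prototypes from dl_malloc.c

int main()
{
    mspace ms = create_mspace(0, 0);       // default capacity, no locking
    void*  p  = mspace_malloc(ms, 1024);   // all bookkeeping stays inside ms
    if (p) mspace_free(ms, p);             // must be freed to the same mspace
    destroy_mspace(ms);                    // releases everything ms still holds
    return 0;
}
```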
-
-ONLY_MSPACES default: 0 (false)
- If true, only compile in mspace versions, not regular versions.
-
-USE_LOCKS default: 0 (false)
- Causes each call to each public routine to be surrounded with
- pthread or WIN32 mutex lock/unlock. (If set true, this can be
- overridden on a per-mspace basis for mspace versions.) If set to a
- non-zero value other than 1, locks are used, but their
- implementation is left out, so lock functions must be supplied manually,
- as described below.
-
-USE_SPIN_LOCKS default: 1 iff USE_LOCKS and spin locks available
- If true, uses custom spin locks for locking. This is currently
- supported only for gcc >= 4.1, older gccs on x86 platforms, and recent
- MS compilers. Otherwise, posix locks or win32 critical sections are
- used.
-
-USE_RECURSIVE_LOCKS default: not defined
- If defined nonzero, uses recursive (aka reentrant) locks, otherwise
- uses plain mutexes. This is not required for malloc proper, but may
- be needed for layered allocators such as nedmalloc.
-
-LOCK_AT_FORK default: not defined
- If defined nonzero, performs pthread_atfork upon initialization
- to initialize child lock while holding parent lock. The implementation
- assumes that pthread locks (not custom locks) are being used. In other
- cases, you may need to customize the implementation.
-
-FOOTERS default: 0
- If true, provide extra checking and dispatching by placing
- information in the footers of allocated chunks. This adds
- space and time overhead.
-
-INSECURE default: 0
- If true, omit checks for usage errors and heap space overwrites.
-
-USE_DL_PREFIX default: NOT defined
- Causes compiler to prefix all public routines with the string 'dl'.
- This can be useful when you only want to use this malloc in one part
- of a program, using your regular system malloc elsewhere.
-
-MALLOC_INSPECT_ALL default: NOT defined
- If defined, compiles malloc_inspect_all and mspace_inspect_all, which
- perform traversal of all heap space. Unless access to these
- functions is otherwise restricted, you probably do not want to
- include them in secure implementations.
-
-ABORT default: defined as abort()
- Defines how to abort on failed checks. On most systems, a failed
- check cannot die with an "assert" or even print an informative
- message, because the underlying print routines in turn call malloc,
- which will fail again. Generally, the best policy is to simply call
- abort(). It's not very useful to do more than this because many
- errors due to overwriting will show up as address faults (null, odd
- addresses etc) rather than malloc-triggered checks, so will also
- abort. Also, most compilers know that abort() does not return, so
- can better optimize code conditionally calling it.
-
-PROCEED_ON_ERROR default: defined as 0 (false)
- Controls whether detected bad addresses cause them to be bypassed
- rather than aborting. If set, detected bad arguments to free and
- realloc are ignored. And all bookkeeping information is zeroed out
- upon a detected overwrite of freed heap space, thus losing the
- ability to ever return it from malloc again, but enabling the
- application to proceed. If PROCEED_ON_ERROR is defined, the
- static variable malloc_corruption_error_count is compiled in
- and can be examined to see if errors have occurred. This option
- generates slower code than the default abort policy.
-
-DEBUG default: NOT defined
- The DEBUG setting is mainly intended for people trying to modify
- this code or diagnose problems when porting to new platforms.
-
- However, it may also be able to better isolate user errors than just
- using runtime checks. The assertions in the check routines spell
- out in more detail the assumptions and invariants underlying the
- algorithms. The checking is fairly extensive, and will slow down
- execution noticeably. Calling malloc_stats or mallinfo with DEBUG
- set will attempt to check every non-mmapped allocated and free chunk
- in the course of computing the summaries.
-
-ABORT_ON_ASSERT_FAILURE default: defined as 1 (true)
- Debugging assertion failures can be nearly impossible if your
- version of the assert macro causes malloc to be called, which will
- lead to a cascade of further failures, blowing the runtime stack.
- ABORT_ON_ASSERT_FAILURE causes assertion failures to call abort(),
- which will usually make debugging easier.
-
-MALLOC_FAILURE_ACTION default: sets errno to ENOMEM, or no-op on win32
- The action to take before "return 0" when malloc fails because no
- memory is available.
-
-HAVE_MORECORE default: 1 (true) unless win32 or ONLY_MSPACES
- True if this system supports sbrk or an emulation of it.
-
-MORECORE default: sbrk
- The name of the sbrk-style system routine to call to obtain more
- memory. See below for guidance on writing custom MORECORE
- functions. The type of the argument to sbrk/MORECORE varies across
- systems. It cannot be size_t, because it supports negative
- arguments, so it is normally the signed type of the same width as
- size_t (sometimes declared as "intptr_t"). It doesn't much matter
- though. Internally, we only call it with arguments less than half
- the max value of a size_t, which should work across all reasonable
- possibilities, although sometimes generating compiler warnings.
-
-MORECORE_CONTIGUOUS default: 1 (true) if HAVE_MORECORE
- If true, take advantage of the fact that consecutive calls to MORECORE
- with positive arguments always return contiguous increasing
- addresses. This is true of unix sbrk. It does not hurt too much to
- set it true anyway, since malloc copes with non-contiguities.
- Setting it false when definitely non-contiguous saves the time
- and possibly wasted space it would otherwise take to discover this, though.
-
-MORECORE_CANNOT_TRIM default: NOT defined
- True if MORECORE cannot release space back to the system when given
- negative arguments. This is generally necessary only if you are
- using a hand-crafted MORECORE function that cannot handle negative
- arguments.
-
-NO_SEGMENT_TRAVERSAL default: 0
- If non-zero, suppresses traversals of memory segments
- returned by either MORECORE or CALL_MMAP. This disables
- merging of contiguous segments, and selectively
- releasing them to the OS if unused, but bounds execution times.
-
-HAVE_MMAP default: 1 (true)
- True if this system supports mmap or an emulation of it. If so, and
- HAVE_MORECORE is not true, MMAP is used for all system
- allocation. If set and HAVE_MORECORE is true as well, MMAP is
- primarily used to directly allocate very large blocks. It is also
- used as a backup strategy in cases where MORECORE fails to provide
- space from the system. Note: A single call to MUNMAP is assumed to be
- able to unmap memory that may have been allocated using multiple calls
- to MMAP, so long as they are adjacent.
-
-HAVE_MREMAP default: 1 on linux, else 0
- If true, realloc() uses mremap() to re-allocate large blocks and
- extend or shrink allocation spaces.
-
-MMAP_CLEARS default: 1 except on WINCE.
- True if mmap clears memory so calloc doesn't need to. This is true
- for standard unix mmap using /dev/zero and on WIN32 except for WINCE.
-
-USE_BUILTIN_FFS default: 0 (i.e., not used)
- Causes malloc to use the builtin ffs() function to compute indices.
- Some compilers may recognize and intrinsify ffs to be faster than the
- supplied C version. Also, the case of x86 using gcc is special-cased
- to an asm instruction, so is already as fast as it can be, and so
- this setting has no effect. Similarly for Win32 under recent MS compilers.
- (On most x86s, the asm version is only slightly faster than the C version.)
-
-malloc_getpagesize default: derive from system includes, or 4096.
- The system page size. To the extent possible, this malloc manages
- memory from the system in page-size units. This may be (and
- usually is) a function rather than a constant. This is ignored
- if WIN32, where page size is determined using GetSystemInfo during
- initialization.
-
-USE_DEV_RANDOM default: 0 (i.e., not used)
- Causes malloc to use /dev/random to initialize secure magic seed for
- stamping footers. Otherwise, the current time is used.
-
-NO_MALLINFO default: 0
- If defined, don't compile "mallinfo". This can be a simple way
- of dealing with mismatches between system declarations and
- those in this file.
-
-MALLINFO_FIELD_TYPE default: size_t
- The type of the fields in the mallinfo struct. This was originally
- defined as "int" in SVID etc, but is more usefully defined as
- size_t. The value is used only if HAVE_USR_INCLUDE_MALLOC_H is not set.
-
-NO_MALLOC_STATS default: 0
- If defined, don't compile "malloc_stats". This avoids calls to
- fprintf and bringing in stdio dependencies you might not want.
-
-REALLOC_ZERO_BYTES_FREES default: not defined
- This should be set if a call to realloc with zero bytes should
- be the same as a call to free. Some people think it should. Otherwise,
- since this malloc returns a unique pointer for malloc(0), so does
- realloc(p, 0).
-
-LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H
-LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H, LACKS_ERRNO_H
-LACKS_STDLIB_H LACKS_SCHED_H LACKS_TIME_H default: NOT defined unless on WIN32
- Define these if your system does not have these header files.
- You might need to manually insert some of the declarations they provide.
-
-DEFAULT_GRANULARITY default: page size if MORECORE_CONTIGUOUS,
- system_info.dwAllocationGranularity in WIN32,
- otherwise 64K.
- Also settable using mallopt(M_GRANULARITY, x)
- The unit for allocating and deallocating memory from the system. On
- most systems with contiguous MORECORE, there is no reason to
- make this more than a page. However, systems with MMAP tend to
- either require or encourage larger granularities. You can increase
- this value to prevent system allocation functions from being called so
- often, especially if they are slow. The value must be at least one
- page and must be a power of two. Setting to 0 causes initialization
- to either page size or win32 region size. (Note: In previous
- versions of malloc, the equivalent of this option was called
- "TOP_PAD")
-
-DEFAULT_TRIM_THRESHOLD default: 2MB
- Also settable using mallopt(M_TRIM_THRESHOLD, x)
- The maximum amount of unused top-most memory to keep before
- releasing via malloc_trim in free(). Automatic trimming is mainly
- useful in long-lived programs using contiguous MORECORE. Because
- trimming via sbrk can be slow on some systems, and can sometimes be
- wasteful (in cases where programs immediately afterward allocate
- more large chunks), the value should be high enough so that your
- overall system performance would improve by releasing this much
- memory. As a rough guide, you might set it to a value close to the
- average size of a process (program) running on your system.
- Releasing this much memory would allow such a process to run in
- memory. Generally, it is worth tuning trim thresholds when a
- program undergoes phases where several large chunks are allocated
- and released in ways that can reuse each other's storage, perhaps
- mixed with phases where there are no such chunks at all. The trim
- value must be greater than page size to have any useful effect. To
- disable trimming completely, you can set it to MAX_SIZE_T. Note that the trick
- some people use of mallocing a huge space and then freeing it at
- program startup, in an attempt to reserve system memory, doesn't
- have the intended effect under automatic trimming, since that memory
- will immediately be returned to the system.
-
-DEFAULT_MMAP_THRESHOLD default: 256K
- Also settable using mallopt(M_MMAP_THRESHOLD, x)
- The request size threshold for using MMAP to directly service a
- request. Requests of at least this size that cannot be allocated
- using already-existing space will be serviced via mmap. (If enough
- normal freed space already exists it is used instead.) Using mmap
- segregates relatively large chunks of memory so that they can be
- individually obtained and released from the host system. A request
- serviced through mmap is never reused by any other request (at least
- not directly; the system may just so happen to remap successive
- requests to the same locations). Segregating space in this way has
- the benefits that: Mmapped space can always be individually released
- back to the system, which helps keep the system level memory demands
- of a long-lived program low. Also, mapped memory doesn't become
- `locked' between other chunks, as can happen with normally allocated
- chunks, which means that even trimming via malloc_trim would not
- release them. However, it has the disadvantage that the space
- cannot be reclaimed, consolidated, and then used to service later
- requests, as happens with normal chunks. The advantages of mmap
- nearly always outweigh the disadvantages for "large" chunks, but the
- value of "large" may vary across systems. The default is an
- empirically derived value that works well in most systems. You can
- disable mmap by setting this to MAX_SIZE_T.
-
-MAX_RELEASE_CHECK_RATE default: 4095 unless not HAVE_MMAP
- The number of consolidated frees between checks to release
- unused segments when freeing. When using non-contiguous segments,
- especially with multiple mspaces, checking only for topmost space
- doesn't always suffice to trigger trimming. To compensate for this,
- free() will, with a period of MAX_RELEASE_CHECK_RATE (or the
- current number of segments, if greater) try to release unused
- segments to the OS when freeing chunks that result in
- consolidation. The best value for this parameter is a compromise
- between slowing down frees with relatively costly checks that
- rarely trigger versus holding on to unused memory. To effectively
- disable, set to MAX_SIZE_T. This may lead to a very slight speed
- improvement at the expense of carrying around more memory.
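-
- To make the tuning described above concrete, here is a sketch of a
- startup routine for a program with large transient allocations. The
- values are illustrative assumptions only, not recommendations, and
- this assumes the default (unprefixed) public names:
-
-   void tune_allocator_example(void) {  // call once, before heavy use
-     mallopt(M_TRIM_THRESHOLD, 8*1024*1024); // keep up to 8MB before trimming
-     mallopt(M_MMAP_THRESHOLD, 1024*1024);   // mmap requests of 1MB or more
-     mallopt(M_GRANULARITY,    128*1024);    // grow the heap 128KB at a time
-   }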
-*/
-
-/* Version identifier to allow people to support multiple versions */
-#ifndef DLMALLOC_VERSION
-#define DLMALLOC_VERSION 20806
-#endif /* DLMALLOC_VERSION */
-
-#ifndef DLMALLOC_EXPORT
-#define DLMALLOC_EXPORT extern
-#endif
-
-#ifndef WIN32
-#ifdef _WIN32
-#define WIN32 1
-#endif /* _WIN32 */
-#ifdef _WIN32_WCE
-#define LACKS_FCNTL_H
-#define WIN32 1
-#endif /* _WIN32_WCE */
-#endif /* WIN32 */
-#ifdef WIN32
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-#include <tchar.h>
-#define HAVE_MMAP 1
-#define HAVE_MORECORE 0
-#define LACKS_UNISTD_H
-#define LACKS_SYS_PARAM_H
-#define LACKS_SYS_MMAN_H
-#define LACKS_STRING_H
-#define LACKS_STRINGS_H
-#define LACKS_SYS_TYPES_H
-#define LACKS_ERRNO_H
-#define LACKS_SCHED_H
-#ifndef MALLOC_FAILURE_ACTION
-#define MALLOC_FAILURE_ACTION
-#endif /* MALLOC_FAILURE_ACTION */
-#ifndef MMAP_CLEARS
-#ifdef _WIN32_WCE /* WINCE reportedly does not clear */
-#define MMAP_CLEARS 0
-#else
-#define MMAP_CLEARS 1
-#endif /* _WIN32_WCE */
-#endif /* MMAP_CLEARS */
-#endif /* WIN32 */
-
-#if defined(DARWIN) || defined(_DARWIN)
-/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */
-#ifndef HAVE_MORECORE
-#define HAVE_MORECORE 0
-#define HAVE_MMAP 1
-/* OSX allocators provide 16 byte alignment */
-#ifndef MALLOC_ALIGNMENT
-#define MALLOC_ALIGNMENT ((size_t)16U)
-#endif
-#endif /* HAVE_MORECORE */
-#endif /* DARWIN */
-
-#ifndef LACKS_SYS_TYPES_H
-#include <sys/types.h> /* For size_t */
-#endif /* LACKS_SYS_TYPES_H */
-
-/* The maximum possible size_t value has all bits set */
-#define MAX_SIZE_T (~(size_t)0)
-
-#ifndef USE_LOCKS /* ensure true if spin or recursive locks set */
-#define USE_LOCKS ((defined(USE_SPIN_LOCKS) && USE_SPIN_LOCKS != 0) || \
- (defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0))
-#endif /* USE_LOCKS */
-
-#if USE_LOCKS /* Spin locks for gcc >= 4.1, older gcc on x86, MSC >= 1310 */
-#if ((defined(__GNUC__) && \
- ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) || \
- defined(__i386__) || defined(__x86_64__))) || \
- (defined(_MSC_VER) && _MSC_VER>=1310))
-#ifndef USE_SPIN_LOCKS
-#define USE_SPIN_LOCKS 1
-#endif /* USE_SPIN_LOCKS */
-#elif USE_SPIN_LOCKS
-#error "USE_SPIN_LOCKS defined without implementation"
-#endif /* ... locks available...
*/ -#elif !defined(USE_SPIN_LOCKS) -#define USE_SPIN_LOCKS 0 -#endif /* USE_LOCKS */ - -#ifndef ONLY_MSPACES -#define ONLY_MSPACES 0 -#endif /* ONLY_MSPACES */ -#ifndef MSPACES -#if ONLY_MSPACES -#define MSPACES 1 -#else /* ONLY_MSPACES */ -#define MSPACES 0 -#endif /* ONLY_MSPACES */ -#endif /* MSPACES */ -#ifndef MALLOC_ALIGNMENT -#define MALLOC_ALIGNMENT ((size_t)(2 * sizeof(void *))) -#endif /* MALLOC_ALIGNMENT */ -#ifndef FOOTERS -#define FOOTERS 0 -#endif /* FOOTERS */ -#ifndef ABORT -#define ABORT abort() -#endif /* ABORT */ -#ifndef ABORT_ON_ASSERT_FAILURE -#define ABORT_ON_ASSERT_FAILURE 1 -#endif /* ABORT_ON_ASSERT_FAILURE */ -#ifndef PROCEED_ON_ERROR -#define PROCEED_ON_ERROR 0 -#endif /* PROCEED_ON_ERROR */ - -#ifndef INSECURE -#define INSECURE 0 -#endif /* INSECURE */ -#ifndef MALLOC_INSPECT_ALL -#define MALLOC_INSPECT_ALL 0 -#endif /* MALLOC_INSPECT_ALL */ -#ifndef HAVE_MMAP -#define HAVE_MMAP 1 -#endif /* HAVE_MMAP */ -#ifndef MMAP_CLEARS -#define MMAP_CLEARS 1 -#endif /* MMAP_CLEARS */ -#ifndef HAVE_MREMAP -#ifdef linux -#define HAVE_MREMAP 1 -#define _GNU_SOURCE /* Turns on mremap() definition */ -#else /* linux */ -#define HAVE_MREMAP 0 -#endif /* linux */ -#endif /* HAVE_MREMAP */ -#ifndef MALLOC_FAILURE_ACTION -#define MALLOC_FAILURE_ACTION errno = ENOMEM; -#endif /* MALLOC_FAILURE_ACTION */ -#ifndef HAVE_MORECORE -#if ONLY_MSPACES -#define HAVE_MORECORE 0 -#else /* ONLY_MSPACES */ -#define HAVE_MORECORE 1 -#endif /* ONLY_MSPACES */ -#endif /* HAVE_MORECORE */ -#if !HAVE_MORECORE -#define MORECORE_CONTIGUOUS 0 -#else /* !HAVE_MORECORE */ -#define MORECORE_DEFAULT sbrk -#ifndef MORECORE_CONTIGUOUS -#define MORECORE_CONTIGUOUS 1 -#endif /* MORECORE_CONTIGUOUS */ -#endif /* HAVE_MORECORE */ -#ifndef DEFAULT_GRANULARITY -#if (MORECORE_CONTIGUOUS || defined(WIN32)) -#define DEFAULT_GRANULARITY (0) /* 0 means to compute in init_mparams */ -#else /* MORECORE_CONTIGUOUS */ -#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U) -#endif /* MORECORE_CONTIGUOUS */ -#endif /* DEFAULT_GRANULARITY */ -#ifndef DEFAULT_TRIM_THRESHOLD -#ifndef MORECORE_CANNOT_TRIM -#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U) -#else /* MORECORE_CANNOT_TRIM */ -#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T -#endif /* MORECORE_CANNOT_TRIM */ -#endif /* DEFAULT_TRIM_THRESHOLD */ -#ifndef DEFAULT_MMAP_THRESHOLD -#if HAVE_MMAP -#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U) -#else /* HAVE_MMAP */ -#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T -#endif /* HAVE_MMAP */ -#endif /* DEFAULT_MMAP_THRESHOLD */ -#ifndef MAX_RELEASE_CHECK_RATE -#if HAVE_MMAP -#define MAX_RELEASE_CHECK_RATE 4095 -#else -#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T -#endif /* HAVE_MMAP */ -#endif /* MAX_RELEASE_CHECK_RATE */ -#ifndef USE_BUILTIN_FFS -#define USE_BUILTIN_FFS 0 -#endif /* USE_BUILTIN_FFS */ -#ifndef USE_DEV_RANDOM -#define USE_DEV_RANDOM 0 -#endif /* USE_DEV_RANDOM */ -#ifndef NO_MALLINFO -#define NO_MALLINFO 0 -#endif /* NO_MALLINFO */ -#ifndef MALLINFO_FIELD_TYPE -#define MALLINFO_FIELD_TYPE size_t -#endif /* MALLINFO_FIELD_TYPE */ -#ifndef NO_MALLOC_STATS -#define NO_MALLOC_STATS 0 -#endif /* NO_MALLOC_STATS */ -#ifndef NO_SEGMENT_TRAVERSAL -#define NO_SEGMENT_TRAVERSAL 0 -#endif /* NO_SEGMENT_TRAVERSAL */ - -/* - mallopt tuning options. SVID/XPG defines four standard parameter - numbers for mallopt, normally defined in malloc.h. None of these - are used in this malloc, so setting them has no effect. But this - malloc does support the following options. 
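-
- For example, a long-lived program that prefers to retain memory
- rather than trim can disable trimming at startup (a sketch; as
- documented below, mallopt specially treats an argument of -1 as the
- maximum size_t value):
-
-   mallopt(M_TRIM_THRESHOLD, -1); // never release topmost space to the system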
-*/
-
-#define M_TRIM_THRESHOLD (-1)
-#define M_GRANULARITY (-2)
-#define M_MMAP_THRESHOLD (-3)
-
-/* ------------------------ Mallinfo declarations ------------------------ */
-
-#if !NO_MALLINFO
-/*
- This version of malloc supports the standard SVID/XPG mallinfo
- routine that returns a struct containing usage properties and
- statistics. It should work on any system that has a
- /usr/include/malloc.h defining struct mallinfo. The main
- declaration needed is the mallinfo struct that is returned (by-copy)
- by mallinfo(). The mallinfo struct contains a bunch of fields that
- are not even meaningful in this version of malloc. These fields
- are instead filled by mallinfo() with other numbers that might be of
- interest.
-
- HAVE_USR_INCLUDE_MALLOC_H should be set if you have a
- /usr/include/malloc.h file that includes a declaration of struct
- mallinfo. If so, it is included; else a compliant version is
- declared below. These must be precisely the same for mallinfo() to
- work. The original SVID version of this struct, defined on most
- systems with mallinfo, declares all fields as ints. But some others
- define them as unsigned long. If your system defines the fields using a
- type of different width than listed here, you MUST #include your
- system version and #define HAVE_USR_INCLUDE_MALLOC_H.
-*/
-
-/* #define HAVE_USR_INCLUDE_MALLOC_H */
-
-#ifdef HAVE_USR_INCLUDE_MALLOC_H
-#include "/usr/include/malloc.h"
-#else /* HAVE_USR_INCLUDE_MALLOC_H */
-#ifndef STRUCT_MALLINFO_DECLARED
-/* HP-UX (and others?) redefines mallinfo unless _STRUCT_MALLINFO is defined */
-#define _STRUCT_MALLINFO
-#define STRUCT_MALLINFO_DECLARED 1
-struct mallinfo {
-  MALLINFO_FIELD_TYPE arena;    /* non-mmapped space allocated from system */
-  MALLINFO_FIELD_TYPE ordblks;  /* number of free chunks */
-  MALLINFO_FIELD_TYPE smblks;   /* always 0 */
-  MALLINFO_FIELD_TYPE hblks;    /* always 0 */
-  MALLINFO_FIELD_TYPE hblkhd;   /* space in mmapped regions */
-  MALLINFO_FIELD_TYPE usmblks;  /* maximum total allocated space */
-  MALLINFO_FIELD_TYPE fsmblks;  /* always 0 */
-  MALLINFO_FIELD_TYPE uordblks; /* total allocated space */
-  MALLINFO_FIELD_TYPE fordblks; /* total free space */
-  MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */
-};
-#endif /* STRUCT_MALLINFO_DECLARED */
-#endif /* HAVE_USR_INCLUDE_MALLOC_H */
-#endif /* NO_MALLINFO */
-
-/*
- Try to persuade compilers to inline. The most critical functions for
- inlining are defined as macros, so these aren't used for them.
-*/
-
-#ifndef FORCEINLINE
- #if defined(__GNUC__)
- #define FORCEINLINE __inline __attribute__ ((always_inline))
- #elif defined(_MSC_VER)
- #define FORCEINLINE __forceinline
- #endif
-#endif
-#ifndef NOINLINE
- #if defined(__GNUC__)
- #define NOINLINE __attribute__ ((noinline))
- #elif defined(_MSC_VER)
- #define NOINLINE __declspec(noinline)
- #else
- #define NOINLINE
- #endif
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#ifndef FORCEINLINE
- #define FORCEINLINE inline
-#endif
-#endif /* __cplusplus */
-#ifndef FORCEINLINE
- #define FORCEINLINE
-#endif
-
-#if !ONLY_MSPACES
-
-/* ------------------- Declarations of public routines ------------------- */
-
-#ifndef USE_DL_PREFIX
-#define dlcalloc calloc
-#define dlfree free
-#define dlmalloc malloc
-#define dlmemalign memalign
-#define dlposix_memalign posix_memalign
-#define dlrealloc realloc
-#define dlrealloc_in_place realloc_in_place
-#define dlvalloc valloc
-#define dlpvalloc pvalloc
-#define dlmallinfo mallinfo
-#define dlmallopt mallopt
-#define dlmalloc_trim malloc_trim
-#define dlmalloc_stats malloc_stats
-#define dlmalloc_usable_size malloc_usable_size
-#define dlmalloc_footprint malloc_footprint
-#define dlmalloc_max_footprint malloc_max_footprint
-#define dlmalloc_footprint_limit malloc_footprint_limit
-#define dlmalloc_set_footprint_limit malloc_set_footprint_limit
-#define dlmalloc_inspect_all malloc_inspect_all
-#define dlindependent_calloc independent_calloc
-#define dlindependent_comalloc independent_comalloc
-#define dlbulk_free bulk_free
-#endif /* USE_DL_PREFIX */
-
-/*
- malloc(size_t n)
- Returns a pointer to a newly allocated chunk of at least n bytes, or
- null if no space is available, in which case errno is set to ENOMEM
- on ANSI C systems.
-
- If n is zero, malloc returns a minimum-sized chunk. (The minimum
- size is 16 bytes on most 32bit systems, and 32 bytes on 64bit
- systems.) Note that size_t is an unsigned type, so calls with
- arguments that would be negative if signed are interpreted as
- requests for huge amounts of space, which will often fail. The
- maximum supported value of n differs across systems, but is in all
- cases less than the maximum representable value of a size_t.
-*/
-DLMALLOC_EXPORT void* dlmalloc(size_t);
-
-/*
- free(void* p)
- Releases the chunk of memory pointed to by p, that had been previously
- allocated using malloc or a related routine such as realloc.
- It has no effect if p is null. If p was not malloced or already
- freed, free(p) will by default cause the current program to abort.
-*/
-DLMALLOC_EXPORT void dlfree(void*);
-
-/*
- calloc(size_t n_elements, size_t element_size);
- Returns a pointer to n_elements * element_size bytes, with all locations
- set to zero.
-*/
-DLMALLOC_EXPORT void* dlcalloc(size_t, size_t);
-
-/*
- realloc(void* p, size_t n)
- Returns a pointer to a chunk of size n that contains the same data
- as does chunk p up to the minimum of (n, p's size) bytes, or null
- if no space is available.
-
- The returned pointer may or may not be the same as p. The algorithm
- prefers extending p in most cases when possible, otherwise it
- employs the equivalent of a malloc-copy-free sequence.
-
- If p is null, realloc is equivalent to malloc.
-
- If space is not available, realloc returns null, errno is set (if on
- ANSI) and p is NOT freed.
-
- If n is for fewer bytes than already held by p, the newly unused
- space is lopped off and freed if possible. realloc with a size
- argument of zero (re)allocates a minimum-sized chunk.
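-
- For example, a sketch of the failure-handling contract described
- above:
-
-   void* p = malloc(100);
-   void* q = realloc(p, 200);  // may move; if so, do not use the old p
-   if (q == 0)
-     free(p);                  // on failure, p was not freed and is still valid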
-
- The old unix realloc convention of allowing the last-free'd chunk
- to be used as an argument to realloc is not supported.
-*/
-DLMALLOC_EXPORT void* dlrealloc(void*, size_t);
-
-/*
- realloc_in_place(void* p, size_t n)
- Resizes the space allocated for p to size n, only if this can be
- done without moving p (i.e., only if there is adjacent space
- available if n is greater than p's current allocated size, or n is
- less than or equal to p's size). This may be used instead of plain
- realloc if an alternative allocation strategy is needed upon failure
- to expand space; for example, reallocation of a buffer that must be
- memory-aligned or cleared. You can use realloc_in_place to trigger
- these alternatives only when needed.
-
- Returns p if successful; otherwise null.
-*/
-DLMALLOC_EXPORT void* dlrealloc_in_place(void*, size_t);
-
-/*
- memalign(size_t alignment, size_t n);
- Returns a pointer to a newly allocated chunk of n bytes, aligned
- in accord with the alignment argument.
-
- The alignment argument should be a power of two. If the argument is
- not a power of two, the nearest greater power is used.
- 8-byte alignment is guaranteed by normal malloc calls, so don't
- bother calling memalign with an argument of 8 or less.
-
- Overreliance on memalign is a sure way to fragment space.
-*/
-DLMALLOC_EXPORT void* dlmemalign(size_t, size_t);
-
-/*
- int posix_memalign(void** pp, size_t alignment, size_t n);
- Allocates a chunk of n bytes, aligned in accord with the alignment
- argument. Differs from memalign only in that it (1) assigns the
- allocated memory to *pp rather than returning it, (2) fails and
- returns EINVAL if the alignment is not a power of two, (3) fails and
- returns ENOMEM if memory cannot be allocated.
-*/
-DLMALLOC_EXPORT int dlposix_memalign(void**, size_t, size_t);
-
-/*
- valloc(size_t n);
- Equivalent to memalign(pagesize, n), where pagesize is the page
- size of the system. If the pagesize is unknown, 4096 is used.
-*/
-DLMALLOC_EXPORT void* dlvalloc(size_t);
-
-/*
- mallopt(int parameter_number, int parameter_value)
- Sets tunable parameters. The format is to provide a
- (parameter-number, parameter-value) pair. mallopt then sets the
- corresponding parameter to the argument value if it can (i.e., so
- long as the value is meaningful), and returns 1 if successful else
- 0. To work around the fact that mallopt is specified to use int,
- not size_t parameters, the value -1 is specially treated as the
- maximum unsigned size_t value.
-
- SVID/XPG/ANSI defines four standard param numbers for mallopt,
- normally defined in malloc.h. None of these are used in this malloc,
- so setting them has no effect. But this malloc also supports other
- options in mallopt. See below for details. Briefly, supported
- parameters are as follows (listed defaults are for "typical"
- configurations).
-
- Symbol            param #  default       allowed param values
- M_TRIM_THRESHOLD     -1    2*1024*1024   any (-1 disables)
- M_GRANULARITY        -2    page size     any power of 2 >= page size
- M_MMAP_THRESHOLD     -3    256*1024      any (or 0 if no MMAP support)
-*/
-DLMALLOC_EXPORT int dlmallopt(int, int);
-
-/*
- malloc_footprint();
- Returns the number of bytes obtained from the system. The total
- number of bytes allocated by malloc, realloc etc., is less than this
- value. Unlike mallinfo, this function returns only a precomputed
- result, so can be called frequently to monitor memory consumption.
- Even if locks are otherwise defined, this function does not use them,
- so results might not be up to date.
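-
- For example, a simple monitoring hook might look like this (a
- sketch; report_bytes is a hypothetical logging helper):
-
-   void report_heap_size(void) {
-     report_bytes("heap footprint", malloc_footprint());
-   }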
-*/
-DLMALLOC_EXPORT size_t dlmalloc_footprint(void);
-
-/*
- malloc_max_footprint();
- Returns the maximum number of bytes obtained from the system. This
- value will be greater than current footprint if deallocated space
- has been reclaimed by the system. The peak number of bytes allocated
- by malloc, realloc etc., is less than this value. Unlike mallinfo,
- this function returns only a precomputed result, so can be called
- frequently to monitor memory consumption. Even if locks are
- otherwise defined, this function does not use them, so results might
- not be up to date.
-*/
-DLMALLOC_EXPORT size_t dlmalloc_max_footprint(void);
-
-/*
- malloc_footprint_limit();
- Returns the number of bytes that the heap is allowed to obtain from
- the system, returning the last value returned by
- malloc_set_footprint_limit, or the maximum size_t value if
- never set. The returned value reflects a permission. There is no
- guarantee that this number of bytes can actually be obtained from
- the system.
-*/
-DLMALLOC_EXPORT size_t dlmalloc_footprint_limit();
-
-/*
- malloc_set_footprint_limit();
- Sets the maximum number of bytes to obtain from the system, causing
- failure returns from malloc and related functions upon attempts to
- exceed this value. The argument value may be subject to page
- rounding to an enforceable limit; this actual value is returned.
- Using an argument of the maximum possible size_t effectively
- disables checks. If the argument is less than or equal to the
- current malloc_footprint, then all future allocations that require
- additional system memory will fail. However, invocation cannot
- retroactively deallocate existing used memory.
-*/
-DLMALLOC_EXPORT size_t dlmalloc_set_footprint_limit(size_t bytes);
-
-#if MALLOC_INSPECT_ALL
-/*
- malloc_inspect_all(void(*handler)(void *start,
-                                   void *end,
-                                   size_t used_bytes,
-                                   void* callback_arg),
-                    void* arg);
- Traverses the heap and calls the given handler for each managed
- region, skipping all bytes that are (or may be) used for bookkeeping
- purposes. Traversal does not include chunks that have been
- directly memory mapped. Each reported region begins at the start
- address, and continues up to but not including the end address. The
- first used_bytes of the region contain allocated data. If
- used_bytes is zero, the region is unallocated. The handler is
- invoked with the given callback argument. If locks are defined, they
- are held during the entire traversal. It is a bad idea to invoke
- other malloc functions from within the handler.
-
- For example, to count the number of in-use chunks with size greater
- than 1000, you could write:
-   static int count = 0;
-   void count_chunks(void* start, void* end, size_t used, void* arg) {
-     if (used >= 1000) ++count;
-   }
- then:
-   malloc_inspect_all(count_chunks, NULL);
-
- malloc_inspect_all is compiled only if MALLOC_INSPECT_ALL is defined.
-*/
-DLMALLOC_EXPORT void dlmalloc_inspect_all(void(*handler)(void*, void *, size_t, void*),
-                                          void* arg);
-
-#endif /* MALLOC_INSPECT_ALL */
-
-#if !NO_MALLINFO
-/*
- mallinfo()
- Returns (by copy) a struct containing various summary statistics:
-
- arena:     current total non-mmapped bytes allocated from system
- ordblks:   the number of free chunks
- smblks:    always zero.
- hblks:     current number of mmapped regions
- hblkhd:    total bytes held in mmapped regions
- usmblks:   the maximum total allocated space. This will be greater
-            than current total if trimming has occurred.
- fsmblks:   always zero
- uordblks:  current total allocated space (normal or mmapped)
- fordblks:  total free space
- keepcost:  the maximum number of bytes that could ideally be released
-            back to system via malloc_trim. ("ideally" means that
-            it ignores page restrictions etc.)
-
- Because these fields are ints, but internal bookkeeping may
- be kept as longs, the reported values may wrap around zero and
- thus be inaccurate.
-*/
-DLMALLOC_EXPORT struct mallinfo dlmallinfo(void);
-#endif /* NO_MALLINFO */
-
-/*
- independent_calloc(size_t n_elements, size_t element_size, void* chunks[]);
-
- independent_calloc is similar to calloc, but instead of returning a
- single cleared space, it returns an array of pointers to n_elements
- independent elements that can hold contents of size elem_size, each
- of which starts out cleared, and can be independently freed,
- realloc'ed etc. The elements are guaranteed to be adjacently
- allocated (this is not guaranteed to occur with multiple callocs or
- mallocs), which may also improve cache locality in some
- applications.
-
- The "chunks" argument is optional (i.e., may be null, which is
- probably the most typical usage). If it is null, the returned array
- is itself dynamically allocated and should also be freed when it is
- no longer needed. Otherwise, the chunks array must be of at least
- n_elements in length. It is filled in with the pointers to the
- chunks.
-
- In either case, independent_calloc returns this pointer array, or
- null if the allocation failed. If n_elements is zero and "chunks"
- is null, it returns a chunk representing an array with zero elements
- (which should be freed if not wanted).
-
- Each element must be freed when it is no longer needed. This can be
- done all at once using bulk_free.
-
- independent_calloc simplifies and speeds up implementations of many
- kinds of pools. It may also be useful when constructing large data
- structures that initially have a fixed number of fixed-sized nodes,
- but the number is not known at compile time, and some of the nodes
- may later need to be freed. For example:
-
-   struct Node { int item; struct Node* next; };
-
-   struct Node* build_list() {
-     struct Node** pool;
-     int n = read_number_of_nodes_needed();
-     int i;
-     if (n <= 0) return 0;
-     pool = (struct Node**)independent_calloc(n, sizeof(struct Node), 0);
-     if (pool == 0) die();
-     // organize into a linked list...
-     struct Node* first = pool[0];
-     for (i = 0; i < n-1; ++i)
-       pool[i]->next = pool[i+1];
-     free(pool); // Can now free the array (or not, if it is needed later)
-     return first;
-   }
-*/
-DLMALLOC_EXPORT void** dlindependent_calloc(size_t, size_t, void**);
-
-/*
- independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]);
-
- independent_comalloc allocates, all at once, a set of n_elements
- chunks with sizes indicated in the "sizes" array. It returns
- an array of pointers to these elements, each of which can be
- independently freed, realloc'ed etc. The elements are guaranteed to
- be adjacently allocated (this is not guaranteed to occur with
- multiple callocs or mallocs), which may also improve cache locality
- in some applications.
-
- The "chunks" argument is optional (i.e., may be null). If it is null
- the returned array is itself dynamically allocated and should also
- be freed when it is no longer needed. Otherwise, the chunks array
- must be of at least n_elements in length. It is filled in with the
- pointers to the chunks.
-
- In either case, independent_comalloc returns this pointer array, or
- null if the allocation failed. If n_elements is zero and chunks is
- null, it returns a chunk representing an array with zero elements
- (which should be freed if not wanted).
-
- Each element must be freed when it is no longer needed. This can be
- done all at once using bulk_free.
-
- independent_comalloc differs from independent_calloc in that each
- element may have a different size, and also that it does not
- automatically clear elements.
-
- independent_comalloc can be used to speed up allocation in cases
- where several structs or objects must always be allocated at the
- same time. For example:
-
-   struct Head { ... };
-   struct Foot { ... };
-
-   void send_message(char* msg) {
-     int msglen = strlen(msg);
-     size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
-     void* chunks[3];
-     if (independent_comalloc(3, sizes, chunks) == 0)
-       die();
-     struct Head* head = (struct Head*)(chunks[0]);
-     char* body = (char*)(chunks[1]);
-     struct Foot* foot = (struct Foot*)(chunks[2]);
-     // ...
-   }
-
- In general though, independent_comalloc is worth using only for
- larger values of n_elements. For small values, you probably won't
- detect enough difference from series of malloc calls to bother.
-
- Overuse of independent_comalloc can increase overall memory usage,
- since it cannot reuse existing noncontiguous small chunks that
- might be available for some of the elements.
-*/
-DLMALLOC_EXPORT void** dlindependent_comalloc(size_t, size_t*, void**);
-
-/*
- bulk_free(void* array[], size_t n_elements)
- Frees and clears (sets to null) each non-null pointer in the given
- array. This is likely to be faster than freeing them one-by-one.
- If footers are used, pointers that have been allocated in different
- mspaces are not freed or cleared, and the count of all such pointers
- is returned. For large arrays of pointers with poor locality, it
- may be worthwhile to sort this array before calling bulk_free.
-*/
-DLMALLOC_EXPORT size_t dlbulk_free(void**, size_t n_elements);
-
-/*
- pvalloc(size_t n);
- Equivalent to valloc(minimum-page-that-holds(n)), that is,
- round up n to nearest pagesize.
- */
-DLMALLOC_EXPORT void* dlpvalloc(size_t);
-
-/*
- malloc_trim(size_t pad);
-
- If possible, gives memory back to the system (via negative arguments
- to sbrk) if there is unused memory at the `high' end of the malloc
- pool or in unused MMAP segments. You can call this after freeing
- large blocks of memory to potentially reduce the system-level memory
- requirements of a program. However, it cannot guarantee to reduce
- memory. Under some allocation patterns, some large free blocks of
- memory will be locked between two used chunks, so they cannot be
- given back to the system.
-
- The `pad' argument to malloc_trim represents the amount of free
- trailing space to leave untrimmed. If this argument is zero, only
- the minimum amount of memory to maintain internal data structures
- will be left. Non-zero arguments can be supplied to maintain enough
- trailing space to service future expected allocations without having
- to re-obtain memory from the system.
-
- Malloc_trim returns 1 if it actually released any memory, else 0.
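-
- For example, after releasing a large working set (a sketch; the
- 64KB pad is an illustrative choice, not a recommendation):
-
-   int released = malloc_trim(64 * 1024); // leave 64KB of trailing slack
-   // released == 1 only if memory was actually returned to the system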
-*/ -DLMALLOC_EXPORT int dlmalloc_trim(size_t); - -/* - malloc_stats(); - Prints on stderr the amount of space obtained from the system (both - via sbrk and mmap), the maximum amount (which may be more than - current if malloc_trim and/or munmap got called), and the current - number of bytes allocated via malloc (or realloc, etc) but not yet - freed. Note that this is the number of bytes allocated, not the - number requested. It will be larger than the number requested - because of alignment and bookkeeping overhead. Because it includes - alignment wastage as being in use, this figure may be greater than - zero even when no user-level chunks are allocated. - - The reported current and maximum system memory can be inaccurate if - a program makes other calls to system memory allocation functions - (normally sbrk) outside of malloc. - - malloc_stats prints only the most commonly interesting statistics. - More information can be obtained by calling mallinfo. -*/ -DLMALLOC_EXPORT void dlmalloc_stats(void); - -/* - malloc_usable_size(void* p); - - Returns the number of bytes you can actually use in - an allocated chunk, which may be more than you requested (although - often not) due to alignment and minimum size constraints. - You can use this many bytes without worrying about - overwriting other allocated objects. This is not a particularly great - programming practice. malloc_usable_size can be more useful in - debugging and assertions, for example: - - p = malloc(n); - assert(malloc_usable_size(p) >= 256); -*/ -size_t dlmalloc_usable_size(void*); - -#endif /* ONLY_MSPACES */ - -#if MSPACES - -/* - mspace is an opaque type representing an independent - region of space that supports mspace_malloc, etc. -*/ -typedef void* mspace; - -/* - create_mspace creates and returns a new independent space with the - given initial capacity, or, if 0, the default granularity size. It - returns null if there is no system memory available to create the - space. If argument locked is non-zero, the space uses a separate - lock to control access. The capacity of the space will grow - dynamically as needed to service mspace_malloc requests. You can - control the sizes of incremental increases of this space by - compiling with a different DEFAULT_GRANULARITY or dynamically - setting with mallopt(M_GRANULARITY, value). -*/ -DLMALLOC_EXPORT mspace create_mspace(size_t capacity, int locked); - -/* - destroy_mspace destroys the given space, and attempts to return all - of its memory back to the system, returning the total number of - bytes freed. After destruction, the results of access to all memory - used by the space become undefined. -*/ -DLMALLOC_EXPORT size_t destroy_mspace(mspace msp); - -/* - create_mspace_with_base uses the memory supplied as the initial base - of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this - space is used for bookkeeping, so the capacity must be at least this - large. (Otherwise 0 is returned.) When this initial space is - exhausted, additional memory will be obtained from the system. - Destroying this space will deallocate all additionally allocated - space (if possible) but not the initial base. -*/ -DLMALLOC_EXPORT mspace create_mspace_with_base(void* base, size_t capacity, int locked); -DLMALLOC_EXPORT mspace create_device_mspace_with_base(void* base, size_t capacity, int locked); - -/* - mspace_track_large_chunks controls whether requests for large chunks - are allocated in their own untracked mmapped regions, separate from - others in this mspace. 
By default large chunks are not tracked,
- which reduces fragmentation. However, such chunks are not
- necessarily released to the system upon destroy_mspace. Enabling
- tracking by setting to true may increase fragmentation, but avoids
- leakage when relying on destroy_mspace to release all memory
- allocated using this space. The function returns the previous
- setting.
-*/
-DLMALLOC_EXPORT int mspace_track_large_chunks(mspace msp, int enable);
-
-
-/*
- mspace_malloc behaves as malloc, but operates within
- the given space.
-*/
-DLMALLOC_EXPORT void* mspace_malloc(mspace msp, size_t bytes);
-
-/*
- mspace_free behaves as free, but operates within
- the given space.
-
- If compiled with FOOTERS==1, mspace_free is not actually needed.
- free may be called instead of mspace_free because freed chunks from
- any space are handled by their originating spaces.
-*/
-DLMALLOC_EXPORT void mspace_free(mspace msp, void* mem);
-
-/*
- mspace_realloc behaves as realloc, but operates within
- the given space.
-
- If compiled with FOOTERS==1, mspace_realloc is not actually
- needed. realloc may be called instead of mspace_realloc because
- realloced chunks from any space are handled by their originating
- spaces.
-*/
-DLMALLOC_EXPORT void* mspace_realloc(mspace msp, void* mem, size_t newsize);
-
-/*
- mspace_calloc behaves as calloc, but operates within
- the given space.
-*/
-DLMALLOC_EXPORT void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size);
-
-/*
- mspace_memalign behaves as memalign, but operates within
- the given space.
-*/
-DLMALLOC_EXPORT void* mspace_memalign(mspace msp, size_t alignment, size_t bytes);
-
-/*
- mspace_independent_calloc behaves as independent_calloc, but
- operates within the given space.
-*/
-DLMALLOC_EXPORT void** mspace_independent_calloc(mspace msp, size_t n_elements,
-                                                 size_t elem_size, void* chunks[]);
-
-/*
- mspace_independent_comalloc behaves as independent_comalloc, but
- operates within the given space.
-*/
-DLMALLOC_EXPORT void** mspace_independent_comalloc(mspace msp, size_t n_elements,
-                                                   size_t sizes[], void* chunks[]);
-
-/*
- mspace_footprint() returns the number of bytes obtained from the
- system for this space.
-*/
-DLMALLOC_EXPORT size_t mspace_footprint(mspace msp);
-
-/*
- mspace_max_footprint() returns the peak number of bytes obtained from the
- system for this space.
-*/
-DLMALLOC_EXPORT size_t mspace_max_footprint(mspace msp);
-
-
-#if !NO_MALLINFO
-/*
- mspace_mallinfo behaves as mallinfo, but reports properties of
- the given space.
-*/
-DLMALLOC_EXPORT struct mallinfo mspace_mallinfo(mspace msp);
-#endif /* NO_MALLINFO */
-
-/*
- mspace_usable_size(const void* p) behaves the same as malloc_usable_size.
-*/
-DLMALLOC_EXPORT size_t mspace_usable_size(const void* mem);
-
-/*
- mspace_malloc_stats behaves as malloc_stats, but reports
- properties of the given space.
-*/
-DLMALLOC_EXPORT void mspace_malloc_stats(mspace msp);
-
-/*
- mspace_trim behaves as malloc_trim, but
- operates within the given space.
-*/
-DLMALLOC_EXPORT int mspace_trim(mspace msp, size_t pad);
-
-/*
- An alias for mallopt.
-*/
-DLMALLOC_EXPORT int mspace_mallopt(int, int);
-
-#endif /* MSPACES */
-
-#ifdef __cplusplus
-} /* end of extern "C" */
-#endif /* __cplusplus */
-
-/*
- ========================================================================
- To make a fully customizable malloc.h header file, cut everything
- above this line, put into file malloc.h, edit to suit, and #include it
- on the next line, as well as in programs that use this malloc.
- ========================================================================
-*/
-
-/* #include "malloc.h" */
-
-/*------------------------------ internal #includes ---------------------- */
-
-#ifdef _MSC_VER
-#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
-#endif /* _MSC_VER */
-#if !NO_MALLOC_STATS
-#include <stdio.h> /* for printing in malloc_stats */
-#endif /* NO_MALLOC_STATS */
-#ifndef LACKS_ERRNO_H
-#include <errno.h> /* for MALLOC_FAILURE_ACTION */
-#endif /* LACKS_ERRNO_H */
-#ifdef DEBUG
-#if ABORT_ON_ASSERT_FAILURE
-#undef assert
-#define assert(x) if(!(x)) ABORT
-#else /* ABORT_ON_ASSERT_FAILURE */
-#include <assert.h>
-#endif /* ABORT_ON_ASSERT_FAILURE */
-#else /* DEBUG */
-#ifndef assert
-#define assert(x)
-#endif
-#define DEBUG 0
-#endif /* DEBUG */
-#if !defined(WIN32) && !defined(LACKS_TIME_H)
-#include <time.h> /* for magic initialization */
-#endif /* WIN32 */
-#ifndef LACKS_STDLIB_H
-#include <stdlib.h> /* for abort() */
-#endif /* LACKS_STDLIB_H */
-#ifndef LACKS_STRING_H
-#include <string.h> /* for memset etc */
-#endif /* LACKS_STRING_H */
-#if USE_BUILTIN_FFS
-#ifndef LACKS_STRINGS_H
-#include <strings.h> /* for ffs */
-#endif /* LACKS_STRINGS_H */
-#endif /* USE_BUILTIN_FFS */
-#if HAVE_MMAP
-#ifndef LACKS_SYS_MMAN_H
-/* On some versions of linux, mremap decl in mman.h needs __USE_GNU set */
-#if (defined(linux) && !defined(__USE_GNU))
-#define __USE_GNU 1
-#include <sys/mman.h> /* for mmap */
-#undef __USE_GNU
-#else
-#include <sys/mman.h> /* for mmap */
-#endif /* linux */
-#endif /* LACKS_SYS_MMAN_H */
-#ifndef LACKS_FCNTL_H
-#include <fcntl.h>
-#endif /* LACKS_FCNTL_H */
-#endif /* HAVE_MMAP */
-#ifndef LACKS_UNISTD_H
-#include <unistd.h> /* for sbrk, sysconf */
-#else /* LACKS_UNISTD_H */
-#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
-extern void* sbrk(ptrdiff_t);
-#endif /* FreeBSD etc */
-#endif /* LACKS_UNISTD_H */
-
-/* Declarations for locking */
-#if USE_LOCKS
-#ifndef WIN32
-#if defined (__SVR4) && defined (__sun) /* solaris */
-#include <thread.h>
-#elif !defined(LACKS_SCHED_H)
-#include <sched.h>
-#endif /* solaris or LACKS_SCHED_H */
-#if (defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0) || !USE_SPIN_LOCKS
-#include <pthread.h>
-#endif /* USE_RECURSIVE_LOCKS ... */
-#elif defined(_MSC_VER)
-#ifndef _M_AMD64
-/* These are already defined on AMD64 builds */
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp);
-LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value);
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-#endif /* _M_AMD64 */
-#pragma intrinsic (_InterlockedCompareExchange)
-#pragma intrinsic (_InterlockedExchange)
-#define interlockedcompareexchange _InterlockedCompareExchange
-#define interlockedexchange _InterlockedExchange
-#elif defined(WIN32) && defined(__GNUC__)
-#define interlockedcompareexchange(a, b, c) __sync_val_compare_and_swap(a, c, b)
-#define interlockedexchange __sync_lock_test_and_set
-#endif /* Win32 */
-#else /* USE_LOCKS */
-#endif /* USE_LOCKS */
-
-#ifndef LOCK_AT_FORK
-#define LOCK_AT_FORK 0
-#endif
-
-/* Declarations for bit scanning on win32 */
-#if defined(_MSC_VER) && _MSC_VER>=1300
-#ifndef BitScanForward /* Try to avoid pulling in WinNT.h */
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
-unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#define BitScanForward _BitScanForward
-#define BitScanReverse _BitScanReverse
-#pragma intrinsic(_BitScanForward)
-#pragma intrinsic(_BitScanReverse)
-#endif /* BitScanForward */
-#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
-
-#ifndef WIN32
-#ifndef malloc_getpagesize
-# ifdef _SC_PAGESIZE /* some SVR4 systems omit an underscore */
-# ifndef _SC_PAGE_SIZE
-# define _SC_PAGE_SIZE _SC_PAGESIZE
-# endif
-# endif
-# ifdef _SC_PAGE_SIZE
-# define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
-# else
-# if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
- extern size_t getpagesize();
-# define malloc_getpagesize getpagesize()
-# else
-# ifdef WIN32 /* use supplied emulation of getpagesize */
-# define malloc_getpagesize getpagesize()
-# else
-# ifndef LACKS_SYS_PARAM_H
-# include <sys/param.h>
-# endif
-# ifdef EXEC_PAGESIZE
-# define malloc_getpagesize EXEC_PAGESIZE
-# else
-# ifdef NBPG
-# ifndef CLSIZE
-# define malloc_getpagesize NBPG
-# else
-# define malloc_getpagesize (NBPG * CLSIZE)
-# endif
-# else
-# ifdef NBPC
-# define malloc_getpagesize NBPC
-# else
-# ifdef PAGESIZE
-# define malloc_getpagesize PAGESIZE
-# else /* just guess */
-# define malloc_getpagesize ((size_t)4096U)
-# endif
-# endif
-# endif
-# endif
-# endif
-# endif
-# endif
-#endif
-#endif
-
-/* ------------------- size_t and alignment properties -------------------- */
-
-/* The byte and bit size of a size_t */
-#define SIZE_T_SIZE (sizeof(size_t))
-#define SIZE_T_BITSIZE (sizeof(size_t) << 3)
-
-/* Some constants coerced to size_t */
-/* Annoying but necessary to avoid errors on some platforms */
-#define SIZE_T_ZERO ((size_t)0)
-#define SIZE_T_ONE ((size_t)1)
-#define SIZE_T_TWO ((size_t)2)
-#define SIZE_T_FOUR ((size_t)4)
-#define TWO_SIZE_T_SIZES (SIZE_T_SIZE<<1)
-#define FOUR_SIZE_T_SIZES (SIZE_T_SIZE<<2)
-#define SIX_SIZE_T_SIZES (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES)
-#define HALF_MAX_SIZE_T (MAX_SIZE_T / 2U)
-
-/* The bit mask value corresponding to MALLOC_ALIGNMENT */
-#define CHUNK_ALIGN_MASK (MALLOC_ALIGNMENT - SIZE_T_ONE)
-
-/* True if address a has acceptable alignment */
-#define is_aligned(A) (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0)
-
-/* the number of bytes to offset an address to align it */
-#define align_offset(A)\
- ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\
-  ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK))
-
-/* -------------------------- MMAP preliminaries ------------------------- */
-
-/*
- If HAVE_MORECORE or HAVE_MMAP are false, we just define calls and
- checks to fail so the compiler optimizer can delete code rather than
- using so many "#if"s.
-*/
-
-
-/* MORECORE and MMAP must return MFAIL on failure */
-#define MFAIL ((void*)(MAX_SIZE_T))
-#define CMFAIL ((char*)(MFAIL)) /* defined for convenience */
-
-#if HAVE_MMAP
-
-#ifndef WIN32
-#define MUNMAP_DEFAULT(a, s) munmap((a), (s))
-#define MMAP_PROT (PROT_READ|PROT_WRITE)
-#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
-#define MAP_ANONYMOUS MAP_ANON
-#endif /* MAP_ANON */
-#ifdef MAP_ANONYMOUS
-#define MMAP_FLAGS (MAP_PRIVATE|MAP_ANONYMOUS)
-#define MMAP_DEFAULT(s) mmap(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0)
-#else /* MAP_ANONYMOUS */
-/*
- Nearly all versions of mmap support MAP_ANONYMOUS, so the following
- is unlikely to be needed, but is supplied just in case.
-*/
-#define MMAP_FLAGS (MAP_PRIVATE)
-static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
-#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? \
- (dev_zero_fd = open("/dev/zero", O_RDWR), \
- mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \
- mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0))
-#endif /* MAP_ANONYMOUS */
-
-#define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s)
-
-#else /* WIN32 */
-
-/* Win32 MMAP via VirtualAlloc */
-static FORCEINLINE void* win32mmap(size_t size) {
- void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
- return (ptr != 0)? ptr: MFAIL;
-}
-
-/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
-static FORCEINLINE void* win32direct_mmap(size_t size) {
- void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
-                          PAGE_READWRITE);
- return (ptr != 0)? ptr: MFAIL;
-}
-
-/* This function supports releasing coalesced segments */
-static FORCEINLINE int win32munmap(void* ptr, size_t size) {
- MEMORY_BASIC_INFORMATION minfo;
- char* cptr = (char*)ptr;
- while (size) {
-   if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
-     return -1;
-   if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
-       minfo.State != MEM_COMMIT || minfo.RegionSize > size)
-     return -1;
-   if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
-     return -1;
-   cptr += minfo.RegionSize;
-   size -= minfo.RegionSize;
- }
- return 0;
-}
-
-#define MMAP_DEFAULT(s) win32mmap(s)
-#define MUNMAP_DEFAULT(a, s) win32munmap((a), (s))
-#define DIRECT_MMAP_DEFAULT(s) win32direct_mmap(s)
-#endif /* WIN32 */
-#endif /* HAVE_MMAP */
-
-#if HAVE_MREMAP
-#ifndef WIN32
-#define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
-#endif /* WIN32 */
-#endif /* HAVE_MREMAP */
-
-/**
- * Define CALL_MORECORE
- */
-#if HAVE_MORECORE
- #ifdef MORECORE
- #define CALL_MORECORE(S) MORECORE(S)
- #else /* MORECORE */
- #define CALL_MORECORE(S) MORECORE_DEFAULT(S)
- #endif /* MORECORE */
-#else /* HAVE_MORECORE */
- #define CALL_MORECORE(S) MFAIL
-#endif /* HAVE_MORECORE */
-
-/**
- * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP
- */
-#if HAVE_MMAP
- #define USE_MMAP_BIT (SIZE_T_ONE)
-
- #ifdef MMAP
- #define CALL_MMAP(s) MMAP(s)
- #else /* MMAP */
- #define CALL_MMAP(s) MMAP_DEFAULT(s)
- #endif /* MMAP */
- #ifdef MUNMAP
- #define CALL_MUNMAP(a, s) MUNMAP((a), (s))
- #else /* MUNMAP */
- #define CALL_MUNMAP(a, s) MUNMAP_DEFAULT((a), (s))
- #endif /* MUNMAP */
- #ifdef DIRECT_MMAP
- #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
- #else /* DIRECT_MMAP */
- #define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
- #endif /* DIRECT_MMAP */
-#else /* HAVE_MMAP */
- #define USE_MMAP_BIT (SIZE_T_ZERO)
-
- #define MMAP(s) MFAIL
- #define MUNMAP(a, s) (-1)
- #define DIRECT_MMAP(s) MFAIL
- #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
- #define CALL_MMAP(s) MMAP(s)
- #define CALL_MUNMAP(a, s) MUNMAP((a), (s))
-#endif /* HAVE_MMAP */
-
-/**
- * Define CALL_MREMAP
- */
-#if HAVE_MMAP && HAVE_MREMAP
- #ifdef MREMAP
- #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP((addr), (osz), (nsz), (mv))
- #else /* MREMAP */
- #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP_DEFAULT((addr), (osz), (nsz), (mv))
- #endif /* MREMAP */
-#else /* HAVE_MMAP && HAVE_MREMAP */
- #define CALL_MREMAP(addr, osz, nsz, mv) MFAIL
-#endif /* HAVE_MMAP && HAVE_MREMAP */
-
-/* mstate bit set if contiguous morecore disabled or failed */
-#define USE_NONCONTIGUOUS_BIT (4U)
-
-/* segment bit set in create_mspace_with_base */
-#define EXTERN_BIT (8U)
-
-
-/* --------------------------- Lock preliminaries ------------------------ */
-
-/*
- When locks are defined, there is one global lock, plus
- one per-mspace lock.
-
- The global lock ensures that mparams.magic and other unique
- mparams values are initialized only once. It also protects
- sequences of calls to MORECORE. In many cases sys_alloc requires
- two calls that should not be interleaved with calls by other
- threads. This does not protect against direct calls to MORECORE
- by other threads not using this lock, so there is still code to
- cope as best we can with interference.
-
- Per-mspace locks surround calls to malloc, free, etc.
- By default, locks are simple non-reentrant mutexes.
-
- Because lock-protected regions generally have bounded times, it is
- OK to use the supplied simple spinlocks. Spinlocks are likely to
Spinlocks are likely to - improve performance for lightly contended applications, but worsen - performance under heavy contention. - - If USE_LOCKS is > 1, the definitions of lock routines here are - bypassed, in which case you will need to define the type MLOCK_T, - and at least INITIAL_LOCK, DESTROY_LOCK, ACQUIRE_LOCK, RELEASE_LOCK - and TRY_LOCK. You must also declare a - static MLOCK_T malloc_global_mutex = { initialization values };. - -*/ - -#if !USE_LOCKS -#define USE_LOCK_BIT (0U) -#define INITIAL_LOCK(l) (0) -#define DESTROY_LOCK(l) (0) -#define ACQUIRE_MALLOC_GLOBAL_LOCK() -#define RELEASE_MALLOC_GLOBAL_LOCK() - -#else -#if USE_LOCKS > 1 -/* ----------------------- User-defined locks ------------------------ */ -/* Define your own lock implementation here */ -/* #define INITIAL_LOCK(lk) ... */ -/* #define DESTROY_LOCK(lk) ... */ -/* #define ACQUIRE_LOCK(lk) ... */ -/* #define RELEASE_LOCK(lk) ... */ -/* #define TRY_LOCK(lk) ... */ -/* static MLOCK_T malloc_global_mutex = ... */ - -#elif USE_SPIN_LOCKS - -/* First, define CAS_LOCK and CLEAR_LOCK on ints */ -/* Note CAS_LOCK defined to return 0 on success */ - -#if defined(__GNUC__)&& (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) -#define CAS_LOCK(sl) __sync_lock_test_and_set(sl, 1) -#define CLEAR_LOCK(sl) __sync_lock_release(sl) - -#elif (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) -/* Custom spin locks for older gcc on x86 */ -static FORCEINLINE int x86_cas_lock(int *sl) { - int ret; - int val = 1; - int cmp = 0; - __asm__ __volatile__ ("lock; cmpxchgl %1, %2" - : "=a" (ret) - : "r" (val), "m" (*(sl)), "0"(cmp) - : "memory", "cc"); - return ret; -} - -static FORCEINLINE void x86_clear_lock(int* sl) { - assert(*sl != 0); - int prev = 0; - int ret; - __asm__ __volatile__ ("lock; xchgl %0, %1" - : "=r" (ret) - : "m" (*(sl)), "0"(prev) - : "memory"); -} - -#define CAS_LOCK(sl) x86_cas_lock(sl) -#define CLEAR_LOCK(sl) x86_clear_lock(sl) - -#else /* Win32 MSC */ -#define CAS_LOCK(sl) interlockedexchange(sl, (LONG)1) -#define CLEAR_LOCK(sl) interlockedexchange (sl, (LONG)0) - -#endif /* ... gcc spins locks ... */ - -/* How to yield for a spin lock */ -#define SPINS_PER_YIELD 63 -#if defined(_MSC_VER) -#define SLEEP_EX_DURATION 50 /* delay for yield/sleep */ -#define SPIN_LOCK_YIELD SleepEx(SLEEP_EX_DURATION, FALSE) -#elif defined (__SVR4) && defined (__sun) /* solaris */ -#define SPIN_LOCK_YIELD thr_yield(); -#elif !defined(LACKS_SCHED_H) -#define SPIN_LOCK_YIELD sched_yield(); -#else -#define SPIN_LOCK_YIELD -#endif /* ... yield ... */ - -#if !defined(USE_RECURSIVE_LOCKS) || USE_RECURSIVE_LOCKS == 0 -/* Plain spin locks use single word (embedded in malloc_states) */ -static int spin_acquire_lock(int *sl) { - int spins = 0; - while (*(volatile int *)sl != 0 || CAS_LOCK(sl)) { - if ((++spins & SPINS_PER_YIELD) == 0) { - SPIN_LOCK_YIELD; - } - } - return 0; -} - -#define MLOCK_T int -#define TRY_LOCK(sl) !CAS_LOCK(sl) -#define RELEASE_LOCK(sl) CLEAR_LOCK(sl) -#define ACQUIRE_LOCK(sl) (CAS_LOCK(sl)? spin_acquire_lock(sl) : 0) -#define INITIAL_LOCK(sl) (*sl = 0) -#define DESTROY_LOCK(sl) (0) -static MLOCK_T malloc_global_mutex = 0; - -#else /* USE_RECURSIVE_LOCKS */ -/* types for lock owners */ -#ifdef WIN32 -#define THREAD_ID_T DWORD -#define CURRENT_THREAD GetCurrentThreadId() -#define EQ_OWNER(X,Y) ((X) == (Y)) -#else -/* - Note: the following assume that pthread_t is a type that can be - initialized to (casted) zero. If this is not the case, you will need to - somehow redefine these or not use spin locks. 
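  As a standalone illustration (not part of the original file), the
  owner-tracking recursive lock implemented just below reduces to roughly
  the following sketch.  The names rlock_t, rlock_acquire and
  rlock_release are invented for this example, and GCC builtins stand in
  for the CAS_LOCK/CLEAR_LOCK primitives defined above; it also inherits
  the same narrow hand-off window as the original, so treat it as a
  sketch of the idea, not a hardened lock.

      #include <pthread.h>
      #include <sched.h>

      typedef struct {
        int sl;              // 0 = free, 1 = held
        unsigned int count;  // recursion depth of the owner
        pthread_t owner;     // meaningful only while sl != 0
      } rlock_t;

      static void rlock_acquire(rlock_t *lk) {
        pthread_t me = pthread_self();
        for (;;) {
          if (lk->sl == 0 && __sync_lock_test_and_set(&lk->sl, 1) == 0) {
            lk->owner = me;          // won the race: record ownership
            lk->count = 1;
            return;
          }
          if (lk->sl != 0 && pthread_equal(lk->owner, me)) {
            ++lk->count;             // re-entry by the owning thread
            return;
          }
          sched_yield();             // contended: spin politely
        }
      }

      static void rlock_release(rlock_t *lk) {
        if (--lk->count == 0)
          __sync_lock_release(&lk->sl);  // outermost exit frees the lock
      }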
-*/ -#define THREAD_ID_T pthread_t -#define CURRENT_THREAD pthread_self() -#define EQ_OWNER(X,Y) pthread_equal(X, Y) -#endif - -struct malloc_recursive_lock { - int sl; - unsigned int c; - THREAD_ID_T threadid; -}; - -#define MLOCK_T struct malloc_recursive_lock -static MLOCK_T malloc_global_mutex = { 0, 0, (THREAD_ID_T)0}; - -static FORCEINLINE void recursive_release_lock(MLOCK_T *lk) { - assert(lk->sl != 0); - if (--lk->c == 0) { - CLEAR_LOCK(&lk->sl); - } -} - -static FORCEINLINE int recursive_acquire_lock(MLOCK_T *lk) { - THREAD_ID_T mythreadid = CURRENT_THREAD; - int spins = 0; - for (;;) { - if (*((volatile int *)(&lk->sl)) == 0) { - if (!CAS_LOCK(&lk->sl)) { - lk->threadid = mythreadid; - lk->c = 1; - return 0; - } - } - else if (EQ_OWNER(lk->threadid, mythreadid)) { - ++lk->c; - return 0; - } - if ((++spins & SPINS_PER_YIELD) == 0) { - SPIN_LOCK_YIELD; - } - } -} - -static FORCEINLINE int recursive_try_lock(MLOCK_T *lk) { - THREAD_ID_T mythreadid = CURRENT_THREAD; - if (*((volatile int *)(&lk->sl)) == 0) { - if (!CAS_LOCK(&lk->sl)) { - lk->threadid = mythreadid; - lk->c = 1; - return 1; - } - } - else if (EQ_OWNER(lk->threadid, mythreadid)) { - ++lk->c; - return 1; - } - return 0; -} - -#define RELEASE_LOCK(lk) recursive_release_lock(lk) -#define TRY_LOCK(lk) recursive_try_lock(lk) -#define ACQUIRE_LOCK(lk) recursive_acquire_lock(lk) -#define INITIAL_LOCK(lk) ((lk)->threadid = (THREAD_ID_T)0, (lk)->sl = 0, (lk)->c = 0) -#define DESTROY_LOCK(lk) (0) -#endif /* USE_RECURSIVE_LOCKS */ - -#elif defined(WIN32) /* Win32 critical sections */ -#define MLOCK_T CRITICAL_SECTION -#define ACQUIRE_LOCK(lk) (EnterCriticalSection(lk), 0) -#define RELEASE_LOCK(lk) LeaveCriticalSection(lk) -#define TRY_LOCK(lk) TryEnterCriticalSection(lk) -#define INITIAL_LOCK(lk) (!InitializeCriticalSectionAndSpinCount((lk), 0x80000000|4000)) -#define DESTROY_LOCK(lk) (DeleteCriticalSection(lk), 0) -#define NEED_GLOBAL_LOCK_INIT - -static MLOCK_T malloc_global_mutex; -static volatile LONG malloc_global_mutex_status; - -/* Use spin loop to initialize global lock */ -static void init_malloc_global_mutex() { - for (;;) { - long stat = malloc_global_mutex_status; - if (stat > 0) - return; - /* transition to < 0 while initializing, then to > 0) */ - if (stat == 0 && - interlockedcompareexchange(&malloc_global_mutex_status, (LONG)-1, (LONG)0) == 0) { - InitializeCriticalSection(&malloc_global_mutex); - interlockedexchange(&malloc_global_mutex_status, (LONG)1); - return; - } - SleepEx(0, FALSE); - } -} - -#else /* pthreads-based locks */ -#define MLOCK_T pthread_mutex_t -#define ACQUIRE_LOCK(lk) pthread_mutex_lock(lk) -#define RELEASE_LOCK(lk) pthread_mutex_unlock(lk) -#define TRY_LOCK(lk) (!pthread_mutex_trylock(lk)) -#define INITIAL_LOCK(lk) pthread_init_lock(lk) -#define DESTROY_LOCK(lk) pthread_mutex_destroy(lk) - -#if defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0 && defined(linux) && !defined(PTHREAD_MUTEX_RECURSIVE) -/* Cope with old-style linux recursive lock initialization by adding */ -/* skipped internal declaration from pthread.h */ -extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr, - int __kind)); -#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP -#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y) -#endif /* USE_RECURSIVE_LOCKS ... 
*/ - -static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER; - -static int pthread_init_lock (MLOCK_T *lk) { - pthread_mutexattr_t attr; - if (pthread_mutexattr_init(&attr)) return 1; -#if defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0 - if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1; -#endif - if (pthread_mutex_init(lk, &attr)) return 1; - if (pthread_mutexattr_destroy(&attr)) return 1; - return 0; -} - -#endif /* ... lock types ... */ - -/* Common code for all lock types */ -#define USE_LOCK_BIT (2U) - -#ifndef ACQUIRE_MALLOC_GLOBAL_LOCK -#define ACQUIRE_MALLOC_GLOBAL_LOCK() ACQUIRE_LOCK(&malloc_global_mutex); -#endif - -#ifndef RELEASE_MALLOC_GLOBAL_LOCK -#define RELEASE_MALLOC_GLOBAL_LOCK() RELEASE_LOCK(&malloc_global_mutex); -#endif - -#endif /* USE_LOCKS */ - -/* ----------------------- Chunk representations ------------------------ */ - -/* - (The following includes lightly edited explanations by Colin Plumb.) - - The malloc_chunk declaration below is misleading (but accurate and - necessary). It declares a "view" into memory allowing access to - necessary fields at known offsets from a given base. - - Chunks of memory are maintained using a `boundary tag' method as - originally described by Knuth. (See the paper by Paul Wilson - ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such - techniques.) Sizes of free chunks are stored both in the front of - each chunk and at the end. This makes consolidating fragmented - chunks into bigger chunks fast. The head fields also hold bits - representing whether chunks are free or in use. - - Here are some pictures to make it clearer. They are "exploded" to - show that the state of a chunk can be thought of as extending from - the high 31 bits of the head field of its header through the - prev_foot and PINUSE_BIT bit of the following chunk header. - - A chunk that's in use looks like: - - chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Size of previous chunk (if P = 0) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| - | Size of this chunk 1| +-+ - mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - +- -+ - | | - +- -+ - | : - +- size - sizeof(size_t) available payload bytes -+ - : | - chunk-> +- -+ - | | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1| - | Size of next chunk (may or may not be in use) | +-+ - mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - - And if it's free, it looks like this: - - chunk-> +- -+ - | User payload (must be in use, or we would have merged!) 
| - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| - | Size of this chunk 0| +-+ - mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Next pointer | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Prev pointer | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | : - +- size - sizeof(struct chunk) unused bytes -+ - : | - chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Size of this chunk | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0| - | Size of next chunk (must be in use, or we would have merged)| +-+ - mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | : - +- User payload -+ - : | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - |0| - +-+ - Note that since we always merge adjacent free chunks, the chunks - adjacent to a free chunk must be in use. - - Given a pointer to a chunk (which can be derived trivially from the - payload pointer) we can, in O(1) time, find out whether the adjacent - chunks are free, and if so, unlink them from the lists that they - are on and merge them with the current chunk. - - Chunks always begin on even word boundaries, so the mem portion - (which is returned to the user) is also on an even word boundary, and - thus at least double-word aligned. - - The P (PINUSE_BIT) bit, stored in the unused low-order bit of the - chunk size (which is always a multiple of two words), is an in-use - bit for the *previous* chunk. If that bit is *clear*, then the - word before the current chunk size contains the previous chunk - size, and can be used to find the front of the previous chunk. - The very first chunk allocated always has this bit set, preventing - access to non-existent (or non-owned) memory. If pinuse is set for - any given chunk, then you CANNOT determine the size of the - previous chunk, and might even get a memory addressing fault when - trying to do so. - - The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of - the chunk size redundantly records whether the current chunk is - inuse (unless the chunk is mmapped). This redundancy enables usage - checks within free and realloc, and reduces indirection when freeing - and consolidating chunks. - - Each freshly allocated chunk must have both cinuse and pinuse set. - That is, each allocated chunk borders either a previously allocated - and still in-use chunk, or the base of its memory arena. This is - ensured by making all allocations from the `lowest' part of any - found chunk. Further, no free chunk physically borders another one, - so each free chunk is known to be preceded and followed by either - inuse chunks or the ends of memory. - - Note that the `foot' of the current chunk is actually represented - as the prev_foot of the NEXT chunk. This makes it easier to - deal with alignments etc but can be very confusing when trying - to extend or adapt this code. - - The exceptions to all this are - - 1. The special chunk `top' is the top-most available chunk (i.e., - the one bordering the end of available memory). It is treated - specially. Top is never included in any bin, is used only if - no other chunk is available, and is released back to the - system if it is very large (see M_TRIM_THRESHOLD). 
In effect,
-        the top chunk is treated as larger (and thus less well
-        fitting) than any other available chunk.  The top chunk
-        doesn't update its trailing size field since there is no next
-        contiguous chunk that would have to index off it.  However,
-        space is still allocated for it (TOP_FOOT_SIZE) to enable
-        separation or merging when space is extended.
-
-     2. Chunks allocated via mmap have both cinuse and pinuse bits
-        cleared in their head fields.  Because they are allocated
-        one-by-one, each must carry its own prev_foot field, which is
-        also used to hold the offset this chunk has within its mmapped
-        region, which is needed to preserve alignment. Each mmapped
-        chunk is trailed by the first two fields of a fake next-chunk
-        for the sake of usage checks.
-
-*/
-
-struct malloc_chunk {
-  size_t               prev_foot;  /* Size of previous chunk (if free).  */
-  size_t               head;       /* Size and inuse bits. */
-  struct malloc_chunk* fd;         /* double links -- used only if free. */
-  struct malloc_chunk* bk;
-};
-
-typedef struct malloc_chunk  mchunk;
-typedef struct malloc_chunk* mchunkptr;
-typedef struct malloc_chunk* sbinptr;  /* The type of bins of chunks */
-typedef unsigned int bindex_t;         /* Described below */
-typedef unsigned int binmap_t;         /* Described below */
-typedef unsigned int flag_t;           /* The type of various bit flag sets */
-
-/* ------------------- Chunk sizes and alignments ------------------------ */
-
-#define MCHUNK_SIZE         (sizeof(mchunk))
-
-#if FOOTERS
-#define CHUNK_OVERHEAD      (TWO_SIZE_T_SIZES)
-#else /* FOOTERS */
-#define CHUNK_OVERHEAD      (SIZE_T_SIZE)
-#endif /* FOOTERS */
-
-/* MMapped chunks need a second word of overhead ... */
-#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES)
-/* ... and additional padding for fake next-chunk at foot */
-#define MMAP_FOOT_PAD       (FOUR_SIZE_T_SIZES)
-
-/* The smallest size we can malloc is an aligned minimal chunk */
-#define MIN_CHUNK_SIZE\
-  ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
-
-/* conversion from malloc headers to user pointers, and back */
-#define chunk2mem(p)        ((void*)((char*)(p)       + TWO_SIZE_T_SIZES))
-#define mem2chunk(mem)      ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES))
-/* chunk associated with aligned address A */
-#define align_as_chunk(A)   (mchunkptr)((A) + align_offset(chunk2mem(A)))
-
-/* Bounds on request (not chunk) sizes. */
-#define MAX_REQUEST         ((-MIN_CHUNK_SIZE) << 2)
-#define MIN_REQUEST         (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE)
-
-/* pad request bytes into a usable size */
-#define pad_request(req) \
-   (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
-
-/* pad request, checking for minimum (but not maximum) */
-#define request2size(req) \
-  (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req))
-
-
-/* ------------------ Operations on head and foot fields ----------------- */
-
-/*
-  The head field of a chunk is or'ed with PINUSE_BIT when the previous
-  adjacent chunk is in use, and or'ed with CINUSE_BIT if this chunk is in
-  use, unless mmapped, in which case both bits are cleared.
-
-  FLAG4_BIT is not used by this malloc, but might be useful in extensions.
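  A tiny self-contained sketch of this encoding, using PB and CB as
  stand-ins for the PINUSE_BIT and CINUSE_BIT macros defined just below
  (the ~7 mask plays the role of ~FLAG_BITS):

      #include <assert.h>
      #include <stddef.h>

      #define PB ((size_t)1)               // previous chunk in use
      #define CB ((size_t)2)               // this chunk in use

      int main(void) {
        size_t head = (size_t)64 | PB | CB;  // 64-byte inuse chunk, prev inuse
        assert((head & ~(size_t)7) == 64);   // chunksize(): strip flag bits
        assert(head & PB);                   // pinuse()
        assert(head & CB);                   // cinuse()
        head &= ~PB;                         // clear_pinuse()
        assert(!(head & PB));
        return 0;
      }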
-*/ - -#define PINUSE_BIT (SIZE_T_ONE) -#define CINUSE_BIT (SIZE_T_TWO) -#define FLAG4_BIT (SIZE_T_FOUR) -#define INUSE_BITS (PINUSE_BIT|CINUSE_BIT) -#define FLAG_BITS (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT) - -/* Head value for fenceposts */ -#define FENCEPOST_HEAD (INUSE_BITS|SIZE_T_SIZE) - -/* extraction of fields from head words */ -#define cinuse(p) ((p)->head & CINUSE_BIT) -#define pinuse(p) ((p)->head & PINUSE_BIT) -#define flag4inuse(p) ((p)->head & FLAG4_BIT) -#define is_inuse(p) (((p)->head & INUSE_BITS) != PINUSE_BIT) -#define is_mmapped(p) (((p)->head & INUSE_BITS) == 0) - -#define chunksize(p) ((p)->head & ~(FLAG_BITS)) - -#define clear_pinuse(p) ((p)->head &= ~PINUSE_BIT) -#define set_flag4(p) ((p)->head |= FLAG4_BIT) -#define clear_flag4(p) ((p)->head &= ~FLAG4_BIT) - -/* Treat space at ptr +/- offset as a chunk */ -#define chunk_plus_offset(p, s) ((mchunkptr)(((char*)(p)) + (s))) -#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s))) - -/* Ptr to next or previous physical malloc_chunk. */ -#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS))) -#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) )) - -/* extract next chunk's pinuse bit */ -#define next_pinuse(p) ((next_chunk(p)->head) & PINUSE_BIT) - -/* Get/set size at footer */ -#define get_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot) -#define set_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s)) - -/* Set size, pinuse bit, and foot */ -#define set_size_and_pinuse_of_free_chunk(p, s)\ - ((p)->head = (s|PINUSE_BIT), set_foot(p, s)) - -/* Set size, pinuse bit, foot, and clear next pinuse */ -#define set_free_with_pinuse(p, s, n)\ - (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s)) - -/* Get the internal overhead associated with chunk p */ -#define overhead_for(p)\ - (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD) - -/* Return true if malloced space is not necessarily cleared */ -#if MMAP_CLEARS -#define calloc_must_clear(p) (!is_mmapped(p)) -#else /* MMAP_CLEARS */ -#define calloc_must_clear(p) (1) -#endif /* MMAP_CLEARS */ - -/* ---------------------- Overlaid data structures ----------------------- */ - -/* - When chunks are not in use, they are treated as nodes of either - lists or trees. - - "Small" chunks are stored in circular doubly-linked lists, and look - like this: - - chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Size of previous chunk | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - `head:' | Size of chunk, in bytes |P| - mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Forward pointer to next chunk in list | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Back pointer to previous chunk in list | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Unused space (may be 0 bytes long) . - . . - . | -nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - `foot:' | Size of chunk, in bytes | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - - Larger chunks are kept in a form of bitwise digital trees (aka - tries) keyed on chunksizes. Because malloc_tree_chunks are only for - free chunks greater than 256 bytes, their size doesn't impose any - constraints on user chunk sizes. 
Each node looks like:
-
-   chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             Size of previous chunk                            |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-   `head:' |             Size of chunk, in bytes                         |P|
-     mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             Forward pointer to next chunk of same size        |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             Back pointer to previous chunk of same size       |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             Pointer to left child (child[0])                  |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             Pointer to right child (child[1])                 |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             Pointer to parent                                 |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             bin index of this chunk                           |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-           |             Unused space                                      .
-           .                                                               |
-nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-   `foot:' |             Size of chunk, in bytes                           |
-           +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-  Each tree holding treenodes is a tree of unique chunk sizes.  Chunks
-  of the same size are arranged in a circularly-linked list, with only
-  the oldest chunk (the next to be used, in our FIFO ordering)
-  actually in the tree.  (Tree members are distinguished by a non-null
-  parent pointer.)  If a chunk with the same size as an existing node
-  is inserted, it is linked off the existing node using pointers that
-  work in the same way as fd/bk pointers of small chunks.
-
-  Each tree contains a power of 2 sized range of chunk sizes (the
-  smallest is 0x100 <= x < 0x180), which is divided in half at each
-  tree level, with the chunks in the smaller half of the range (0x100
-  <= x < 0x140 for the top node) in the left subtree and the larger
-  half (0x140 <= x < 0x180) in the right subtree.  This is, of course,
-  done by inspecting individual bits.
-
-  Using these rules, each node's left subtree contains all smaller
-  sizes than its right subtree.  However, the node at the root of each
-  subtree has no particular ordering relationship to either.  (The
-  dividing line between the subtree sizes is based on trie relation.)
-  If we remove the last chunk of a given size from the interior of the
-  tree, we need to replace it with a leaf node.  The tree ordering
-  rules permit a node to be replaced by any leaf below it.
-
-  The smallest chunk in a tree (a common operation in a best-fit
-  allocator) can be found by walking a path to the leftmost leaf in
-  the tree.  Unlike a usual binary tree, where we follow left child
-  pointers until we reach a null, here we follow the right child
-  pointer any time the left one is null, until we reach a leaf with
-  both child pointers null. The smallest chunk in the tree will be
-  somewhere along that path.
-
-  The worst case number of steps to add, find, or remove a node is
-  bounded by the number of bits differentiating chunks within
-  bins. Under current bin calculations, this ranges from 6 up to 21
-  (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case
-  is of course much better.
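  For concreteness, here is the same bin-index rule as a plain function,
  assuming TREEBIN_SHIFT == 8 and NTREEBINS == 32 as defined later in
  this file; tree_index_for is an invented name, and the real code uses
  the compute_tree_index macro defined below:

      #include <assert.h>
      #include <stddef.h>

      // K is the position of the leading bit of S >> 8; the next bit of
      // S selects one of the two bins belonging to that power of two.
      static unsigned tree_index_for(size_t S) {
        size_t X = S >> 8;
        unsigned K = 0;
        if (X == 0) return 0;
        if (X > 0xFFFF) return 31;          // largest bin catches the rest
        while ((X >> (K + 1)) != 0) ++K;    // K = floor(log2(X))
        return (K << 1) + (unsigned)((S >> (K + 7)) & 1);
      }

      int main(void) {
        assert(tree_index_for(0x100) == 0); // smallest tree-managed size
        assert(tree_index_for(0x180) == 1); // upper bin of the same power
        assert(tree_index_for(0x200) == 2);
        assert(tree_index_for(0x300) == 3);
        return 0;
      }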
-*/ - -struct malloc_tree_chunk { - /* The first four fields must be compatible with malloc_chunk */ - size_t prev_foot; - size_t head; - struct malloc_tree_chunk* fd; - struct malloc_tree_chunk* bk; - - struct malloc_tree_chunk* child[2]; - struct malloc_tree_chunk* parent; - bindex_t index; -}; - -typedef struct malloc_tree_chunk tchunk; -typedef struct malloc_tree_chunk* tchunkptr; -typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */ - -/* A little helper macro for trees */ -#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1]) - -/* ----------------------------- Segments -------------------------------- */ - -/* - Each malloc space may include non-contiguous segments, held in a - list headed by an embedded malloc_segment record representing the - top-most space. Segments also include flags holding properties of - the space. Large chunks that are directly allocated by mmap are not - included in this list. They are instead independently created and - destroyed without otherwise keeping track of them. - - Segment management mainly comes into play for spaces allocated by - MMAP. Any call to MMAP might or might not return memory that is - adjacent to an existing segment. MORECORE normally contiguously - extends the current space, so this space is almost always adjacent, - which is simpler and faster to deal with. (This is why MORECORE is - used preferentially to MMAP when both are available -- see - sys_alloc.) When allocating using MMAP, we don't use any of the - hinting mechanisms (inconsistently) supported in various - implementations of unix mmap, or distinguish reserving from - committing memory. Instead, we just ask for space, and exploit - contiguity when we get it. It is probably possible to do - better than this on some systems, but no general scheme seems - to be significantly better. - - Management entails a simpler variant of the consolidation scheme - used for chunks to reduce fragmentation -- new adjacent memory is - normally prepended or appended to an existing segment. However, - there are limitations compared to chunk consolidation that mostly - reflect the fact that segment processing is relatively infrequent - (occurring only when getting memory from system) and that we - don't expect to have huge numbers of segments: - - * Segments are not indexed, so traversal requires linear scans. (It - would be possible to index these, but is not worth the extra - overhead and complexity for most programs on most platforms.) - * New segments are only appended to old ones when holding top-most - memory; if they cannot be prepended to others, they are held in - different segments. - - Except for the top-most segment of an mstate, each segment record - is kept at the tail of its segment. Segments are added by pushing - segment records onto the list headed by &mstate.seg for the - containing mstate. - - Segment flags control allocation/merge/deallocation policies: - * If EXTERN_BIT set, then we did not allocate this segment, - and so should not try to deallocate or merge with others. - (This currently holds only for the initial segment passed - into create_mspace_with_base.) - * If USE_MMAP_BIT set, the segment may be merged with - other surrounding mmapped segments and trimmed/de-allocated - using munmap. - * If neither bit is set, then the segment was obtained using - MORECORE so can be merged with surrounding MORECORE'd segments - and deallocated/trimmed using MORECORE with negative arguments. 
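  The linear scans this implies are short; a sketch with a local struct
  mirroring malloc_segment (holding is an invented name; segment_holding
  later in this file is the real version):

      #include <stddef.h>

      struct seg { char *base; size_t size; struct seg *next; };

      // Walk the list and return the segment containing addr, if any.
      static struct seg *holding(struct seg *s, char *addr) {
        for (; s != NULL; s = s->next)
          if (addr >= s->base && addr < s->base + s->size)
            return s;
        return NULL;
      }

      int main(void) {
        static char block[64];
        struct seg s1 = { block, sizeof block, NULL };
        return holding(&s1, block + 10) == &s1 ? 0 : 1;
      }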
-*/
-
-struct malloc_segment {
-  char*        base;             /* base address */
-  size_t       size;             /* allocated size */
-  struct malloc_segment* next;   /* ptr to next segment */
-  flag_t       sflags;           /* mmap and extern flag */
-};
-
-#define is_mmapped_segment(S)  ((S)->sflags & USE_MMAP_BIT)
-#define is_extern_segment(S)   ((S)->sflags & EXTERN_BIT)
-
-typedef struct malloc_segment  msegment;
-typedef struct malloc_segment* msegmentptr;
-
-/* ---------------------------- malloc_state ----------------------------- */
-
-/*
-   A malloc_state holds all of the bookkeeping for a space.
-   The main fields are:
-
-  Top
-    The topmost chunk of the currently active segment. Its size is
-    cached in topsize.  The actual size of topmost space is
-    topsize+TOP_FOOT_SIZE, which includes space reserved for adding
-    fenceposts and segment records if necessary when getting more
-    space from the system.  The size at which to autotrim top is
-    cached from mparams in trim_check, except that it is disabled if
-    an autotrim fails.
-
-  Designated victim (dv)
-    This is the preferred chunk for servicing small requests that
-    don't have exact fits.  It is normally the chunk split off most
-    recently to service another small request.  Its size is cached in
-    dvsize. The link fields of this chunk are not maintained since it
-    is not kept in a bin.
-
-  SmallBins
-    An array of bin headers for free chunks.  These bins hold chunks
-    with sizes less than MIN_LARGE_SIZE bytes. Each bin contains
-    chunks of all the same size, spaced 8 bytes apart.  To simplify
-    use in double-linked lists, each bin header acts as a malloc_chunk
-    pointing to the real first node, if it exists (else pointing to
-    itself).  This avoids special-casing for headers.  But to avoid
-    waste, we allocate only the fd/bk pointers of bins, and then use
-    repositioning tricks to treat these as the fields of a chunk.
-
-  TreeBins
-    Treebins are pointers to the roots of trees holding a range of
-    sizes. There are 2 equally spaced treebins for each power of two
-    from TREEBIN_SHIFT to TREEBIN_SHIFT+16. The last bin holds anything
-    larger.
-
-  Bin maps
-    There is one bit map for small bins ("smallmap") and one for
-    treebins ("treemap").  Each bin sets its bit when non-empty, and
-    clears the bit when empty.  Bit operations are then used to avoid
-    bin-by-bin searching -- nearly all "search" is done without ever
-    looking at bins that won't be selected.  The bit maps
-    conservatively use 32 bits per map word, even on a 64-bit system.
-    For a good description of some of the bit-based techniques used
-    here, see Henry S. Warren Jr's book "Hacker's Delight" (and
-    supplement at http://hackersdelight.org/). Many of these are
-    intended to reduce the branchiness of paths through malloc etc, as
-    well as to reduce the number of memory locations read or written.
-
-  Segments
-    A list of segments headed by an embedded malloc_segment record
-    representing the initial space.
-
-  Address check support
-    The least_addr field is the least address ever obtained from
-    MORECORE or MMAP. Attempted frees and reallocs of any address less
-    than this are trapped (unless INSECURE is defined).
-
-  Magic tag
-    A cross-check field that should always hold same value as mparams.magic.
-
-  Max allowed footprint
-    The maximum allowed bytes to allocate from system (zero means no limit)
-
-  Flags
-    Bits recording whether to use MMAP, locks, or contiguous MORECORE
-
-  Statistics
-    Each space keeps track of current and maximum system memory
-    obtained via MORECORE or MMAP.
- - Trim support - Fields holding the amount of unused topmost memory that should trigger - trimming, and a counter to force periodic scanning to release unused - non-topmost segments. - - Locking - If USE_LOCKS is defined, the "mutex" lock is acquired and released - around every public call using this mspace. - - Extension support - A void* pointer and a size_t field that can be used to help implement - extensions to this malloc. -*/ - -/* Bin types, widths and sizes */ -#define NSMALLBINS (32U) -#define NTREEBINS (32U) -#define SMALLBIN_SHIFT (3U) -#define SMALLBIN_WIDTH (SIZE_T_ONE << SMALLBIN_SHIFT) -#define TREEBIN_SHIFT (8U) -#define MIN_LARGE_SIZE (SIZE_T_ONE << TREEBIN_SHIFT) -#define MAX_SMALL_SIZE (MIN_LARGE_SIZE - SIZE_T_ONE) -#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD) - -struct malloc_state { - binmap_t smallmap; - binmap_t treemap; - size_t dvsize; - size_t topsize; - char* least_addr; - mchunkptr dv; - mchunkptr top; - size_t trim_check; - size_t release_checks; - size_t magic; - mchunkptr smallbins[(NSMALLBINS+1)*2]; - tbinptr treebins[NTREEBINS]; - size_t footprint; - size_t max_footprint; - size_t footprint_limit; /* zero means no limit */ - flag_t mflags; -#if USE_LOCKS - MLOCK_T mutex; /* locate lock among fields that rarely change */ -#endif /* USE_LOCKS */ - msegment seg; - void* extp; /* Unused but available for extensions */ - size_t exts; -}; - -typedef struct malloc_state* mstate; - -/* ------------- Global malloc_state and malloc_params ------------------- */ - -/* - malloc_params holds global properties, including those that can be - dynamically set using mallopt. There is a single instance, mparams, - initialized in init_mparams. Note that the non-zeroness of "magic" - also serves as an initialization flag. 
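  A miniature of that check-then-initialize pattern, with invented names
  and a pthread mutex standing in for the global malloc lock (the real
  init_mparams below also validates the configuration and derives magic
  from /dev/urandom or the clock):

      #include <pthread.h>
      #include <stddef.h>
      #include <time.h>

      static struct { size_t magic; size_t page_size; } params;
      static pthread_mutex_t boot_lock = PTHREAD_MUTEX_INITIALIZER;

      static void ensure_params(void) {
        if (params.magic != 0) return;       // fast path: already set up
        pthread_mutex_lock(&boot_lock);
        if (params.magic == 0) {             // re-check under the lock
          params.page_size = 4096;           // placeholder configuration
          params.magic = (size_t)time(NULL) | (size_t)8;  // nonzero by construction
        }
        pthread_mutex_unlock(&boot_lock);
      }

      int main(void) { ensure_params(); return params.magic == 0; }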
-*/ - -struct malloc_params { - size_t magic; - size_t page_size; - size_t granularity; - size_t mmap_threshold; - size_t trim_threshold; - flag_t default_mflags; -}; - -static struct malloc_params mparams; - -/* Ensure mparams initialized */ -#define ensure_initialization() (void)(mparams.magic != 0 || init_mparams()) - -#if !ONLY_MSPACES - -/* The global malloc_state used for all non-"mspace" calls */ -static struct malloc_state _gm_; -#define gm (&_gm_) -#define is_global(M) ((M) == &_gm_) - -#endif /* !ONLY_MSPACES */ - -#define is_initialized(M) ((M)->top != 0) - -/* -------------------------- system alloc setup ------------------------- */ - -/* Operations on mflags */ - -#define use_lock(M) ((M)->mflags & USE_LOCK_BIT) -#define enable_lock(M) ((M)->mflags |= USE_LOCK_BIT) -#if USE_LOCKS -#define disable_lock(M) ((M)->mflags &= ~USE_LOCK_BIT) -#else -#define disable_lock(M) -#endif - -#define use_mmap(M) ((M)->mflags & USE_MMAP_BIT) -#define enable_mmap(M) ((M)->mflags |= USE_MMAP_BIT) -#if HAVE_MMAP -#define disable_mmap(M) ((M)->mflags &= ~USE_MMAP_BIT) -#else -#define disable_mmap(M) -#endif - -#define use_noncontiguous(M) ((M)->mflags & USE_NONCONTIGUOUS_BIT) -#define disable_contiguous(M) ((M)->mflags |= USE_NONCONTIGUOUS_BIT) - -#define set_lock(M,L)\ - ((M)->mflags = (L)?\ - ((M)->mflags | USE_LOCK_BIT) :\ - ((M)->mflags & ~USE_LOCK_BIT)) - -/* page-align a size */ -#define page_align(S)\ - (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE)) - -/* granularity-align a size */ -#define granularity_align(S)\ - (((S) + (mparams.granularity - SIZE_T_ONE))\ - & ~(mparams.granularity - SIZE_T_ONE)) - - -/* For mmap, use granularity alignment on windows, else page-align */ -#ifdef WIN32 -#define mmap_align(S) granularity_align(S) -#else -#define mmap_align(S) page_align(S) -#endif - -/* For sys_alloc, enough padding to ensure can malloc request on success */ -#define SYS_ALLOC_PADDING (TOP_FOOT_SIZE + MALLOC_ALIGNMENT) - -#define is_page_aligned(S)\ - (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0) -#define is_granularity_aligned(S)\ - (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0) - -/* True if segment S holds address A */ -#define segment_holds(S, A)\ - ((char*)(A) >= S->base && (char*)(A) < S->base + S->size) - -/* Return segment holding given address */ -static msegmentptr segment_holding(mstate m, char* addr) { - msegmentptr sp = &m->seg; - for (;;) { - if (addr >= sp->base && addr < sp->base + sp->size) - return sp; - if ((sp = sp->next) == 0) - return 0; - } - return 0; -} - -/* Return true if segment contains a segment link */ -static int has_segment_link(mstate m, msegmentptr ss) { - msegmentptr sp = &m->seg; - for (;;) { - if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size) - return 1; - if ((sp = sp->next) == 0) - return 0; - } - return 0; -} - -#ifndef MORECORE_CANNOT_TRIM -#define should_trim(M,s) ((s) > (M)->trim_check) -#else /* MORECORE_CANNOT_TRIM */ -#define should_trim(M,s) (0) -#endif /* MORECORE_CANNOT_TRIM */ - -/* - TOP_FOOT_SIZE is padding at the end of a segment, including space - that may be needed to place segment records and fenceposts when new - noncontiguous segments are added. -*/ -#define TOP_FOOT_SIZE\ - (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE) - - -/* ------------------------------- Hooks -------------------------------- */ - -/* - PREACTION should be defined to return 0 on success, and nonzero on - failure. 
If you are not using locking, you can redefine these to do - anything you like. -*/ - -#if USE_LOCKS -#define PREACTION(M) ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0) -#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); } -#else /* USE_LOCKS */ - -#ifndef PREACTION -#define PREACTION(M) (0) -#endif /* PREACTION */ - -#ifndef POSTACTION -#define POSTACTION(M) -#endif /* POSTACTION */ - -#endif /* USE_LOCKS */ - -/* - CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses. - USAGE_ERROR_ACTION is triggered on detected bad frees and - reallocs. The argument p is an address that might have triggered the - fault. It is ignored by the two predefined actions, but might be - useful in custom actions that try to help diagnose errors. -*/ - -#if PROCEED_ON_ERROR - -/* A count of the number of corruption errors causing resets */ -int malloc_corruption_error_count; - -/* default corruption action */ -static void reset_on_error(mstate m); - -#define CORRUPTION_ERROR_ACTION(m) reset_on_error(m) -#define USAGE_ERROR_ACTION(m, p) - -#else /* PROCEED_ON_ERROR */ - -#ifndef CORRUPTION_ERROR_ACTION -#define CORRUPTION_ERROR_ACTION(m) ABORT -#endif /* CORRUPTION_ERROR_ACTION */ - -#ifndef USAGE_ERROR_ACTION -#define USAGE_ERROR_ACTION(m,p) ABORT -#endif /* USAGE_ERROR_ACTION */ - -#endif /* PROCEED_ON_ERROR */ - - -/* -------------------------- Debugging setup ---------------------------- */ - -#if ! DEBUG - -#define check_free_chunk(M,P) -#define check_inuse_chunk(M,P) -#define check_malloced_chunk(M,P,N) -#define check_mmapped_chunk(M,P) -#define check_malloc_state(M) -#define check_top_chunk(M,P) - -#else /* DEBUG */ -#define check_free_chunk(M,P) do_check_free_chunk(M,P) -#define check_inuse_chunk(M,P) do_check_inuse_chunk(M,P) -#define check_top_chunk(M,P) do_check_top_chunk(M,P) -#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N) -#define check_mmapped_chunk(M,P) do_check_mmapped_chunk(M,P) -#define check_malloc_state(M) do_check_malloc_state(M) - -static void do_check_any_chunk(mstate m, mchunkptr p); -static void do_check_top_chunk(mstate m, mchunkptr p); -static void do_check_mmapped_chunk(mstate m, mchunkptr p); -static void do_check_inuse_chunk(mstate m, mchunkptr p); -static void do_check_free_chunk(mstate m, mchunkptr p); -static void do_check_malloced_chunk(mstate m, void* mem, size_t s); -static void do_check_tree(mstate m, tchunkptr t); -static void do_check_treebin(mstate m, bindex_t i); -static void do_check_smallbin(mstate m, bindex_t i); -static void do_check_malloc_state(mstate m); -static int bin_find(mstate m, mchunkptr x); -static size_t traverse_and_check(mstate m); -#endif /* DEBUG */ - -/* ---------------------------- Indexing Bins ---------------------------- */ - -#define is_small(s) (((s) >> SMALLBIN_SHIFT) < NSMALLBINS) -#define small_index(s) (bindex_t)((s) >> SMALLBIN_SHIFT) -#define small_index2size(i) ((i) << SMALLBIN_SHIFT) -#define MIN_SMALL_INDEX (small_index(MIN_CHUNK_SIZE)) - -/* addressing by index. See above about smallbin repositioning */ -#define smallbin_at(M, i) ((sbinptr)((char*)&((M)->smallbins[(i)<<1]))) -#define treebin_at(M,i) (&((M)->treebins[i])) - -/* assign tree index for size S to variable I. 
Use x86 asm if possible */ -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) -#define compute_tree_index(S, I)\ -{\ - unsigned int X = S >> TREEBIN_SHIFT;\ - if (X == 0)\ - I = 0;\ - else if (X > 0xFFFF)\ - I = NTREEBINS-1;\ - else {\ - unsigned int K = (unsigned) sizeof(X)*__CHAR_BIT__ - 1 - (unsigned) __builtin_clz(X); \ - I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ - }\ -} - -#elif defined (__INTEL_COMPILER) -#define compute_tree_index(S, I)\ -{\ - size_t X = S >> TREEBIN_SHIFT;\ - if (X == 0)\ - I = 0;\ - else if (X > 0xFFFF)\ - I = NTREEBINS-1;\ - else {\ - unsigned int K = _bit_scan_reverse (X); \ - I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ - }\ -} - -#elif defined(_MSC_VER) && _MSC_VER>=1300 -#define compute_tree_index(S, I)\ -{\ - size_t X = S >> TREEBIN_SHIFT;\ - if (X == 0)\ - I = 0;\ - else if (X > 0xFFFF)\ - I = NTREEBINS-1;\ - else {\ - unsigned int K;\ - _BitScanReverse((DWORD *) &K, (DWORD) X);\ - I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ - }\ -} - -#else /* GNUC */ -#define compute_tree_index(S, I)\ -{\ - size_t X = S >> TREEBIN_SHIFT;\ - if (X == 0)\ - I = 0;\ - else if (X > 0xFFFF)\ - I = NTREEBINS-1;\ - else {\ - unsigned int Y = (unsigned int)X;\ - unsigned int N = ((Y - 0x100) >> 16) & 8;\ - unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\ - N += K;\ - N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\ - K = 14 - N + ((Y <<= K) >> 15);\ - I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\ - }\ -} -#endif /* GNUC */ - -/* Bit representing maximum resolved size in a treebin at i */ -#define bit_for_tree_index(i) \ - (i == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2) - -/* Shift placing maximum resolved bit in a treebin at i as sign bit */ -#define leftshift_for_tree_index(i) \ - ((i == NTREEBINS-1)? 0 : \ - ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2))) - -/* The size of the smallest chunk held in bin with index i */ -#define minsize_for_tree_index(i) \ - ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) | \ - (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1))) - - -/* ------------------------ Operations on bin maps ----------------------- */ - -/* bit corresponding to given index */ -#define idx2bit(i) ((binmap_t)(1) << (i)) - -/* Mark/Clear bits with given index */ -#define mark_smallmap(M,i) ((M)->smallmap |= idx2bit(i)) -#define clear_smallmap(M,i) ((M)->smallmap &= ~idx2bit(i)) -#define smallmap_is_marked(M,i) ((M)->smallmap & idx2bit(i)) - -#define mark_treemap(M,i) ((M)->treemap |= idx2bit(i)) -#define clear_treemap(M,i) ((M)->treemap &= ~idx2bit(i)) -#define treemap_is_marked(M,i) ((M)->treemap & idx2bit(i)) - -/* isolate the least set bit of a bitmap */ -#define least_bit(x) ((x) & -(x)) - -/* mask with all bits to left of least bit of x on */ -#define left_bits(x) ((x<<1) | -(x<<1)) - -/* mask with all bits to left of or equal to least bit of x on */ -#define same_or_left_bits(x) ((x) | -(x)) - -/* index corresponding to given bit. 
Use x86 asm if possible */
-
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  J = __builtin_ctz(X); \
-  I = (bindex_t)J;\
-}
-
-#elif defined (__INTEL_COMPILER)
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  J = _bit_scan_forward (X); \
-  I = (bindex_t)J;\
-}
-
-#elif defined(_MSC_VER) && _MSC_VER>=1300
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int J;\
-  _BitScanForward((DWORD *) &J, X);\
-  I = (bindex_t)J;\
-}
-
-#elif USE_BUILTIN_FFS
-#define compute_bit2idx(X, I) I = ffs(X)-1
-
-#else
-#define compute_bit2idx(X, I)\
-{\
-  unsigned int Y = X - 1;\
-  unsigned int K = Y >> (16-4) & 16;\
-  unsigned int N = K;        Y >>= K;\
-  N += K = Y >> (8-3) &  8;  Y >>= K;\
-  N += K = Y >> (4-2) &  4;  Y >>= K;\
-  N += K = Y >> (2-1) &  2;  Y >>= K;\
-  N += K = Y >> (1-0) &  1;  Y >>= K;\
-  I = (bindex_t)(N + Y);\
-}
-#endif /* GNUC */
-
-
-/* ----------------------- Runtime Check Support ------------------------- */
-
-/*
-  For security, the main invariant is that malloc/free/etc never
-  writes to a static address other than malloc_state, unless static
-  malloc_state itself has been corrupted, which cannot occur via
-  malloc (because of these checks). In essence this means that we
-  believe all pointers, sizes, maps etc held in malloc_state, but
-  check all of those linked or offsetted from other embedded data
-  structures.  These checks are interspersed with main code in a way
-  that tends to minimize their run-time cost.
-
-  When FOOTERS is defined, in addition to range checking, we also
-  verify footer fields of inuse chunks, which can be used to guarantee
-  that the mstate controlling malloc/free is intact.  This is a
-  streamlined version of the approach described by William Robertson
-  et al in "Run-time Detection of Heap-based Overflows" LISA'03
-  http://www.usenix.org/events/lisa03/tech/robertson.html The footer
-  of an inuse chunk holds the xor of its mstate and a random seed,
-  which is checked upon calls to free() and realloc().  This is
-  (probabilistically) unguessable from outside the program, but can be
-  computed by any code successfully malloc'ing any chunk, so does not
-  itself provide protection against code that has already broken
-  security through some other means.  Unlike Robertson et al, we
-  always dynamically check addresses of all offset chunks (previous,
-  next, etc). This turns out to be cheaper than relying on hashes.
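  The xor trick itself is a two-line round trip; a sketch, with a fixed
  stand-in for the random mparams.magic (mark_inuse_foot and
  get_mstate_for below are the real macros):

      #include <assert.h>
      #include <stddef.h>

      int main(void) {
        size_t magic = (size_t)0x5bd1e995;     // stand-in random seed
        int state;                             // stand-in for the mstate
        size_t foot = (size_t)&state ^ magic;  // what the footer stores
        assert((int *)(foot ^ magic) == &state);  // recovering the mstate
        return 0;
      }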
-*/ - -#if !INSECURE -/* Check if address a is at least as high as any from MORECORE or MMAP */ -#define ok_address(M, a) ((char*)(a) >= (M)->least_addr) -/* Check if address of next chunk n is higher than base chunk p */ -#define ok_next(p, n) ((char*)(p) < (char*)(n)) -/* Check if p has inuse status */ -#define ok_inuse(p) is_inuse(p) -/* Check if p has its pinuse bit on */ -#define ok_pinuse(p) pinuse(p) - -#else /* !INSECURE */ -#define ok_address(M, a) (1) -#define ok_next(b, n) (1) -#define ok_inuse(p) (1) -#define ok_pinuse(p) (1) -#endif /* !INSECURE */ - -#if (FOOTERS && !INSECURE) -/* Check if (alleged) mstate m has expected magic field */ -#define ok_magic(M) ((M)->magic == mparams.magic) -#else /* (FOOTERS && !INSECURE) */ -#define ok_magic(M) (1) -#endif /* (FOOTERS && !INSECURE) */ - -/* In gcc, use __builtin_expect to minimize impact of checks */ -#if !INSECURE -#if defined(__GNUC__) && __GNUC__ >= 3 -#define RTCHECK(e) __builtin_expect(e, 1) -#else /* GNUC */ -#define RTCHECK(e) (e) -#endif /* GNUC */ -#else /* !INSECURE */ -#define RTCHECK(e) (1) -#endif /* !INSECURE */ - -/* macros to set up inuse chunks with or without footers */ - -#if !FOOTERS - -#define mark_inuse_foot(M,p,s) - -/* Macros for setting head/foot of non-mmapped chunks */ - -/* Set cinuse bit and pinuse bit of next chunk */ -#define set_inuse(M,p,s)\ - ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ - ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) - -/* Set cinuse and pinuse of this chunk and pinuse of next chunk */ -#define set_inuse_and_pinuse(M,p,s)\ - ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ - ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) - -/* Set size, cinuse and pinuse bit of this chunk */ -#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\ - ((p)->head = (s|PINUSE_BIT|CINUSE_BIT)) - -#else /* FOOTERS */ - -/* Set foot of inuse chunk to be xor of mstate and seed */ -#define mark_inuse_foot(M,p,s)\ - (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic)) - -#define get_mstate_for(p)\ - ((mstate)(((mchunkptr)((char*)(p) +\ - (chunksize(p))))->prev_foot ^ mparams.magic)) - -#define set_inuse(M,p,s)\ - ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ - (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \ - mark_inuse_foot(M,p,s)) - -#define set_inuse_and_pinuse(M,p,s)\ - ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ - (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\ - mark_inuse_foot(M,p,s)) - -#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\ - ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ - mark_inuse_foot(M, p, s)) - -#endif /* !FOOTERS */ - -/* ---------------------------- setting mparams -------------------------- */ - -#if LOCK_AT_FORK -static void pre_fork(void) { ACQUIRE_LOCK(&(gm)->mutex); } -static void post_fork_parent(void) { RELEASE_LOCK(&(gm)->mutex); } -static void post_fork_child(void) { INITIAL_LOCK(&(gm)->mutex); } -#endif /* LOCK_AT_FORK */ - -/* Initialize mparams */ -static int init_mparams(void) { -#ifdef NEED_GLOBAL_LOCK_INIT - if (malloc_global_mutex_status <= 0) - init_malloc_global_mutex(); -#endif - - ACQUIRE_MALLOC_GLOBAL_LOCK(); - if (mparams.magic == 0) { - size_t magic; - size_t psize; - size_t gsize; - -#ifndef WIN32 - psize = malloc_getpagesize; - gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize); -#else /* WIN32 */ - { - SYSTEM_INFO system_info; - GetSystemInfo(&system_info); - psize = system_info.dwPageSize; - gsize = ((DEFAULT_GRANULARITY != 0)? 
- DEFAULT_GRANULARITY : system_info.dwAllocationGranularity); - } -#endif /* WIN32 */ - - /* Sanity-check configuration: - size_t must be unsigned and as wide as pointer type. - ints must be at least 4 bytes. - alignment must be at least 8. - Alignment, min chunk size, and page size must all be powers of 2. - */ - if ((sizeof(size_t) != sizeof(char*)) || - (MAX_SIZE_T < MIN_CHUNK_SIZE) || - (sizeof(int) < 4) || - (MALLOC_ALIGNMENT < (size_t)8U) || - ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) || - ((MCHUNK_SIZE & (MCHUNK_SIZE-SIZE_T_ONE)) != 0) || - ((gsize & (gsize-SIZE_T_ONE)) != 0) || - ((psize & (psize-SIZE_T_ONE)) != 0)) - ABORT; - mparams.granularity = gsize; - mparams.page_size = psize; - mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD; - mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD; -#if MORECORE_CONTIGUOUS - mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT; -#else /* MORECORE_CONTIGUOUS */ - mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT; -#endif /* MORECORE_CONTIGUOUS */ - -#if !ONLY_MSPACES - /* Set up lock for main malloc area */ - gm->mflags = mparams.default_mflags; - (void)INITIAL_LOCK(&gm->mutex); -#endif -#if LOCK_AT_FORK - pthread_atfork(&pre_fork, &post_fork_parent, &post_fork_child); -#endif - - { -#if USE_DEV_RANDOM - int fd; - unsigned char buf[sizeof(size_t)]; - /* Try to use /dev/urandom, else fall back on using time */ - if ((fd = open("/dev/urandom", O_RDONLY)) >= 0 && - read(fd, buf, sizeof(buf)) == sizeof(buf)) { - magic = *((size_t *) buf); - close(fd); - } - else -#endif /* USE_DEV_RANDOM */ -#ifdef WIN32 - magic = (size_t)(GetTickCount() ^ (size_t)0x55555555U); -#elif defined(LACKS_TIME_H) - magic = (size_t)&magic ^ (size_t)0x55555555U; -#else - magic = (size_t)(time(0) ^ (size_t)0x55555555U); -#endif - magic |= (size_t)8U; /* ensure nonzero */ - magic &= ~(size_t)7U; /* improve chances of fault for bad values */ - /* Until memory modes commonly available, use volatile-write */ - (*(volatile size_t *)(&(mparams.magic))) = magic; - } - } - - RELEASE_MALLOC_GLOBAL_LOCK(); - return 1; -} - -/* support for mallopt */ -static int change_mparam(int param_number, int value) { - size_t val; - ensure_initialization(); - val = (value == -1)? MAX_SIZE_T : (size_t)value; - switch(param_number) { - case M_TRIM_THRESHOLD: - mparams.trim_threshold = val; - return 1; - case M_GRANULARITY: - if (val >= mparams.page_size && ((val & (val-1)) == 0)) { - mparams.granularity = val; - return 1; - } - else - return 0; - case M_MMAP_THRESHOLD: - mparams.mmap_threshold = val; - return 1; - default: - return 0; - } -} - -#if DEBUG -/* ------------------------- Debugging Support --------------------------- */ - -/* Check properties of any chunk, whether free, inuse, mmapped etc */ -static void do_check_any_chunk(mstate m, mchunkptr p) { - assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); - assert(ok_address(m, p)); -} - -/* Check properties of top chunk */ -static void do_check_top_chunk(mstate m, mchunkptr p) { - msegmentptr sp = segment_holding(m, (char*)p); - size_t sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! 
*/ - assert(sp != 0); - assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); - assert(ok_address(m, p)); - assert(sz == m->topsize); - assert(sz > 0); - assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE); - assert(pinuse(p)); - assert(!pinuse(chunk_plus_offset(p, sz))); -} - -/* Check properties of (inuse) mmapped chunks */ -static void do_check_mmapped_chunk(mstate m, mchunkptr p) { - size_t sz = chunksize(p); - size_t len = (sz + (p->prev_foot) + MMAP_FOOT_PAD); - assert(is_mmapped(p)); - assert(use_mmap(m)); - assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); - assert(ok_address(m, p)); - assert(!is_small(sz)); - assert((len & (mparams.page_size-SIZE_T_ONE)) == 0); - assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD); - assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0); -} - -/* Check properties of inuse chunks */ -static void do_check_inuse_chunk(mstate m, mchunkptr p) { - do_check_any_chunk(m, p); - assert(is_inuse(p)); - assert(next_pinuse(p)); - /* If not pinuse and not mmapped, previous chunk has OK offset */ - assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p); - if (is_mmapped(p)) - do_check_mmapped_chunk(m, p); -} - -/* Check properties of free chunks */ -static void do_check_free_chunk(mstate m, mchunkptr p) { - size_t sz = chunksize(p); - mchunkptr next = chunk_plus_offset(p, sz); - do_check_any_chunk(m, p); - assert(!is_inuse(p)); - assert(!next_pinuse(p)); - assert (!is_mmapped(p)); - if (p != m->dv && p != m->top) { - if (sz >= MIN_CHUNK_SIZE) { - assert((sz & CHUNK_ALIGN_MASK) == 0); - assert(is_aligned(chunk2mem(p))); - assert(next->prev_foot == sz); - assert(pinuse(p)); - assert (next == m->top || is_inuse(next)); - assert(p->fd->bk == p); - assert(p->bk->fd == p); - } - else /* markers are always of size SIZE_T_SIZE */ - assert(sz == SIZE_T_SIZE); - } -} - -/* Check properties of malloced chunks at the point they are malloced */ -static void do_check_malloced_chunk(mstate m, void* mem, size_t s) { - if (mem != 0) { - mchunkptr p = mem2chunk(mem); - size_t sz = p->head & ~INUSE_BITS; - do_check_inuse_chunk(m, p); - assert((sz & CHUNK_ALIGN_MASK) == 0); - assert(sz >= MIN_CHUNK_SIZE); - assert(sz >= s); - /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */ - assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE)); - } -} - -/* Check a tree and its subtrees. 
*/ -static void do_check_tree(mstate m, tchunkptr t) { - tchunkptr head = 0; - tchunkptr u = t; - bindex_t tindex = t->index; - size_t tsize = chunksize(t); - bindex_t idx; - compute_tree_index(tsize, idx); - assert(tindex == idx); - assert(tsize >= MIN_LARGE_SIZE); - assert(tsize >= minsize_for_tree_index(idx)); - assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1)))); - - do { /* traverse through chain of same-sized nodes */ - do_check_any_chunk(m, ((mchunkptr)u)); - assert(u->index == tindex); - assert(chunksize(u) == tsize); - assert(!is_inuse(u)); - assert(!next_pinuse(u)); - assert(u->fd->bk == u); - assert(u->bk->fd == u); - if (u->parent == 0) { - assert(u->child[0] == 0); - assert(u->child[1] == 0); - } - else { - assert(head == 0); /* only one node on chain has parent */ - head = u; - assert(u->parent != u); - assert (u->parent->child[0] == u || - u->parent->child[1] == u || - *((tbinptr*)(u->parent)) == u); - if (u->child[0] != 0) { - assert(u->child[0]->parent == u); - assert(u->child[0] != u); - do_check_tree(m, u->child[0]); - } - if (u->child[1] != 0) { - assert(u->child[1]->parent == u); - assert(u->child[1] != u); - do_check_tree(m, u->child[1]); - } - if (u->child[0] != 0 && u->child[1] != 0) { - assert(chunksize(u->child[0]) < chunksize(u->child[1])); - } - } - u = u->fd; - } while (u != t); - assert(head != 0); -} - -/* Check all the chunks in a treebin. */ -static void do_check_treebin(mstate m, bindex_t i) { - tbinptr* tb = treebin_at(m, i); - tchunkptr t = *tb; - int empty = (m->treemap & (1U << i)) == 0; - if (t == 0) - assert(empty); - if (!empty) - do_check_tree(m, t); -} - -/* Check all the chunks in a smallbin. */ -static void do_check_smallbin(mstate m, bindex_t i) { - sbinptr b = smallbin_at(m, i); - mchunkptr p = b->bk; - unsigned int empty = (m->smallmap & (1U << i)) == 0; - if (p == b) - assert(empty); - if (!empty) { - for (; p != b; p = p->bk) { - size_t size = chunksize(p); - mchunkptr q; - /* each chunk claims to be free */ - do_check_free_chunk(m, p); - /* chunk belongs in bin */ - assert(small_index(size) == i); - assert(p->bk == b || chunksize(p->bk) == chunksize(p)); - /* chunk is followed by an inuse chunk */ - q = next_chunk(p); - if (q->head != FENCEPOST_HEAD) - do_check_inuse_chunk(m, q); - } - } -} - -/* Find x in a bin. Used in other check functions. 
*/ -static int bin_find(mstate m, mchunkptr x) { - size_t size = chunksize(x); - if (is_small(size)) { - bindex_t sidx = small_index(size); - sbinptr b = smallbin_at(m, sidx); - if (smallmap_is_marked(m, sidx)) { - mchunkptr p = b; - do { - if (p == x) - return 1; - } while ((p = p->fd) != b); - } - } - else { - bindex_t tidx; - compute_tree_index(size, tidx); - if (treemap_is_marked(m, tidx)) { - tchunkptr t = *treebin_at(m, tidx); - size_t sizebits = size << leftshift_for_tree_index(tidx); - while (t != 0 && chunksize(t) != size) { - t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]; - sizebits <<= 1; - } - if (t != 0) { - tchunkptr u = t; - do { - if (u == (tchunkptr)x) - return 1; - } while ((u = u->fd) != t); - } - } - } - return 0; -} - -/* Traverse each chunk and check it; return total */ -static size_t traverse_and_check(mstate m) { - size_t sum = 0; - if (is_initialized(m)) { - msegmentptr s = &m->seg; - sum += m->topsize + TOP_FOOT_SIZE; - while (s != 0) { - mchunkptr q = align_as_chunk(s->base); - mchunkptr lastq = 0; - assert(pinuse(q)); - while (segment_holds(s, q) && - q != m->top && q->head != FENCEPOST_HEAD) { - sum += chunksize(q); - if (is_inuse(q)) { - assert(!bin_find(m, q)); - do_check_inuse_chunk(m, q); - } - else { - assert(q == m->dv || bin_find(m, q)); - assert(lastq == 0 || is_inuse(lastq)); /* Not 2 consecutive free */ - do_check_free_chunk(m, q); - } - lastq = q; - q = next_chunk(q); - } - s = s->next; - } - } - return sum; -} - - -/* Check all properties of malloc_state. */ -static void do_check_malloc_state(mstate m) { - bindex_t i; - size_t total; - /* check bins */ - for (i = 0; i < NSMALLBINS; ++i) - do_check_smallbin(m, i); - for (i = 0; i < NTREEBINS; ++i) - do_check_treebin(m, i); - - if (m->dvsize != 0) { /* check dv chunk */ - do_check_any_chunk(m, m->dv); - assert(m->dvsize == chunksize(m->dv)); - assert(m->dvsize >= MIN_CHUNK_SIZE); - assert(bin_find(m, m->dv) == 0); - } - - if (m->top != 0) { /* check top chunk */ - do_check_top_chunk(m, m->top); - /*assert(m->topsize == chunksize(m->top)); redundant */ - assert(m->topsize > 0); - assert(bin_find(m, m->top) == 0); - } - - total = traverse_and_check(m); - assert(total <= m->footprint); - assert(m->footprint <= m->max_footprint); -} -#endif /* DEBUG */ - -/* ----------------------------- statistics ------------------------------ */ - -#if !NO_MALLINFO -static struct mallinfo internal_mallinfo(mstate m) { - struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - ensure_initialization(); - if (!PREACTION(m)) { - check_malloc_state(m); - if (is_initialized(m)) { - size_t nfree = SIZE_T_ONE; /* top always free */ - size_t mfree = m->topsize + TOP_FOOT_SIZE; - size_t sum = mfree; - msegmentptr s = &m->seg; - while (s != 0) { - mchunkptr q = align_as_chunk(s->base); - while (segment_holds(s, q) && - q != m->top && q->head != FENCEPOST_HEAD) { - size_t sz = chunksize(q); - sum += sz; - if (!is_inuse(q)) { - mfree += sz; - ++nfree; - } - q = next_chunk(q); - } - s = s->next; - } - - nm.arena = sum; - nm.ordblks = nfree; - nm.hblkhd = m->footprint - sum; - nm.usmblks = m->max_footprint; - nm.uordblks = m->footprint - mfree; - nm.fordblks = mfree; - nm.keepcost = m->topsize; - } - - POSTACTION(m); - } - return nm; -} -#endif /* !NO_MALLINFO */ - -#if !NO_MALLOC_STATS -static void internal_malloc_stats(mstate m) { - ensure_initialization(); - if (!PREACTION(m)) { - size_t maxfp = 0; - size_t fp = 0; - size_t used = 0; - check_malloc_state(m); - if (is_initialized(m)) { - msegmentptr s = &m->seg; - 
maxfp = m->max_footprint; - fp = m->footprint; - used = fp - (m->topsize + TOP_FOOT_SIZE); - - while (s != 0) { - mchunkptr q = align_as_chunk(s->base); - while (segment_holds(s, q) && - q != m->top && q->head != FENCEPOST_HEAD) { - if (!is_inuse(q)) - used -= chunksize(q); - q = next_chunk(q); - } - s = s->next; - } - } - POSTACTION(m); /* drop lock */ - fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp)); - fprintf(stderr, "system bytes = %10lu\n", (unsigned long)(fp)); - fprintf(stderr, "in use bytes = %10lu\n", (unsigned long)(used)); - } -} -#endif /* NO_MALLOC_STATS */ - -/* ----------------------- Operations on smallbins ----------------------- */ - -/* - Various forms of linking and unlinking are defined as macros. Even - the ones for trees, which are very long but have very short typical - paths. This is ugly but reduces reliance on inlining support of - compilers. -*/ - -/* Link a free chunk into a smallbin */ -#define insert_small_chunk(M, P, S) {\ - bindex_t I = small_index(S);\ - mchunkptr B = smallbin_at(M, I);\ - mchunkptr F = B;\ - assert(S >= MIN_CHUNK_SIZE);\ - if (!smallmap_is_marked(M, I))\ - mark_smallmap(M, I);\ - else if (RTCHECK(ok_address(M, B->fd)))\ - F = B->fd;\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - }\ - B->fd = P;\ - F->bk = P;\ - P->fd = F;\ - P->bk = B;\ -} - -/* Unlink a chunk from a smallbin */ -#define unlink_small_chunk(M, P, S) {\ - mchunkptr F = P->fd;\ - mchunkptr B = P->bk;\ - bindex_t I = small_index(S);\ - assert(P != B);\ - assert(P != F);\ - assert(chunksize(P) == small_index2size(I));\ - if (RTCHECK(F == smallbin_at(M,I) || (ok_address(M, F) && F->bk == P))) { \ - if (B == F) {\ - clear_smallmap(M, I);\ - }\ - else if (RTCHECK(B == smallbin_at(M,I) ||\ - (ok_address(M, B) && B->fd == P))) {\ - F->bk = B;\ - B->fd = F;\ - }\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - }\ - }\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - }\ -} - -/* Unlink the first chunk from a smallbin */ -#define unlink_first_small_chunk(M, B, P, I) {\ - mchunkptr F = P->fd;\ - assert(P != B);\ - assert(P != F);\ - assert(chunksize(P) == small_index2size(I));\ - if (B == F) {\ - clear_smallmap(M, I);\ - }\ - else if (RTCHECK(ok_address(M, F) && F->bk == P)) {\ - F->bk = B;\ - B->fd = F;\ - }\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - }\ -} - -/* Replace dv node, binning the old one */ -/* Used only when dvsize known to be small */ -#define replace_dv(M, P, S) {\ - size_t DVS = M->dvsize;\ - assert(is_small(DVS));\ - if (DVS != 0) {\ - mchunkptr DV = M->dv;\ - insert_small_chunk(M, DV, DVS);\ - }\ - M->dvsize = S;\ - M->dv = P;\ -} - -/* ------------------------- Operations on trees ------------------------- */ - -/* Insert chunk into tree */ -#define insert_large_chunk(M, X, S) {\ - tbinptr* H;\ - bindex_t I;\ - compute_tree_index(S, I);\ - H = treebin_at(M, I);\ - X->index = I;\ - X->child[0] = X->child[1] = 0;\ - if (!treemap_is_marked(M, I)) {\ - mark_treemap(M, I);\ - *H = X;\ - X->parent = (tchunkptr)H;\ - X->fd = X->bk = X;\ - }\ - else {\ - tchunkptr T = *H;\ - size_t K = S << leftshift_for_tree_index(I);\ - for (;;) {\ - if (chunksize(T) != S) {\ - tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\ - K <<= 1;\ - if (*C != 0)\ - T = *C;\ - else if (RTCHECK(ok_address(M, C))) {\ - *C = X;\ - X->parent = T;\ - X->fd = X->bk = X;\ - break;\ - }\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - break;\ - }\ - }\ - else {\ - tchunkptr F = T->fd;\ - if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\ - T->fd = F->bk = X;\ - X->fd = F;\ - X->bk 
= T;\ - X->parent = 0;\ - break;\ - }\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - break;\ - }\ - }\ - }\ - }\ -} - -/* - Unlink steps: - - 1. If x is a chained node, unlink it from its same-sized fd/bk links - and choose its bk node as its replacement. - 2. If x was the last node of its size, but not a leaf node, it must - be replaced with a leaf node (not merely one with an open left or - right), to make sure that lefts and rights of descendents - correspond properly to bit masks. We use the rightmost descendent - of x. We could use any other leaf, but this is easy to locate and - tends to counteract removal of leftmosts elsewhere, and so keeps - paths shorter than minimally guaranteed. This doesn't loop much - because on average a node in a tree is near the bottom. - 3. If x is the base of a chain (i.e., has parent links) relink - x's parent and children to x's replacement (or null if none). -*/ - -#define unlink_large_chunk(M, X) {\ - tchunkptr XP = X->parent;\ - tchunkptr R;\ - if (X->bk != X) {\ - tchunkptr F = X->fd;\ - R = X->bk;\ - if (RTCHECK(ok_address(M, F) && F->bk == X && R->fd == X)) {\ - F->bk = R;\ - R->fd = F;\ - }\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - }\ - }\ - else {\ - tchunkptr* RP;\ - if (((R = *(RP = &(X->child[1]))) != 0) ||\ - ((R = *(RP = &(X->child[0]))) != 0)) {\ - tchunkptr* CP;\ - while ((*(CP = &(R->child[1])) != 0) ||\ - (*(CP = &(R->child[0])) != 0)) {\ - R = *(RP = CP);\ - }\ - if (RTCHECK(ok_address(M, RP)))\ - *RP = 0;\ - else {\ - CORRUPTION_ERROR_ACTION(M);\ - }\ - }\ - }\ - if (XP != 0) {\ - tbinptr* H = treebin_at(M, X->index);\ - if (X == *H) {\ - if ((*H = R) == 0) \ - clear_treemap(M, X->index);\ - }\ - else if (RTCHECK(ok_address(M, XP))) {\ - if (XP->child[0] == X) \ - XP->child[0] = R;\ - else \ - XP->child[1] = R;\ - }\ - else\ - CORRUPTION_ERROR_ACTION(M);\ - if (R != 0) {\ - if (RTCHECK(ok_address(M, R))) {\ - tchunkptr C0, C1;\ - R->parent = XP;\ - if ((C0 = X->child[0]) != 0) {\ - if (RTCHECK(ok_address(M, C0))) {\ - R->child[0] = C0;\ - C0->parent = R;\ - }\ - else\ - CORRUPTION_ERROR_ACTION(M);\ - }\ - if ((C1 = X->child[1]) != 0) {\ - if (RTCHECK(ok_address(M, C1))) {\ - R->child[1] = C1;\ - C1->parent = R;\ - }\ - else\ - CORRUPTION_ERROR_ACTION(M);\ - }\ - }\ - else\ - CORRUPTION_ERROR_ACTION(M);\ - }\ - }\ -} - -/* Relays to large vs small bin operations */ - -#define insert_chunk(M, P, S)\ - if (is_small(S)) insert_small_chunk(M, P, S)\ - else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); } - -#define unlink_chunk(M, P, S)\ - if (is_small(S)) unlink_small_chunk(M, P, S)\ - else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); } - - -/* Relays to internal calls to malloc/free from realloc, memalign etc */ - -#if ONLY_MSPACES -#define internal_malloc(m, b) mspace_malloc(m, b) -#define internal_free(m, mem) mspace_free(m,mem); -#else /* ONLY_MSPACES */ -#if MSPACES -#define internal_malloc(m, b)\ - ((m == gm)? dlmalloc(b) : mspace_malloc(m, b)) -#define internal_free(m, mem)\ - if (m == gm) dlfree(mem); else mspace_free(m,mem); -#else /* MSPACES */ -#define internal_malloc(m, b) dlmalloc(b) -#define internal_free(m, mem) dlfree(mem) -#endif /* MSPACES */ -#endif /* ONLY_MSPACES */ - -/* ----------------------- Direct-mmapping chunks ----------------------- */ - -/* - Directly mmapped chunks are set up with an offset to the start of - the mmapped region stored in the prev_foot field of the chunk. 
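A self-contained sketch of this prev_foot offset trick may help before the comment continues; it assumes POSIX mmap with MAP_ANONYMOUS and a hypothetical fixed 16-byte alignment, and it omits the fenceposts and flag bits real chunks carry:

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

#define ALIGN ((uintptr_t)16)

/* Map extra space, park the payload on an ALIGN boundary, and stash the
   mapping length plus the offset back to the raw base just below it. */
static void *aligned_map(size_t len) {
  size_t total = len + ALIGN + 2 * sizeof(size_t);
  char *base = mmap(0, total, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (base == MAP_FAILED)
    return 0;
  uintptr_t raw = (uintptr_t)base + 2 * sizeof(size_t);
  uintptr_t mem = (raw + (ALIGN - 1)) & ~(ALIGN - 1);
  ((size_t *)mem)[-1] = total;                 /* length for munmap */
  ((size_t *)mem)[-2] = mem - (uintptr_t)base; /* offset to raw base */
  return (void *)mem;
}

/* Rebuild the exact munmap(base, length) pair from the stashed fields,
   the way free handles chunks flagged as directly mmapped. */
static void aligned_unmap(void *mem) {
  size_t total = ((size_t *)mem)[-1];
  size_t offset = ((size_t *)mem)[-2];
  munmap((char *)mem - offset, total);
}

int main(void) {
  void *p = aligned_map(1000);
  if (p)
    aligned_unmap(p);
  return 0;
}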
This - allows reconstruction of the required argument to MUNMAP when freed, - and also allows adjustment of the returned chunk to meet alignment - requirements (especially in memalign). -*/ - -/* Malloc using mmap */ -static void* mmap_alloc(mstate m, size_t nb) { - size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); - if (m->footprint_limit != 0) { - size_t fp = m->footprint + mmsize; - if (fp <= m->footprint || fp > m->footprint_limit) - return 0; - } - if (mmsize > nb) { /* Check for wrap around 0 */ - char* mm = (char*)(CALL_DIRECT_MMAP(mmsize)); - if (mm != CMFAIL) { - size_t offset = align_offset(chunk2mem(mm)); - size_t psize = mmsize - offset - MMAP_FOOT_PAD; - mchunkptr p = (mchunkptr)(mm + offset); - p->prev_foot = offset; - p->head = psize; - mark_inuse_foot(m, p, psize); - chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD; - chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0; - - if (m->least_addr == 0 || mm < m->least_addr) - m->least_addr = mm; - if ((m->footprint += mmsize) > m->max_footprint) - m->max_footprint = m->footprint; - assert(is_aligned(chunk2mem(p))); - check_mmapped_chunk(m, p); - return chunk2mem(p); - } - } - return 0; -} - -/* Realloc using mmap */ -static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb, int flags) { - size_t oldsize = chunksize(oldp); - (void)flags; /* placate people compiling -Wunused */ - if (is_small(nb)) /* Can't shrink mmap regions below small size */ - return 0; - /* Keep old chunk if big enough but not too big */ - if (oldsize >= nb + SIZE_T_SIZE && - (oldsize - nb) <= (mparams.granularity << 1)) - return oldp; - else { - size_t offset = oldp->prev_foot; - size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD; - size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); - char* cp = (char*)CALL_MREMAP((char*)oldp - offset, - oldmmsize, newmmsize, flags); - if (cp != CMFAIL) { - mchunkptr newp = (mchunkptr)(cp + offset); - size_t psize = newmmsize - offset - MMAP_FOOT_PAD; - newp->head = psize; - mark_inuse_foot(m, newp, psize); - chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD; - chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0; - - if (cp < m->least_addr) - m->least_addr = cp; - if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint) - m->max_footprint = m->footprint; - check_mmapped_chunk(m, newp); - return newp; - } - } - return 0; -} - - -/* -------------------------- mspace management -------------------------- */ - -/* Initialize top chunk and its size */ -static void init_top(mstate m, mchunkptr p, size_t psize) { - /* Ensure alignment */ - size_t offset = align_offset(chunk2mem(p)); - p = (mchunkptr)((char*)p + offset); - psize -= offset; - - m->top = p; - m->topsize = psize; - p->head = psize | PINUSE_BIT; - /* set size of fake trailing chunk holding overhead space only once */ - chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE; - m->trim_check = mparams.trim_threshold; /* reset on each update */ -} - -/* Initialize bins for a new mstate that is otherwise zeroed out */ -static void init_bins(mstate m) { - /* Establish circular links for smallbins */ - bindex_t i; - for (i = 0; i < NSMALLBINS; ++i) { - sbinptr bin = smallbin_at(m,i); - bin->fd = bin->bk = bin; - } -} - -#if PROCEED_ON_ERROR - -/* default corruption action */ -static void reset_on_error(mstate m) { - int i; - ++malloc_corruption_error_count; - /* Reinitialize fields to forget about all memory */ - m->smallmap = m->treemap = 0; - m->dvsize = m->topsize = 0; - m->seg.base = 0; - m->seg.size = 0; - 
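The circular smallbin setup in init_bins above deserves a tiny standalone illustration; the stripped-down node type is hypothetical. Because each empty bin header points at itself, emptiness is a single pointer comparison and insertion never needs null checks:

#include <assert.h>

struct bin { struct bin *fd, *bk; };

int main(void) {
  struct bin bins[32];
  for (int i = 0; i < 32; ++i)
    bins[i].fd = bins[i].bk = &bins[i];  /* self-linked means empty */
  assert(bins[7].fd == &bins[7]);        /* bin 7 holds no chunks */
  return 0;
}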
m->seg.next = 0; - m->top = m->dv = 0; - for (i = 0; i < NTREEBINS; ++i) - *treebin_at(m, i) = 0; - init_bins(m); -} -#endif /* PROCEED_ON_ERROR */ - -/* Allocate chunk and prepend remainder with chunk in successor base. */ -static void* prepend_alloc(mstate m, char* newbase, char* oldbase, - size_t nb) { - mchunkptr p = align_as_chunk(newbase); - mchunkptr oldfirst = align_as_chunk(oldbase); - size_t psize = (char*)oldfirst - (char*)p; - mchunkptr q = chunk_plus_offset(p, nb); - size_t qsize = psize - nb; - set_size_and_pinuse_of_inuse_chunk(m, p, nb); - - assert((char*)oldfirst > (char*)q); - assert(pinuse(oldfirst)); - assert(qsize >= MIN_CHUNK_SIZE); - - /* consolidate remainder with first chunk of old base */ - if (oldfirst == m->top) { - size_t tsize = m->topsize += qsize; - m->top = q; - q->head = tsize | PINUSE_BIT; - check_top_chunk(m, q); - } - else if (oldfirst == m->dv) { - size_t dsize = m->dvsize += qsize; - m->dv = q; - set_size_and_pinuse_of_free_chunk(q, dsize); - } - else { - if (!is_inuse(oldfirst)) { - size_t nsize = chunksize(oldfirst); - unlink_chunk(m, oldfirst, nsize); - oldfirst = chunk_plus_offset(oldfirst, nsize); - qsize += nsize; - } - set_free_with_pinuse(q, qsize, oldfirst); - insert_chunk(m, q, qsize); - check_free_chunk(m, q); - } - - check_malloced_chunk(m, chunk2mem(p), nb); - return chunk2mem(p); -} - -/* Add a segment to hold a new noncontiguous region */ -static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) { - /* Determine locations and sizes of segment, fenceposts, old top */ - char* old_top = (char*)m->top; - msegmentptr oldsp = segment_holding(m, old_top); - char* old_end = oldsp->base + oldsp->size; - size_t ssize = pad_request(sizeof(struct malloc_segment)); - char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK); - size_t offset = align_offset(chunk2mem(rawsp)); - char* asp = rawsp + offset; - char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? 
old_top : asp; - mchunkptr sp = (mchunkptr)csp; - msegmentptr ss = (msegmentptr)(chunk2mem(sp)); - mchunkptr tnext = chunk_plus_offset(sp, ssize); - mchunkptr p = tnext; - int nfences = 0; - - /* reset top to new space */ - init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE); - - /* Set up segment record */ - assert(is_aligned(ss)); - set_size_and_pinuse_of_inuse_chunk(m, sp, ssize); - *ss = m->seg; /* Push current record */ - m->seg.base = tbase; - m->seg.size = tsize; - m->seg.sflags = mmapped; - m->seg.next = ss; - - /* Insert trailing fenceposts */ - for (;;) { - mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE); - p->head = FENCEPOST_HEAD; - ++nfences; - if ((char*)(&(nextp->head)) < old_end) - p = nextp; - else - break; - } - assert(nfences >= 2); - - /* Insert the rest of old top into a bin as an ordinary free chunk */ - if (csp != old_top) { - mchunkptr q = (mchunkptr)old_top; - size_t psize = csp - old_top; - mchunkptr tn = chunk_plus_offset(q, psize); - set_free_with_pinuse(q, psize, tn); - insert_chunk(m, q, psize); - } - - check_top_chunk(m, m->top); -} - -/* -------------------------- System allocation -------------------------- */ - -/* Get memory from system using MORECORE or MMAP */ -static void* sys_alloc(mstate m, size_t nb) { - char* tbase = CMFAIL; - size_t tsize = 0; - flag_t mmap_flag = 0; - size_t asize; /* allocation size */ - - ensure_initialization(); - - /* Directly map large chunks, but only if already initialized */ - if (use_mmap(m) && nb >= mparams.mmap_threshold && m->topsize != 0) { - void* mem = mmap_alloc(m, nb); - if (mem != 0) - return mem; - } - - asize = granularity_align(nb + SYS_ALLOC_PADDING); - if (asize <= nb) - return 0; /* wraparound */ - if (m->footprint_limit != 0) { - size_t fp = m->footprint + asize; - if (fp <= m->footprint || fp > m->footprint_limit) - return 0; - } - - /* - Try getting memory in any of three ways (in most-preferred to - least-preferred order): - 1. A call to MORECORE that can normally contiguously extend memory. - (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE - or main space is mmapped or a previous contiguous call failed) - 2. A call to MMAP new space (disabled if not HAVE_MMAP). - Note that under the default settings, if MORECORE is unable to - fulfill a request, and HAVE_MMAP is true, then mmap is - used as a noncontiguous system allocator. This is a useful backup - strategy for systems with holes in address spaces -- in this case - sbrk cannot contiguously expand the heap, but mmap may be able to - find space. - 3. A call to MORECORE that cannot usually contiguously extend memory. - (disabled if not HAVE_MORECORE) - - In all cases, we need to request enough bytes from system to ensure - we can malloc nb bytes upon success, so pad with enough space for - top_foot, plus alignment-pad to make sure we don't lose bytes if - not on boundary, and round this up to a granularity unit. - */ - - if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) { - char* br = CMFAIL; - size_t ssize = asize; /* sbrk call size */ - msegmentptr ss = (m->top == 0)? 
0 : segment_holding(m, (char*)m->top); - ACQUIRE_MALLOC_GLOBAL_LOCK(); - - if (ss == 0) { /* First time through or recovery */ - char* base = (char*)CALL_MORECORE(0); - if (base != CMFAIL) { - size_t fp; - /* Adjust to end on a page boundary */ - if (!is_page_aligned(base)) - ssize += (page_align((size_t)base) - (size_t)base); - fp = m->footprint + ssize; /* recheck limits */ - if (ssize > nb && ssize < HALF_MAX_SIZE_T && - (m->footprint_limit == 0 || - (fp > m->footprint && fp <= m->footprint_limit)) && - (br = (char*)(CALL_MORECORE(ssize))) == base) { - tbase = base; - tsize = ssize; - } - } - } - else { - /* Subtract out existing available top space from MORECORE request. */ - ssize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING); - /* Use mem here only if it did continuously extend old space */ - if (ssize < HALF_MAX_SIZE_T && - (br = (char*)(CALL_MORECORE(ssize))) == ss->base+ss->size) { - tbase = br; - tsize = ssize; - } - } - - if (tbase == CMFAIL) { /* Cope with partial failure */ - if (br != CMFAIL) { /* Try to use/extend the space we did get */ - if (ssize < HALF_MAX_SIZE_T && - ssize < nb + SYS_ALLOC_PADDING) { - size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - ssize); - if (esize < HALF_MAX_SIZE_T) { - char* end = (char*)CALL_MORECORE(esize); - if (end != CMFAIL) - ssize += esize; - else { /* Can't use; try to release */ - (void) CALL_MORECORE(-ssize); - br = CMFAIL; - } - } - } - } - if (br != CMFAIL) { /* Use the space we did get */ - tbase = br; - tsize = ssize; - } - else - disable_contiguous(m); /* Don't try contiguous path in the future */ - } - - RELEASE_MALLOC_GLOBAL_LOCK(); - } - - if (HAVE_MMAP && tbase == CMFAIL) { /* Try MMAP */ - char* mp = (char*)(CALL_MMAP(asize)); - if (mp != CMFAIL) { - tbase = mp; - tsize = asize; - mmap_flag = USE_MMAP_BIT; - } - } - - if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */ - if (asize < HALF_MAX_SIZE_T) { - char* br = CMFAIL; - char* end = CMFAIL; - ACQUIRE_MALLOC_GLOBAL_LOCK(); - br = (char*)(CALL_MORECORE(asize)); - end = (char*)(CALL_MORECORE(0)); - RELEASE_MALLOC_GLOBAL_LOCK(); - if (br != CMFAIL && end != CMFAIL && br < end) { - size_t ssize = end - br; - if (ssize > nb + TOP_FOOT_SIZE) { - tbase = br; - tsize = ssize; - } - } - } - } - - if (tbase != CMFAIL) { - - if ((m->footprint += tsize) > m->max_footprint) - m->max_footprint = m->footprint; - - if (!is_initialized(m)) { /* first-time initialization */ - if (m->least_addr == 0 || tbase < m->least_addr) - m->least_addr = tbase; - m->seg.base = tbase; - m->seg.size = tsize; - m->seg.sflags = mmap_flag; - m->magic = mparams.magic; - m->release_checks = MAX_RELEASE_CHECK_RATE; - init_bins(m); -#if !ONLY_MSPACES - if (is_global(m)) - init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE); - else -#endif - { - /* Offset top by embedded malloc_state */ - mchunkptr mn = next_chunk(mem2chunk(m)); - init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE); - } - } - - else { - /* Try to merge with an existing segment */ - msegmentptr sp = &m->seg; - /* Only consider most recent segment if traversal suppressed */ - while (sp != 0 && tbase != sp->base + sp->size) - sp = (NO_SEGMENT_TRAVERSAL) ? 
0 : sp->next; - if (sp != 0 && - !is_extern_segment(sp) && - (sp->sflags & USE_MMAP_BIT) == mmap_flag && - segment_holds(sp, m->top)) { /* append */ - sp->size += tsize; - init_top(m, m->top, m->topsize + tsize); - } - else { - if (tbase < m->least_addr) - m->least_addr = tbase; - sp = &m->seg; - while (sp != 0 && sp->base != tbase + tsize) - sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next; - if (sp != 0 && - !is_extern_segment(sp) && - (sp->sflags & USE_MMAP_BIT) == mmap_flag) { - char* oldbase = sp->base; - sp->base = tbase; - sp->size += tsize; - return prepend_alloc(m, tbase, oldbase, nb); - } - else - add_segment(m, tbase, tsize, mmap_flag); - } - } - - if (nb < m->topsize) { /* Allocate from new or extended top space */ - size_t rsize = m->topsize -= nb; - mchunkptr p = m->top; - mchunkptr r = m->top = chunk_plus_offset(p, nb); - r->head = rsize | PINUSE_BIT; - set_size_and_pinuse_of_inuse_chunk(m, p, nb); - check_top_chunk(m, m->top); - check_malloced_chunk(m, chunk2mem(p), nb); - return chunk2mem(p); - } - } - - MALLOC_FAILURE_ACTION; - return 0; -} - -/* ----------------------- system deallocation -------------------------- */ - -/* Unmap and unlink any mmapped segments that don't contain used chunks */ -static size_t release_unused_segments(mstate m) { - size_t released = 0; - int nsegs = 0; - msegmentptr pred = &m->seg; - msegmentptr sp = pred->next; - while (sp != 0) { - char* base = sp->base; - size_t size = sp->size; - msegmentptr next = sp->next; - ++nsegs; - if (is_mmapped_segment(sp) && !is_extern_segment(sp)) { - mchunkptr p = align_as_chunk(base); - size_t psize = chunksize(p); - /* Can unmap if first chunk holds entire segment and not pinned */ - if (!is_inuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) { - tchunkptr tp = (tchunkptr)p; - assert(segment_holds(sp, (char*)sp)); - if (p == m->dv) { - m->dv = 0; - m->dvsize = 0; - } - else { - unlink_large_chunk(m, tp); - } - if (CALL_MUNMAP(base, size) == 0) { - released += size; - m->footprint -= size; - /* unlink obsoleted record */ - sp = pred; - sp->next = next; - } - else { /* back out if cannot unmap */ - insert_large_chunk(m, tp, psize); - } - } - } - if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */ - break; - pred = sp; - sp = next; - } - /* Reset check counter */ - m->release_checks = (((size_t) nsegs > (size_t) MAX_RELEASE_CHECK_RATE)? 
- (size_t) nsegs : (size_t) MAX_RELEASE_CHECK_RATE); - return released; -} - -static int sys_trim(mstate m, size_t pad) { - size_t released = 0; - ensure_initialization(); - if (pad < MAX_REQUEST && is_initialized(m)) { - pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */ - - if (m->topsize > pad) { - /* Shrink top space in granularity-size units, keeping at least one */ - size_t unit = mparams.granularity; - size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit - - SIZE_T_ONE) * unit; - msegmentptr sp = segment_holding(m, (char*)m->top); - - if (!is_extern_segment(sp)) { - if (is_mmapped_segment(sp)) { - if (HAVE_MMAP && - sp->size >= extra && - !has_segment_link(m, sp)) { /* can't shrink if pinned */ - size_t newsize = sp->size - extra; - (void)newsize; /* placate people compiling -Wunused-variable */ - /* Prefer mremap, fall back to munmap */ - if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) || - (CALL_MUNMAP(sp->base + newsize, extra) == 0)) { - released = extra; - } - } - } - else if (HAVE_MORECORE) { - if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */ - extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit; - ACQUIRE_MALLOC_GLOBAL_LOCK(); - { - /* Make sure end of memory is where we last set it. */ - char* old_br = (char*)(CALL_MORECORE(0)); - if (old_br == sp->base + sp->size) { - char* rel_br = (char*)(CALL_MORECORE(-extra)); - char* new_br = (char*)(CALL_MORECORE(0)); - if (rel_br != CMFAIL && new_br < old_br) - released = old_br - new_br; - } - } - RELEASE_MALLOC_GLOBAL_LOCK(); - } - } - - if (released != 0) { - sp->size -= released; - m->footprint -= released; - init_top(m, m->top, m->topsize - released); - check_top_chunk(m, m->top); - } - } - - /* Unmap any unused mmapped segments */ - if (HAVE_MMAP) - released += release_unused_segments(m); - - /* On failure, disable autotrim to avoid repeated failed future calls */ - if (released == 0 && m->topsize > m->trim_check) - m->trim_check = MAX_SIZE_T; - } - - return (released != 0)? 1 : 0; -} - -/* Consolidate and bin a chunk. Differs from exported versions - of free mainly in that the chunk need not be marked as inuse. 
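To make the sys_trim arithmetic above concrete, here is a worked example with illustrative numbers only; a 64 KiB granularity is common but configuration-dependent:

#include <stddef.h>
#include <stdio.h>

int main(void) {
  size_t unit = 65536;     /* hypothetical mparams.granularity */
  size_t topsize = 300000; /* hypothetical current top size */
  size_t pad = 4096;       /* pad, here assumed to include the overhead */

  if (topsize > pad) {
    /* round the excess up to whole units, then keep one unit resident */
    size_t extra = ((topsize - pad + (unit - 1)) / unit - 1) * unit;
    printf("release %zu, top keeps %zu\n", extra, topsize - extra);
    /* prints: release 262144, top keeps 37856 */
  }
  return 0;
}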
-*/ -static void dispose_chunk(mstate m, mchunkptr p, size_t psize) { - mchunkptr next = chunk_plus_offset(p, psize); - if (!pinuse(p)) { - mchunkptr prev; - size_t prevsize = p->prev_foot; - if (is_mmapped(p)) { - psize += prevsize + MMAP_FOOT_PAD; - if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) - m->footprint -= psize; - return; - } - prev = chunk_minus_offset(p, prevsize); - psize += prevsize; - p = prev; - if (RTCHECK(ok_address(m, prev))) { /* consolidate backward */ - if (p != m->dv) { - unlink_chunk(m, p, prevsize); - } - else if ((next->head & INUSE_BITS) == INUSE_BITS) { - m->dvsize = psize; - set_free_with_pinuse(p, psize, next); - return; - } - } - else { - CORRUPTION_ERROR_ACTION(m); - return; - } - } - if (RTCHECK(ok_address(m, next))) { - if (!cinuse(next)) { /* consolidate forward */ - if (next == m->top) { - size_t tsize = m->topsize += psize; - m->top = p; - p->head = tsize | PINUSE_BIT; - if (p == m->dv) { - m->dv = 0; - m->dvsize = 0; - } - return; - } - else if (next == m->dv) { - size_t dsize = m->dvsize += psize; - m->dv = p; - set_size_and_pinuse_of_free_chunk(p, dsize); - return; - } - else { - size_t nsize = chunksize(next); - psize += nsize; - unlink_chunk(m, next, nsize); - set_size_and_pinuse_of_free_chunk(p, psize); - if (p == m->dv) { - m->dvsize = psize; - return; - } - } - } - else { - set_free_with_pinuse(p, psize, next); - } - insert_chunk(m, p, psize); - } - else { - CORRUPTION_ERROR_ACTION(m); - } -} - -/* ---------------------------- malloc --------------------------- */ - -/* allocate a large request from the best fitting chunk in a treebin */ -static void* tmalloc_large(mstate m, size_t nb) { - tchunkptr v = 0; - size_t rsize = -nb; /* Unsigned negation */ - tchunkptr t; - bindex_t idx; - compute_tree_index(nb, idx); - if ((t = *treebin_at(m, idx)) != 0) { - /* Traverse tree for this bin looking for node with size == nb */ - size_t sizebits = nb << leftshift_for_tree_index(idx); - tchunkptr rst = 0; /* The deepest untaken right subtree */ - for (;;) { - tchunkptr rt; - size_t trem = chunksize(t) - nb; - if (trem < rsize) { - v = t; - if ((rsize = trem) == 0) - break; - } - rt = t->child[1]; - t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]; - if (rt != 0 && rt != t) - rst = rt; - if (t == 0) { - t = rst; /* set t to least subtree holding sizes > nb */ - break; - } - sizebits <<= 1; - } - } - if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */ - binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap; - if (leftbits != 0) { - bindex_t i; - binmap_t leastbit = least_bit(leftbits); - compute_bit2idx(leastbit, i); - t = *treebin_at(m, i); - } - } - - while (t != 0) { /* find smallest of tree or subtree */ - size_t trem = chunksize(t) - nb; - if (trem < rsize) { - rsize = trem; - v = t; - } - t = leftmost_child(t); - } - - /* If dv is a better fit, return 0 so malloc will use it */ - if (v != 0 && rsize < (size_t)(m->dvsize - nb)) { - if (RTCHECK(ok_address(m, v))) { /* split */ - mchunkptr r = chunk_plus_offset(v, nb); - assert(chunksize(v) == rsize + nb); - if (RTCHECK(ok_next(v, r))) { - unlink_large_chunk(m, v); - if (rsize < MIN_CHUNK_SIZE) - set_inuse_and_pinuse(m, v, (rsize + nb)); - else { - set_size_and_pinuse_of_inuse_chunk(m, v, nb); - set_size_and_pinuse_of_free_chunk(r, rsize); - insert_chunk(m, r, rsize); - } - return chunk2mem(v); - } - } - CORRUPTION_ERROR_ACTION(m); - } - return 0; -} - -/* allocate a small request from the best fitting chunk in a treebin */ -static void* tmalloc_small(mstate m, size_t 
nb) { - tchunkptr t, v; - size_t rsize; - bindex_t i; - binmap_t leastbit = least_bit(m->treemap); - compute_bit2idx(leastbit, i); - v = t = *treebin_at(m, i); - rsize = chunksize(t) - nb; - - while ((t = leftmost_child(t)) != 0) { - size_t trem = chunksize(t) - nb; - if (trem < rsize) { - rsize = trem; - v = t; - } - } - - if (RTCHECK(ok_address(m, v))) { - mchunkptr r = chunk_plus_offset(v, nb); - assert(chunksize(v) == rsize + nb); - if (RTCHECK(ok_next(v, r))) { - unlink_large_chunk(m, v); - if (rsize < MIN_CHUNK_SIZE) - set_inuse_and_pinuse(m, v, (rsize + nb)); - else { - set_size_and_pinuse_of_inuse_chunk(m, v, nb); - set_size_and_pinuse_of_free_chunk(r, rsize); - replace_dv(m, r, rsize); - } - return chunk2mem(v); - } - } - - CORRUPTION_ERROR_ACTION(m); - return 0; -} - -#if !ONLY_MSPACES - -void* dlmalloc(size_t bytes) { - /* - Basic algorithm: - If a small request (< 256 bytes minus per-chunk overhead): - 1. If one exists, use a remainderless chunk in associated smallbin. - (Remainderless means that there are too few excess bytes to - represent as a chunk.) - 2. If it is big enough, use the dv chunk, which is normally the - chunk adjacent to the one used for the most recent small request. - 3. If one exists, split the smallest available chunk in a bin, - saving remainder in dv. - 4. If it is big enough, use the top chunk. - 5. If available, get memory from system and use it - Otherwise, for a large request: - 1. Find the smallest available binned chunk that fits, and use it - if it is better fitting than dv chunk, splitting if necessary. - 2. If better fitting than any binned chunk, use the dv chunk. - 3. If it is big enough, use the top chunk. - 4. If request size >= mmap threshold, try to directly mmap this chunk. - 5. If available, get memory from system and use it - - The ugly goto's here ensure that postaction occurs along all paths. - */ - -#if USE_LOCKS - ensure_initialization(); /* initialize in sys_alloc if not using locks */ -#endif - - if (!PREACTION(gm)) { - void* mem; - size_t nb; - if (bytes <= MAX_SMALL_REQUEST) { - bindex_t idx; - binmap_t smallbits; - nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes); - idx = small_index(nb); - smallbits = gm->smallmap >> idx; - - if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. 
*/ - mchunkptr b, p; - idx += ~smallbits & 1; /* Uses next bin if idx empty */ - b = smallbin_at(gm, idx); - p = b->fd; - assert(chunksize(p) == small_index2size(idx)); - unlink_first_small_chunk(gm, b, p, idx); - set_inuse_and_pinuse(gm, p, small_index2size(idx)); - mem = chunk2mem(p); - check_malloced_chunk(gm, mem, nb); - goto postaction; - } - - else if (nb > gm->dvsize) { - if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ - mchunkptr b, p, r; - size_t rsize; - bindex_t i; - binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); - binmap_t leastbit = least_bit(leftbits); - compute_bit2idx(leastbit, i); - b = smallbin_at(gm, i); - p = b->fd; - assert(chunksize(p) == small_index2size(i)); - unlink_first_small_chunk(gm, b, p, i); - rsize = small_index2size(i) - nb; - /* Fit here cannot be remainderless if 4byte sizes */ - if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) - set_inuse_and_pinuse(gm, p, small_index2size(i)); - else { - set_size_and_pinuse_of_inuse_chunk(gm, p, nb); - r = chunk_plus_offset(p, nb); - set_size_and_pinuse_of_free_chunk(r, rsize); - replace_dv(gm, r, rsize); - } - mem = chunk2mem(p); - check_malloced_chunk(gm, mem, nb); - goto postaction; - } - - else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) { - check_malloced_chunk(gm, mem, nb); - goto postaction; - } - } - } - else if (bytes >= MAX_REQUEST) - nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */ - else { - nb = pad_request(bytes); - if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) { - check_malloced_chunk(gm, mem, nb); - goto postaction; - } - } - - if (nb <= gm->dvsize) { - size_t rsize = gm->dvsize - nb; - mchunkptr p = gm->dv; - if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ - mchunkptr r = gm->dv = chunk_plus_offset(p, nb); - gm->dvsize = rsize; - set_size_and_pinuse_of_free_chunk(r, rsize); - set_size_and_pinuse_of_inuse_chunk(gm, p, nb); - } - else { /* exhaust dv */ - size_t dvs = gm->dvsize; - gm->dvsize = 0; - gm->dv = 0; - set_inuse_and_pinuse(gm, p, dvs); - } - mem = chunk2mem(p); - check_malloced_chunk(gm, mem, nb); - goto postaction; - } - - else if (nb < gm->topsize) { /* Split top */ - size_t rsize = gm->topsize -= nb; - mchunkptr p = gm->top; - mchunkptr r = gm->top = chunk_plus_offset(p, nb); - r->head = rsize | PINUSE_BIT; - set_size_and_pinuse_of_inuse_chunk(gm, p, nb); - mem = chunk2mem(p); - check_top_chunk(gm, gm->top); - check_malloced_chunk(gm, mem, nb); - goto postaction; - } - - mem = sys_alloc(gm, nb); - - postaction: - POSTACTION(gm); - return mem; - } - - return 0; -} - -/* ---------------------------- free --------------------------- */ - -void dlfree(void* mem) { - /* - Consolidate freed chunks with preceding or succeeding bordering - free chunks, if they exist, and then place in a bin. Intermixed - with special cases for top, dv, mmapped chunks, and usage errors. 
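The smallmap shortcut at the top of dlmalloc rewards a closer look. After shifting the bin bitmap down by the request's bin index, bit 0 of smallbits says the exact bin is non-empty and bit 1 says the next-larger bin is, so one mask tests both; the increment then steps to the neighbor only when the exact bin is empty. A standalone sketch with made-up map contents:

#include <stdio.h>

int main(void) {
  unsigned smallmap = 0x28u; /* bins 3 and 5 non-empty (hypothetical) */
  unsigned idx = 2;          /* the request maps to bin 2 */
  unsigned smallbits = smallmap >> idx;
  if ((smallbits & 0x3u) != 0) {  /* bin 2 or bin 3 can serve it */
    idx += ~smallbits & 1;        /* bin 2 empty, so step to bin 3 */
    printf("serve from bin %u\n", idx);  /* prints: serve from bin 3 */
  }
  return 0;
}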
- */ - - if (mem != 0) { - mchunkptr p = mem2chunk(mem); -#if FOOTERS - mstate fm = get_mstate_for(p); - if (!ok_magic(fm)) { - USAGE_ERROR_ACTION(fm, p); - return; - } -#else /* FOOTERS */ -#define fm gm -#endif /* FOOTERS */ - if (!PREACTION(fm)) { - check_inuse_chunk(fm, p); - if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) { - size_t psize = chunksize(p); - mchunkptr next = chunk_plus_offset(p, psize); - if (!pinuse(p)) { - size_t prevsize = p->prev_foot; - if (is_mmapped(p)) { - psize += prevsize + MMAP_FOOT_PAD; - if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) - fm->footprint -= psize; - goto postaction; - } - else { - mchunkptr prev = chunk_minus_offset(p, prevsize); - psize += prevsize; - p = prev; - if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */ - if (p != fm->dv) { - unlink_chunk(fm, p, prevsize); - } - else if ((next->head & INUSE_BITS) == INUSE_BITS) { - fm->dvsize = psize; - set_free_with_pinuse(p, psize, next); - goto postaction; - } - } - else - goto erroraction; - } - } - - if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) { - if (!cinuse(next)) { /* consolidate forward */ - if (next == fm->top) { - size_t tsize = fm->topsize += psize; - fm->top = p; - p->head = tsize | PINUSE_BIT; - if (p == fm->dv) { - fm->dv = 0; - fm->dvsize = 0; - } - if (should_trim(fm, tsize)) - sys_trim(fm, 0); - goto postaction; - } - else if (next == fm->dv) { - size_t dsize = fm->dvsize += psize; - fm->dv = p; - set_size_and_pinuse_of_free_chunk(p, dsize); - goto postaction; - } - else { - size_t nsize = chunksize(next); - psize += nsize; - unlink_chunk(fm, next, nsize); - set_size_and_pinuse_of_free_chunk(p, psize); - if (p == fm->dv) { - fm->dvsize = psize; - goto postaction; - } - } - } - else - set_free_with_pinuse(p, psize, next); - - if (is_small(psize)) { - insert_small_chunk(fm, p, psize); - check_free_chunk(fm, p); - } - else { - tchunkptr tp = (tchunkptr)p; - insert_large_chunk(fm, tp, psize); - check_free_chunk(fm, p); - if (--fm->release_checks == 0) - release_unused_segments(fm); - } - goto postaction; - } - } - erroraction: - USAGE_ERROR_ACTION(fm, p); - postaction: - POSTACTION(fm); - } - } -#if !FOOTERS -#undef fm -#endif /* FOOTERS */ -} - -void* dlcalloc(size_t n_elements, size_t elem_size) { - void* mem; - size_t req = 0; - if (n_elements != 0) { - req = n_elements * elem_size; - if (((n_elements | elem_size) & ~(size_t)0xffff) && - (req / n_elements != elem_size)) - req = MAX_SIZE_T; /* force downstream failure on overflow */ - } - mem = dlmalloc(req); - if (mem != 0 && calloc_must_clear(mem2chunk(mem))) - memset(mem, 0, req); - return mem; -} - -#endif /* !ONLY_MSPACES */ - -/* ------------ Internal support for realloc, memalign, etc -------------- */ - -/* Try to realloc; only in-place unless can_move true */ -static mchunkptr try_realloc_chunk(mstate m, mchunkptr p, size_t nb, - int can_move) { - mchunkptr newp = 0; - size_t oldsize = chunksize(p); - mchunkptr next = chunk_plus_offset(p, oldsize); - if (RTCHECK(ok_address(m, p) && ok_inuse(p) && - ok_next(p, next) && ok_pinuse(next))) { - if (is_mmapped(p)) { - newp = mmap_resize(m, p, nb, can_move); - } - else if (oldsize >= nb) { /* already big enough */ - size_t rsize = oldsize - nb; - if (rsize >= MIN_CHUNK_SIZE) { /* split off remainder */ - mchunkptr r = chunk_plus_offset(p, nb); - set_inuse(m, p, nb); - set_inuse(m, r, rsize); - dispose_chunk(m, r, rsize); - } - newp = p; - } - else if (next == m->top) { /* extend into top */ - if (oldsize + m->topsize > nb) { - size_t newsize = oldsize + 
m->topsize; - size_t newtopsize = newsize - nb; - mchunkptr newtop = chunk_plus_offset(p, nb); - set_inuse(m, p, nb); - newtop->head = newtopsize |PINUSE_BIT; - m->top = newtop; - m->topsize = newtopsize; - newp = p; - } - } - else if (next == m->dv) { /* extend into dv */ - size_t dvs = m->dvsize; - if (oldsize + dvs >= nb) { - size_t dsize = oldsize + dvs - nb; - if (dsize >= MIN_CHUNK_SIZE) { - mchunkptr r = chunk_plus_offset(p, nb); - mchunkptr n = chunk_plus_offset(r, dsize); - set_inuse(m, p, nb); - set_size_and_pinuse_of_free_chunk(r, dsize); - clear_pinuse(n); - m->dvsize = dsize; - m->dv = r; - } - else { /* exhaust dv */ - size_t newsize = oldsize + dvs; - set_inuse(m, p, newsize); - m->dvsize = 0; - m->dv = 0; - } - newp = p; - } - } - else if (!cinuse(next)) { /* extend into next free chunk */ - size_t nextsize = chunksize(next); - if (oldsize + nextsize >= nb) { - size_t rsize = oldsize + nextsize - nb; - unlink_chunk(m, next, nextsize); - if (rsize < MIN_CHUNK_SIZE) { - size_t newsize = oldsize + nextsize; - set_inuse(m, p, newsize); - } - else { - mchunkptr r = chunk_plus_offset(p, nb); - set_inuse(m, p, nb); - set_inuse(m, r, rsize); - dispose_chunk(m, r, rsize); - } - newp = p; - } - } - } - else { - USAGE_ERROR_ACTION(m, chunk2mem(p)); - } - return newp; -} - -static void* internal_memalign(mstate m, size_t alignment, size_t bytes) { - void* mem = 0; - if (alignment < MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */ - alignment = MIN_CHUNK_SIZE; - if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */ - size_t a = MALLOC_ALIGNMENT << 1; - while (a < alignment) a <<= 1; - alignment = a; - } - if (bytes >= MAX_REQUEST - alignment) { - if (m != 0) { /* Test isn't needed but avoids compiler warning */ - MALLOC_FAILURE_ACTION; - } - } - else { - size_t nb = request2size(bytes); - size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD; - mem = internal_malloc(m, req); - if (mem != 0) { - mchunkptr p = mem2chunk(mem); - if (PREACTION(m)) - return 0; - if ((((size_t)(mem)) & (alignment - 1)) != 0) { /* misaligned */ - /* - Find an aligned spot inside chunk. Since we need to give - back leading space in a chunk of at least MIN_CHUNK_SIZE, if - the first calculation places us at a spot with less than - MIN_CHUNK_SIZE leader, we can move to the next aligned spot. - We've allocated enough total room so that this is always - possible. - */ - char* br = (char*)mem2chunk((size_t)(((size_t)((char*)mem + alignment - - SIZE_T_ONE)) & - -alignment)); - char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)? 
- br : br+alignment; - mchunkptr newp = (mchunkptr)pos; - size_t leadsize = pos - (char*)(p); - size_t newsize = chunksize(p) - leadsize; - - if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */ - newp->prev_foot = p->prev_foot + leadsize; - newp->head = newsize; - } - else { /* Otherwise, give back leader, use the rest */ - set_inuse(m, newp, newsize); - set_inuse(m, p, leadsize); - dispose_chunk(m, p, leadsize); - } - p = newp; - } - - /* Give back spare room at the end */ - if (!is_mmapped(p)) { - size_t size = chunksize(p); - if (size > nb + MIN_CHUNK_SIZE) { - size_t remainder_size = size - nb; - mchunkptr remainder = chunk_plus_offset(p, nb); - set_inuse(m, p, nb); - set_inuse(m, remainder, remainder_size); - dispose_chunk(m, remainder, remainder_size); - } - } - - mem = chunk2mem(p); - assert (chunksize(p) >= nb); - assert(((size_t)mem & (alignment - 1)) == 0); - check_inuse_chunk(m, p); - POSTACTION(m); - } - } - return mem; -} - -/* - Common support for independent_X routines, handling - all of the combinations that can result. - The opts arg has: - bit 0 set if all elements are same size (using sizes[0]) - bit 1 set if elements should be zeroed -*/ -static void** ialloc(mstate m, - size_t n_elements, - size_t* sizes, - int opts, - void* chunks[]) { - - size_t element_size; /* chunksize of each element, if all same */ - size_t contents_size; /* total size of elements */ - size_t array_size; /* request size of pointer array */ - void* mem; /* malloced aggregate space */ - mchunkptr p; /* corresponding chunk */ - size_t remainder_size; /* remaining bytes while splitting */ - void** marray; /* either "chunks" or malloced ptr array */ - mchunkptr array_chunk; /* chunk for malloced ptr array */ - flag_t was_enabled; /* to disable mmap */ - size_t size; - size_t i; - - ensure_initialization(); - /* compute array length, if needed */ - if (chunks != 0) { - if (n_elements == 0) - return chunks; /* nothing to do */ - marray = chunks; - array_size = 0; - } - else { - /* if empty req, must still return chunk representing empty array */ - if (n_elements == 0) - return (void**)internal_malloc(m, 0); - marray = 0; - array_size = request2size(n_elements * (sizeof(void*))); - } - - /* compute total element size */ - if (opts & 0x1) { /* all-same-size */ - element_size = request2size(*sizes); - contents_size = n_elements * element_size; - } - else { /* add up all the sizes */ - element_size = 0; - contents_size = 0; - for (i = 0; i != n_elements; ++i) - contents_size += request2size(sizes[i]); - } - - size = contents_size + array_size; - - /* - Allocate the aggregate chunk. First disable direct-mmapping so - malloc won't use it, since we would not be able to later - free/realloc space internal to a segregated mmap region. 
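The positioning step in internal_memalign above reduces to one rounding expression: overallocate by alignment - 1, then round the payload address up to the next boundary. A minimal sketch under simplified assumptions (power-of-two alignment; the skipped leader is handed back to the caller instead of being re-binned as dlmalloc does):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *toy_memalign(size_t alignment, size_t bytes, void **raw_out) {
  void *raw = malloc(bytes + alignment - 1);
  if (raw == 0)
    return 0;
  uintptr_t mem = ((uintptr_t)raw + (alignment - 1)) &
                  ~(uintptr_t)(alignment - 1);
  *raw_out = raw;            /* caller frees the raw pointer */
  return (void *)mem;
}

int main(void) {
  void *raw;
  void *p = toy_memalign(64, 100, &raw);
  if (p != 0) {
    printf("aligned: %d\n", (uintptr_t)p % 64 == 0);  /* prints: aligned: 1 */
    free(raw);
  }
  return 0;
}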
- */ - was_enabled = use_mmap(m); - disable_mmap(m); - mem = internal_malloc(m, size - CHUNK_OVERHEAD); - if (was_enabled) - enable_mmap(m); - if (mem == 0) - return 0; - - if (PREACTION(m)) return 0; - p = mem2chunk(mem); - remainder_size = chunksize(p); - - assert(!is_mmapped(p)); - - if (opts & 0x2) { /* optionally clear the elements */ - memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size); - } - - /* If not provided, allocate the pointer array as final part of chunk */ - if (marray == 0) { - size_t array_chunk_size; - array_chunk = chunk_plus_offset(p, contents_size); - array_chunk_size = remainder_size - contents_size; - marray = (void**) (chunk2mem(array_chunk)); - set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size); - remainder_size = contents_size; - } - - /* split out elements */ - for (i = 0; ; ++i) { - marray[i] = chunk2mem(p); - if (i != n_elements-1) { - if (element_size != 0) - size = element_size; - else - size = request2size(sizes[i]); - remainder_size -= size; - set_size_and_pinuse_of_inuse_chunk(m, p, size); - p = chunk_plus_offset(p, size); - } - else { /* the final element absorbs any overallocation slop */ - set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size); - break; - } - } - -#if DEBUG - if (marray != chunks) { - /* final element must have exactly exhausted chunk */ - if (element_size != 0) { - assert(remainder_size == element_size); - } - else { - assert(remainder_size == request2size(sizes[i])); - } - check_inuse_chunk(m, mem2chunk(marray)); - } - for (i = 0; i != n_elements; ++i) - check_inuse_chunk(m, mem2chunk(marray[i])); - -#endif /* DEBUG */ - - POSTACTION(m); - return marray; -} - -/* Try to free all pointers in the given array. - Note: this could be made faster by delaying consolidation, - at the price of disabling some user integrity checks. We - still optimize some consolidations by combining adjacent - chunks before freeing, which will occur often if allocated - with ialloc or the array is sorted. 
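The adjacent-chunk optimization this comment describes can be shown with a toy layout; the flat buffer and fixed 16-byte block size are hypothetical stand-ins for real chunks and chunksize:

#include <stddef.h>
#include <stdio.h>

int main(void) {
  char heap[64];
  size_t size = 16;  /* uniform toy block size */
  void *array[3] = { heap, heap + 16, heap + 48 };

  for (size_t i = 0; i + 1 < 3; ++i) {
    /* next array slot is the physically adjacent block: combine the two
       and dispose of them once, as internal_bulk_free does below */
    if ((char *)array[i] + size == (char *)array[i + 1])
      printf("coalesce slots %zu and %zu\n", i, i + 1);
  }
  return 0;  /* prints: coalesce slots 0 and 1 */
}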
-*/ -static size_t internal_bulk_free(mstate m, void* array[], size_t nelem) { - size_t unfreed = 0; - if (!PREACTION(m)) { - void** a; - void** fence = &(array[nelem]); - for (a = array; a != fence; ++a) { - void* mem = *a; - if (mem != 0) { - mchunkptr p = mem2chunk(mem); - size_t psize = chunksize(p); -#if FOOTERS - if (get_mstate_for(p) != m) { - ++unfreed; - continue; - } -#endif - check_inuse_chunk(m, p); - *a = 0; - if (RTCHECK(ok_address(m, p) && ok_inuse(p))) { - void ** b = a + 1; /* try to merge with next chunk */ - mchunkptr next = next_chunk(p); - if (b != fence && *b == chunk2mem(next)) { - size_t newsize = chunksize(next) + psize; - set_inuse(m, p, newsize); - *b = chunk2mem(p); - } - else - dispose_chunk(m, p, psize); - } - else { - CORRUPTION_ERROR_ACTION(m); - break; - } - } - } - if (should_trim(m, m->topsize)) - sys_trim(m, 0); - POSTACTION(m); - } - return unfreed; -} - -/* Traversal */ -#if MALLOC_INSPECT_ALL -static void internal_inspect_all(mstate m, - void(*handler)(void *start, - void *end, - size_t used_bytes, - void* callback_arg), - void* arg) { - if (is_initialized(m)) { - mchunkptr top = m->top; - msegmentptr s; - for (s = &m->seg; s != 0; s = s->next) { - mchunkptr q = align_as_chunk(s->base); - while (segment_holds(s, q) && q->head != FENCEPOST_HEAD) { - mchunkptr next = next_chunk(q); - size_t sz = chunksize(q); - size_t used; - void* start; - if (is_inuse(q)) { - used = sz - CHUNK_OVERHEAD; /* must not be mmapped */ - start = chunk2mem(q); - } - else { - used = 0; - if (is_small(sz)) { /* offset by possible bookkeeping */ - start = (void*)((char*)q + sizeof(struct malloc_chunk)); - } - else { - start = (void*)((char*)q + sizeof(struct malloc_tree_chunk)); - } - } - if (start < (void*)next) /* skip if all space is bookkeeping */ - handler(start, next, used, arg); - if (q == top) - break; - q = next; - } - } - } -} -#endif /* MALLOC_INSPECT_ALL */ - -/* ------------------ Exported realloc, memalign, etc -------------------- */ - -#if !ONLY_MSPACES - -void* dlrealloc(void* oldmem, size_t bytes) { - void* mem = 0; - if (oldmem == 0) { - mem = dlmalloc(bytes); - } - else if (bytes >= MAX_REQUEST) { - MALLOC_FAILURE_ACTION; - } -#ifdef REALLOC_ZERO_BYTES_FREES - else if (bytes == 0) { - dlfree(oldmem); - } -#endif /* REALLOC_ZERO_BYTES_FREES */ - else { - size_t nb = request2size(bytes); - mchunkptr oldp = mem2chunk(oldmem); -#if ! FOOTERS - mstate m = gm; -#else /* FOOTERS */ - mstate m = get_mstate_for(oldp); - if (!ok_magic(m)) { - USAGE_ERROR_ACTION(m, oldmem); - return 0; - } -#endif /* FOOTERS */ - if (!PREACTION(m)) { - mchunkptr newp = try_realloc_chunk(m, oldp, nb, 1); - POSTACTION(m); - if (newp != 0) { - check_inuse_chunk(m, newp); - mem = chunk2mem(newp); - } - else { - mem = internal_malloc(m, bytes); - if (mem != 0) { - size_t oc = chunksize(oldp) - overhead_for(oldp); - memcpy(mem, oldmem, (oc < bytes)? oc : bytes); - internal_free(m, oldmem); - } - } - } - } - return mem; -} - -void* dlrealloc_in_place(void* oldmem, size_t bytes) { - void* mem = 0; - if (oldmem != 0) { - if (bytes >= MAX_REQUEST) { - MALLOC_FAILURE_ACTION; - } - else { - size_t nb = request2size(bytes); - mchunkptr oldp = mem2chunk(oldmem); -#if ! 
FOOTERS - mstate m = gm; -#else /* FOOTERS */ - mstate m = get_mstate_for(oldp); - if (!ok_magic(m)) { - USAGE_ERROR_ACTION(m, oldmem); - return 0; - } -#endif /* FOOTERS */ - if (!PREACTION(m)) { - mchunkptr newp = try_realloc_chunk(m, oldp, nb, 0); - POSTACTION(m); - if (newp == oldp) { - check_inuse_chunk(m, newp); - mem = oldmem; - } - } - } - } - return mem; -} - -void* dlmemalign(size_t alignment, size_t bytes) { - if (alignment <= MALLOC_ALIGNMENT) { - return dlmalloc(bytes); - } - return internal_memalign(gm, alignment, bytes); -} - -int dlposix_memalign(void** pp, size_t alignment, size_t bytes) { - void* mem = 0; - if (alignment == MALLOC_ALIGNMENT) - mem = dlmalloc(bytes); - else { - size_t d = alignment / sizeof(void*); - size_t r = alignment % sizeof(void*); - if (r != 0 || d == 0 || (d & (d-SIZE_T_ONE)) != 0) - return EINVAL; - else if (bytes <= MAX_REQUEST - alignment) { - if (alignment < MIN_CHUNK_SIZE) - alignment = MIN_CHUNK_SIZE; - mem = internal_memalign(gm, alignment, bytes); - } - } - if (mem == 0) - return ENOMEM; - else { - *pp = mem; - return 0; - } -} - -void* dlvalloc(size_t bytes) { - size_t pagesz; - ensure_initialization(); - pagesz = mparams.page_size; - return dlmemalign(pagesz, bytes); -} - -void* dlpvalloc(size_t bytes) { - size_t pagesz; - ensure_initialization(); - pagesz = mparams.page_size; - return dlmemalign(pagesz, (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE)); -} - -void** dlindependent_calloc(size_t n_elements, size_t elem_size, - void* chunks[]) { - size_t sz = elem_size; /* serves as 1-element array */ - return ialloc(gm, n_elements, &sz, 3, chunks); -} - -void** dlindependent_comalloc(size_t n_elements, size_t sizes[], - void* chunks[]) { - return ialloc(gm, n_elements, sizes, 0, chunks); -} - -size_t dlbulk_free(void* array[], size_t nelem) { - return internal_bulk_free(gm, array, nelem); -} - -#if MALLOC_INSPECT_ALL -void dlmalloc_inspect_all(void(*handler)(void *start, - void *end, - size_t used_bytes, - void* callback_arg), - void* arg) { - ensure_initialization(); - if (!PREACTION(gm)) { - internal_inspect_all(gm, handler, arg); - POSTACTION(gm); - } -} -#endif /* MALLOC_INSPECT_ALL */ - -int dlmalloc_trim(size_t pad) { - int result = 0; - ensure_initialization(); - if (!PREACTION(gm)) { - result = sys_trim(gm, pad); - POSTACTION(gm); - } - return result; -} - -size_t dlmalloc_footprint(void) { - return gm->footprint; -} - -size_t dlmalloc_max_footprint(void) { - return gm->max_footprint; -} - -size_t dlmalloc_footprint_limit(void) { - size_t maf = gm->footprint_limit; - return maf == 0 ? 
MAX_SIZE_T : maf; -} - -size_t dlmalloc_set_footprint_limit(size_t bytes) { - size_t result; /* invert sense of 0 */ - if (bytes == 0) - result = granularity_align(1); /* Use minimal size */ - else if (bytes == MAX_SIZE_T) - result = 0; /* disable */ - else - result = granularity_align(bytes); - return gm->footprint_limit = result; -} - -#if !NO_MALLINFO -struct mallinfo dlmallinfo(void) { - return internal_mallinfo(gm); -} -#endif /* NO_MALLINFO */ - -#if !NO_MALLOC_STATS -void dlmalloc_stats() { - internal_malloc_stats(gm); -} -#endif /* NO_MALLOC_STATS */ - -int dlmallopt(int param_number, int value) { - return change_mparam(param_number, value); -} - -size_t dlmalloc_usable_size(void* mem) { - if (mem != 0) { - mchunkptr p = mem2chunk(mem); - if (is_inuse(p)) - return chunksize(p) - overhead_for(p); - } - return 0; -} - -#endif /* !ONLY_MSPACES */ - -/* ----------------------------- user mspaces ---------------------------- */ - -#if MSPACES - -static mstate init_user_mstate(char* tbase, size_t tsize) { - size_t msize = pad_request(sizeof(struct malloc_state)); - mchunkptr mn; - mchunkptr msp = align_as_chunk(tbase); - mstate m = (mstate)(chunk2mem(msp)); - memset(m, 0, msize); - (void)INITIAL_LOCK(&m->mutex); - msp->head = (msize|INUSE_BITS); - m->seg.base = m->least_addr = tbase; - m->seg.size = m->footprint = m->max_footprint = tsize; - m->magic = mparams.magic; - m->release_checks = MAX_RELEASE_CHECK_RATE; - m->mflags = mparams.default_mflags; - m->extp = 0; - m->exts = 0; - disable_contiguous(m); - init_bins(m); - mn = next_chunk(mem2chunk(m)); - init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE); - check_top_chunk(m, m->top); - return m; -} - -static mstate init_device_mstate(char* tbase, char* dbase, size_t tsize) { - size_t msize = pad_request(sizeof(struct malloc_state)); - mchunkptr mn; - mchunkptr msp = align_as_chunk(tbase); - mstate m = (mstate)(chunk2mem(msp)); - //memset(m, 0, msize); - (void)INITIAL_LOCK(&m->mutex); - msp->head = (msize|INUSE_BITS); - m->seg.base = m->least_addr = dbase; - m->seg.size = m->footprint = m->max_footprint = tsize; - m->magic = mparams.magic; - m->release_checks = MAX_RELEASE_CHECK_RATE; - m->mflags = mparams.default_mflags; - m->extp = 0; - m->exts = 0; - disable_contiguous(m); - init_bins(m); - mn = next_chunk(mem2chunk(m)); - init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE); - //check_top_chunk(m, m->top); - return m; -} - - -mspace create_mspace(size_t capacity, int locked) { - mstate m = 0; - size_t msize; - ensure_initialization(); - msize = pad_request(sizeof(struct malloc_state)); - if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { - size_t rs = ((capacity == 0)? 
mparams.granularity : - (capacity + TOP_FOOT_SIZE + msize)); - size_t tsize = granularity_align(rs); - char* tbase = (char*)(CALL_MMAP(tsize)); - if (tbase != CMFAIL) { - m = init_user_mstate(tbase, tsize); - m->seg.sflags = USE_MMAP_BIT; - set_lock(m, locked); - } - } - return (mspace)m; -} - -mspace create_mspace_with_base(void* base, size_t capacity, int locked) { - mstate m = 0; - size_t msize; - ensure_initialization(); - msize = pad_request(sizeof(struct malloc_state)); - if (capacity > msize + TOP_FOOT_SIZE && - capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { - m = init_user_mstate((char*)base, capacity); - m->seg.sflags = EXTERN_BIT; - set_lock(m, locked); - } - return (mspace)m; -} - -mspace create_device_mspace_with_base(void* base, size_t capacity, int locked) { - mstate m = 0; - size_t msize; - //ensure_initialization(); - msize = pad_request(sizeof(struct malloc_state)); - void* host_base; - host_base = malloc(msize + TOP_FOOT_SIZE + mparams.page_size); - if (capacity > msize + TOP_FOOT_SIZE && - capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { - m = init_device_mstate((char*)host_base, (char*)base, capacity); - m->seg.sflags = EXTERN_BIT; - set_lock(m, locked); - } - return (mspace)m; -} - - -int mspace_track_large_chunks(mspace msp, int enable) { - int ret = 0; - mstate ms = (mstate)msp; - if (!PREACTION(ms)) { - if (!use_mmap(ms)) { - ret = 1; - } - if (!enable) { - enable_mmap(ms); - } else { - disable_mmap(ms); - } - POSTACTION(ms); - } - return ret; -} - -size_t destroy_mspace(mspace msp) { - size_t freed = 0; - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - msegmentptr sp = &ms->seg; - (void)DESTROY_LOCK(&ms->mutex); /* destroy before unmapped */ - while (sp != 0) { - char* base = sp->base; - size_t size = sp->size; - flag_t flag = sp->sflags; - (void)base; /* placate people compiling -Wunused-variable */ - sp = sp->next; - if ((flag & USE_MMAP_BIT) && !(flag & EXTERN_BIT) && - CALL_MUNMAP(base, size) == 0) - freed += size; - } - } - else { - USAGE_ERROR_ACTION(ms,ms); - } - return freed; -} - -/* - mspace versions of routines are near-clones of the global - versions. This is not so nice but better than the alternatives. -*/ - -void* mspace_malloc(mspace msp, size_t bytes) { - mstate ms = (mstate)msp; - if (!ok_magic(ms)) { - USAGE_ERROR_ACTION(ms,ms); - return 0; - } - if (!PREACTION(ms)) { - void* mem; - size_t nb; - if (bytes <= MAX_SMALL_REQUEST) { - bindex_t idx; - binmap_t smallbits; - nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes); - idx = small_index(nb); - smallbits = ms->smallmap >> idx; - - if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. 
*/ - mchunkptr b, p; - idx += ~smallbits & 1; /* Uses next bin if idx empty */ - b = smallbin_at(ms, idx); - p = b->fd; - assert(chunksize(p) == small_index2size(idx)); - unlink_first_small_chunk(ms, b, p, idx); - set_inuse_and_pinuse(ms, p, small_index2size(idx)); - mem = chunk2mem(p); - check_malloced_chunk(ms, mem, nb); - goto postaction; - } - - else if (nb > ms->dvsize) { - if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ - mchunkptr b, p, r; - size_t rsize; - bindex_t i; - binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); - binmap_t leastbit = least_bit(leftbits); - compute_bit2idx(leastbit, i); - b = smallbin_at(ms, i); - p = b->fd; - assert(chunksize(p) == small_index2size(i)); - unlink_first_small_chunk(ms, b, p, i); - rsize = small_index2size(i) - nb; - /* Fit here cannot be remainderless if 4byte sizes */ - if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) - set_inuse_and_pinuse(ms, p, small_index2size(i)); - else { - set_size_and_pinuse_of_inuse_chunk(ms, p, nb); - r = chunk_plus_offset(p, nb); - set_size_and_pinuse_of_free_chunk(r, rsize); - replace_dv(ms, r, rsize); - } - mem = chunk2mem(p); - check_malloced_chunk(ms, mem, nb); - goto postaction; - } - - else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) { - check_malloced_chunk(ms, mem, nb); - goto postaction; - } - } - } - else if (bytes >= MAX_REQUEST) - nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */ - else { - nb = pad_request(bytes); - if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) { - check_malloced_chunk(ms, mem, nb); - goto postaction; - } - } - - if (nb <= ms->dvsize) { - size_t rsize = ms->dvsize - nb; - mchunkptr p = ms->dv; - if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ - mchunkptr r = ms->dv = chunk_plus_offset(p, nb); - ms->dvsize = rsize; - set_size_and_pinuse_of_free_chunk(r, rsize); - set_size_and_pinuse_of_inuse_chunk(ms, p, nb); - } - else { /* exhaust dv */ - size_t dvs = ms->dvsize; - ms->dvsize = 0; - ms->dv = 0; - set_inuse_and_pinuse(ms, p, dvs); - } - mem = chunk2mem(p); - check_malloced_chunk(ms, mem, nb); - goto postaction; - } - - else if (nb < ms->topsize) { /* Split top */ - size_t rsize = ms->topsize -= nb; - mchunkptr p = ms->top; - mchunkptr r = ms->top = chunk_plus_offset(p, nb); - r->head = rsize | PINUSE_BIT; - set_size_and_pinuse_of_inuse_chunk(ms, p, nb); - mem = chunk2mem(p); - check_top_chunk(ms, ms->top); - check_malloced_chunk(ms, mem, nb); - goto postaction; - } - - mem = sys_alloc(ms, nb); - - postaction: - POSTACTION(ms); - return mem; - } - - return 0; -} - -void mspace_free(mspace msp, void* mem) { - if (mem != 0) { - mchunkptr p = mem2chunk(mem); -#if FOOTERS - mstate fm = get_mstate_for(p); - (void)msp; /* placate people compiling -Wunused */ -#else /* FOOTERS */ - mstate fm = (mstate)msp; -#endif /* FOOTERS */ - if (!ok_magic(fm)) { - USAGE_ERROR_ACTION(fm, p); - return; - } - if (!PREACTION(fm)) { - check_inuse_chunk(fm, p); - if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) { - size_t psize = chunksize(p); - mchunkptr next = chunk_plus_offset(p, psize); - if (!pinuse(p)) { - size_t prevsize = p->prev_foot; - if (is_mmapped(p)) { - psize += prevsize + MMAP_FOOT_PAD; - if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) - fm->footprint -= psize; - goto postaction; - } - else { - mchunkptr prev = chunk_minus_offset(p, prevsize); - psize += prevsize; - p = prev; - if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */ - if (p != fm->dv) { - unlink_chunk(fm, p, prevsize); - } - else if 
((next->head & INUSE_BITS) == INUSE_BITS) { - fm->dvsize = psize; - set_free_with_pinuse(p, psize, next); - goto postaction; - } - } - else - goto erroraction; - } - } - - if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) { - if (!cinuse(next)) { /* consolidate forward */ - if (next == fm->top) { - size_t tsize = fm->topsize += psize; - fm->top = p; - p->head = tsize | PINUSE_BIT; - if (p == fm->dv) { - fm->dv = 0; - fm->dvsize = 0; - } - if (should_trim(fm, tsize)) - sys_trim(fm, 0); - goto postaction; - } - else if (next == fm->dv) { - size_t dsize = fm->dvsize += psize; - fm->dv = p; - set_size_and_pinuse_of_free_chunk(p, dsize); - goto postaction; - } - else { - size_t nsize = chunksize(next); - psize += nsize; - unlink_chunk(fm, next, nsize); - set_size_and_pinuse_of_free_chunk(p, psize); - if (p == fm->dv) { - fm->dvsize = psize; - goto postaction; - } - } - } - else - set_free_with_pinuse(p, psize, next); - - if (is_small(psize)) { - insert_small_chunk(fm, p, psize); - check_free_chunk(fm, p); - } - else { - tchunkptr tp = (tchunkptr)p; - insert_large_chunk(fm, tp, psize); - check_free_chunk(fm, p); - if (--fm->release_checks == 0) - release_unused_segments(fm); - } - goto postaction; - } - } - erroraction: - USAGE_ERROR_ACTION(fm, p); - postaction: - POSTACTION(fm); - } - } -} - -void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) { - void* mem; - size_t req = 0; - mstate ms = (mstate)msp; - if (!ok_magic(ms)) { - USAGE_ERROR_ACTION(ms,ms); - return 0; - } - if (n_elements != 0) { - req = n_elements * elem_size; - if (((n_elements | elem_size) & ~(size_t)0xffff) && - (req / n_elements != elem_size)) - req = MAX_SIZE_T; /* force downstream failure on overflow */ - } - mem = internal_malloc(ms, req); - if (mem != 0 && calloc_must_clear(mem2chunk(mem))) - memset(mem, 0, req); - return mem; -} - -void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) { - void* mem = 0; - if (oldmem == 0) { - mem = mspace_malloc(msp, bytes); - } - else if (bytes >= MAX_REQUEST) { - MALLOC_FAILURE_ACTION; - } -#ifdef REALLOC_ZERO_BYTES_FREES - else if (bytes == 0) { - mspace_free(msp, oldmem); - } -#endif /* REALLOC_ZERO_BYTES_FREES */ - else { - size_t nb = request2size(bytes); - mchunkptr oldp = mem2chunk(oldmem); -#if ! FOOTERS - mstate m = (mstate)msp; -#else /* FOOTERS */ - mstate m = get_mstate_for(oldp); - if (!ok_magic(m)) { - USAGE_ERROR_ACTION(m, oldmem); - return 0; - } -#endif /* FOOTERS */ - if (!PREACTION(m)) { - mchunkptr newp = try_realloc_chunk(m, oldp, nb, 1); - POSTACTION(m); - if (newp != 0) { - check_inuse_chunk(m, newp); - mem = chunk2mem(newp); - } - else { - mem = mspace_malloc(m, bytes); - if (mem != 0) { - size_t oc = chunksize(oldp) - overhead_for(oldp); - memcpy(mem, oldmem, (oc < bytes)? oc : bytes); - mspace_free(m, oldmem); - } - } - } - } - return mem; -} - -void* mspace_realloc_in_place(mspace msp, void* oldmem, size_t bytes) { - void* mem = 0; - if (oldmem != 0) { - if (bytes >= MAX_REQUEST) { - MALLOC_FAILURE_ACTION; - } - else { - size_t nb = request2size(bytes); - mchunkptr oldp = mem2chunk(oldmem); -#if ! 
FOOTERS - mstate m = (mstate)msp; -#else /* FOOTERS */ - mstate m = get_mstate_for(oldp); - (void)msp; /* placate people compiling -Wunused */ - if (!ok_magic(m)) { - USAGE_ERROR_ACTION(m, oldmem); - return 0; - } -#endif /* FOOTERS */ - if (!PREACTION(m)) { - mchunkptr newp = try_realloc_chunk(m, oldp, nb, 0); - POSTACTION(m); - if (newp == oldp) { - check_inuse_chunk(m, newp); - mem = oldmem; - } - } - } - } - return mem; -} - -void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) { - mstate ms = (mstate)msp; - if (!ok_magic(ms)) { - USAGE_ERROR_ACTION(ms,ms); - return 0; - } - if (alignment <= MALLOC_ALIGNMENT) - return mspace_malloc(msp, bytes); - return internal_memalign(ms, alignment, bytes); -} - -void** mspace_independent_calloc(mspace msp, size_t n_elements, - size_t elem_size, void* chunks[]) { - size_t sz = elem_size; /* serves as 1-element array */ - mstate ms = (mstate)msp; - if (!ok_magic(ms)) { - USAGE_ERROR_ACTION(ms,ms); - return 0; - } - return ialloc(ms, n_elements, &sz, 3, chunks); -} - -void** mspace_independent_comalloc(mspace msp, size_t n_elements, - size_t sizes[], void* chunks[]) { - mstate ms = (mstate)msp; - if (!ok_magic(ms)) { - USAGE_ERROR_ACTION(ms,ms); - return 0; - } - return ialloc(ms, n_elements, sizes, 0, chunks); -} - -size_t mspace_bulk_free(mspace msp, void* array[], size_t nelem) { - return internal_bulk_free((mstate)msp, array, nelem); -} - -#if MALLOC_INSPECT_ALL -void mspace_inspect_all(mspace msp, - void(*handler)(void *start, - void *end, - size_t used_bytes, - void* callback_arg), - void* arg) { - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - if (!PREACTION(ms)) { - internal_inspect_all(ms, handler, arg); - POSTACTION(ms); - } - } - else { - USAGE_ERROR_ACTION(ms,ms); - } -} -#endif /* MALLOC_INSPECT_ALL */ - -int mspace_trim(mspace msp, size_t pad) { - int result = 0; - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - if (!PREACTION(ms)) { - result = sys_trim(ms, pad); - POSTACTION(ms); - } - } - else { - USAGE_ERROR_ACTION(ms,ms); - } - return result; -} - -#if !NO_MALLOC_STATS -void mspace_malloc_stats(mspace msp) { - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - internal_malloc_stats(ms); - } - else { - USAGE_ERROR_ACTION(ms,ms); - } -} -#endif /* NO_MALLOC_STATS */ - -size_t mspace_footprint(mspace msp) { - size_t result = 0; - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - result = ms->footprint; - } - else { - USAGE_ERROR_ACTION(ms,ms); - } - return result; -} - -size_t mspace_max_footprint(mspace msp) { - size_t result = 0; - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - result = ms->max_footprint; - } - else { - USAGE_ERROR_ACTION(ms,ms); - } - return result; -} - -size_t mspace_footprint_limit(mspace msp) { - size_t result = 0; - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - size_t maf = ms->footprint_limit; - result = (maf == 0) ? 
MAX_SIZE_T : maf; - } - else { - USAGE_ERROR_ACTION(ms,ms); - } - return result; -} - -size_t mspace_set_footprint_limit(mspace msp, size_t bytes) { - size_t result = 0; - mstate ms = (mstate)msp; - if (ok_magic(ms)) { - if (bytes == 0) - result = granularity_align(1); /* Use minimal size */ - else if (bytes == MAX_SIZE_T) - result = 0; /* disable */ - else - result = granularity_align(bytes); - ms->footprint_limit = result; - } - else { - USAGE_ERROR_ACTION(ms,ms); - } - return result; -} - -#if !NO_MALLINFO -struct mallinfo mspace_mallinfo(mspace msp) { - mstate ms = (mstate)msp; - if (!ok_magic(ms)) { - USAGE_ERROR_ACTION(ms,ms); - } - return internal_mallinfo(ms); -} -#endif /* NO_MALLINFO */ - -size_t mspace_usable_size(const void* mem) { - if (mem != 0) { - mchunkptr p = mem2chunk(mem); - if (is_inuse(p)) - return chunksize(p) - overhead_for(p); - } - return 0; -} - -int mspace_mallopt(int param_number, int value) { - return change_mparam(param_number, value); -} - -#endif /* MSPACES */ - - -/* -------------------- Alternative MORECORE functions ------------------- */ - -/* - Guidelines for creating a custom version of MORECORE: - - * For best performance, MORECORE should allocate in multiples of pagesize. - * MORECORE may allocate more memory than requested. (Or even less, - but this will usually result in a malloc failure.) - * MORECORE must not allocate memory when given argument zero, but - instead return one past the end address of memory from previous - nonzero call. - * For best performance, consecutive calls to MORECORE with positive - arguments should return increasing addresses, indicating that - space has been contiguously extended. - * Even though consecutive calls to MORECORE need not return contiguous - addresses, it must be OK for malloc'ed chunks to span multiple - regions in those cases where they do happen to be contiguous. - * MORECORE need not handle negative arguments -- it may instead - just return MFAIL when given negative arguments. - Negative arguments are always multiples of pagesize. MORECORE - must not misinterpret negative args as large positive unsigned - args. You can suppress all such calls from even occurring by defining - MORECORE_CANNOT_TRIM. - - As an example alternative MORECORE, here is a custom allocator - kindly contributed for pre-OSX macOS. It uses virtually but not - necessarily physically contiguous non-paged memory (locked in, - present and won't get swapped out). You can use it by uncommenting - this section, adding some #includes, and setting up the appropriate - defines above: - - #define MORECORE osMoreCore - - There is also a shutdown routine that should somehow be called for - cleanup upon program exit. 
- - #define MAX_POOL_ENTRIES 100 - #define MINIMUM_MORECORE_SIZE (64 * 1024U) - static int next_os_pool; - void *our_os_pools[MAX_POOL_ENTRIES]; - - void *osMoreCore(int size) - { - void *ptr = 0; - static void *sbrk_top = 0; - - if (size > 0) - { - if (size < MINIMUM_MORECORE_SIZE) - size = MINIMUM_MORECORE_SIZE; - if (CurrentExecutionLevel() == kTaskLevel) - ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0); - if (ptr == 0) - { - return (void *) MFAIL; - } - // save ptrs so they can be freed during cleanup - our_os_pools[next_os_pool] = ptr; - next_os_pool++; - ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK); - sbrk_top = (char *) ptr + size; - return ptr; - } - else if (size < 0) - { - // we don't currently support shrink behavior - return (void *) MFAIL; - } - else - { - return sbrk_top; - } - } - - // cleanup any allocated memory pools - // called as last thing before shutting down driver - - void osCleanupMem(void) - { - void **ptr; - - for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++) - if (*ptr) - { - PoolDeallocate(*ptr); - *ptr = 0; - } - } - -*/ - - -/* ----------------------------------------------------------------------- -History: - v2.8.6 Wed Aug 29 06:57:58 2012 Doug Lea - * fix bad comparison in dlposix_memalign - * don't reuse adjusted asize in sys_alloc - * add LOCK_AT_FORK -- thanks to Kirill Artamonov for the suggestion - * reduce compiler warnings -- thanks to all who reported/suggested these - - v2.8.5 Sun May 22 10:26:02 2011 Doug Lea (dl at gee) - * Always perform unlink checks unless INSECURE - * Add posix_memalign. - * Improve realloc to expand in more cases; expose realloc_in_place. - Thanks to Peter Buhr for the suggestion. - * Add footprint_limit, inspect_all, bulk_free. Thanks - to Barry Hayes and others for the suggestions. - * Internal refactorings to avoid calls while holding locks - * Use non-reentrant locks by default. Thanks to Roland McGrath - for the suggestion. - * Small fixes to mspace_destroy, reset_on_error. - * Various configuration extensions/changes. Thanks - to all who contributed these. - - V2.8.4a Thu Apr 28 14:39:43 2011 (dl at gee.cs.oswego.edu) - * Update Creative Commons URL - - V2.8.4 Wed May 27 09:56:23 2009 Doug Lea (dl at gee) - * Use zeros instead of prev foot for is_mmapped - * Add mspace_track_large_chunks; thanks to Jean Brouwers - * Fix set_inuse in internal_realloc; thanks to Jean Brouwers - * Fix insufficient sys_alloc padding when using 16byte alignment - * Fix bad error check in mspace_footprint - * Adaptations for ptmalloc; thanks to Wolfram Gloger. - * Reentrant spin locks; thanks to Earl Chew and others - * Win32 improvements; thanks to Niall Douglas and Earl Chew - * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options - * Extension hook in malloc_state - * Various small adjustments to reduce warnings on some compilers - * Various configuration extensions/changes for more platforms. Thanks - to all who contributed these. - - V2.8.3 Thu Sep 22 11:16:32 2005 Doug Lea (dl at gee) - * Add max_footprint functions - * Ensure all appropriate literals are size_t - * Fix conditional compilation problem for some #define settings - * Avoid concatenating segments with the one provided - in create_mspace_with_base - * Rename some variables to avoid compiler shadowing warnings - * Use explicit lock initialization. - * Better handling of sbrk interference. 
- * Simplify and fix segment insertion, trimming and mspace_destroy - * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x - * Thanks especially to Dennis Flanagan for help on these. - - V2.8.2 Sun Jun 12 16:01:10 2005 Doug Lea (dl at gee) - * Fix memalign brace error. - - V2.8.1 Wed Jun 8 16:11:46 2005 Doug Lea (dl at gee) - * Fix improper #endif nesting in C++ - * Add explicit casts needed for C++ - - V2.8.0 Mon May 30 14:09:02 2005 Doug Lea (dl at gee) - * Use trees for large bins - * Support mspaces - * Use segments to unify sbrk-based and mmap-based system allocation, - removing need for emulation on most platforms without sbrk. - * Default safety checks - * Optional footer checks. Thanks to William Robertson for the idea. - * Internal code refactoring - * Incorporate suggestions and platform-specific changes. - Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas, - Aaron Bachmann, Emery Berger, and others. - * Speed up non-fastbin processing enough to remove fastbins. - * Remove useless cfree() to avoid conflicts with other apps. - * Remove internal memcpy, memset. Compilers handle builtins better. - * Remove some options that no one ever used and rename others. - - V2.7.2 Sat Aug 17 09:07:30 2002 Doug Lea (dl at gee) - * Fix malloc_state bitmap array misdeclaration - - V2.7.1 Thu Jul 25 10:58:03 2002 Doug Lea (dl at gee) - * Allow tuning of FIRST_SORTED_BIN_SIZE - * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte. - * Better detection and support for non-contiguousness of MORECORE. - Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger - * Bypass most of malloc if no frees. Thanks to Emery Berger. - * Fix freeing of old top non-contiguous chunk in sysmalloc. - * Raised default trim and map thresholds to 256K. - * Fix mmap-related #defines. Thanks to Lubos Lunak. - * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield. - * Branch-free bin calculation - * Default trim and mmap thresholds now 256K. - - V2.7.0 Sun Mar 11 14:14:06 2001 Doug Lea (dl at gee) - * Introduce independent_comalloc and independent_calloc. - Thanks to Michael Pachos for motivation and help. - * Make optional .h file available - * Allow > 2GB requests on 32bit systems. - * new WIN32 sbrk, mmap, munmap, lock code from . - Thanks also to Andreas Mueller , - and Anonymous. - * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for - helping test this.) - * memalign: check alignment arg - * realloc: don't try to shift chunks backwards, since this - leads to more fragmentation in some programs and doesn't - seem to help in any others. - * Collect all cases in malloc requiring system memory into sysmalloc - * Use mmap as backup to sbrk - * Place all internal state in malloc_state - * Introduce fastbins (although similar to 2.5.1) - * Many minor tunings and cosmetic improvements - * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK - * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS - Thanks to Tony E. Bennett and others. - * Include errno.h to support default failure action. - - V2.6.6 Sun Dec 5 07:42:19 1999 Doug Lea (dl at gee) - * return null for negative arguments - * Added several WIN32 cleanups from Martin C. Fong - * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h' - (e.g. 
WIN32 platforms) - * Cleanup header file inclusion for WIN32 platforms - * Cleanup code to avoid Microsoft Visual C++ compiler complaints - * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing - memory allocation routines - * Set 'malloc_getpagesize' for WIN32 platforms (needs more work) - * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to - usage of 'assert' in non-WIN32 code - * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to - avoid infinite loop - * Always call 'fREe()' rather than 'free()' - - V2.6.5 Wed Jun 17 15:57:31 1998 Doug Lea (dl at gee) - * Fixed ordering problem with boundary-stamping - - V2.6.3 Sun May 19 08:17:58 1996 Doug Lea (dl at gee) - * Added pvalloc, as recommended by H.J. Liu - * Added 64bit pointer support mainly from Wolfram Gloger - * Added anonymously donated WIN32 sbrk emulation - * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen - * malloc_extend_top: fix mask error that caused wastage after - foreign sbrks - * Add linux mremap support code from HJ Liu - - V2.6.2 Tue Dec 5 06:52:55 1995 Doug Lea (dl at gee) - * Integrated most documentation with the code. - * Add support for mmap, with help from - Wolfram Gloger (Gloger@lrz.uni-muenchen.de). - * Use last_remainder in more cases. - * Pack bins using idea from colin@nyx10.cs.du.edu - * Use ordered bins instead of best-fit threshold - * Eliminate block-local decls to simplify tracing and debugging. - * Support another case of realloc via move into top - * Fix error occurring when initial sbrk_base not word-aligned. - * Rely on page size for units instead of SBRK_UNIT to - avoid surprises about sbrk alignment conventions. - * Add mallinfo, mallopt. Thanks to Raymond Nijssen - (raymond@es.ele.tue.nl) for the suggestion. - * Add `pad' argument to malloc_trim and top_pad mallopt parameter. - * More precautions for cases where other routines call sbrk, - courtesy of Wolfram Gloger (Gloger@lrz.uni-muenchen.de). - * Added macros etc., allowing use in linux libc from - H.J. Lu (hjl@gnu.ai.mit.edu) - * Inverted this history list - - V2.6.1 Sat Dec 2 14:10:57 1995 Doug Lea (dl at gee) - * Re-tuned and fixed to behave more nicely with V2.6.0 changes. - * Removed all preallocation code since under current scheme - the work required to undo bad preallocations exceeds - the work saved in good cases for most test programs. - * No longer use return list or unconsolidated bins since - no scheme using them consistently outperforms those that don't - given above changes. - * Use best fit for very large chunks to prevent some worst-cases. - * Added some support for debugging - - V2.6.0 Sat Nov 4 07:05:23 1995 Doug Lea (dl at gee) - * Removed footers when chunks are in use. Thanks to - Paul Wilson (wilson@cs.texas.edu) for the suggestion. - - V2.5.4 Wed Nov 1 07:54:51 1995 Doug Lea (dl at gee) - * Added malloc_trim, with help from Wolfram Gloger - (wmglo@Dent.MED.Uni-Muenchen.DE). 
- - V2.5.3 Tue Apr 26 10:16:01 1994 Doug Lea (dl at g) - - V2.5.2 Tue Apr 5 16:20:40 1994 Doug Lea (dl at g) - * realloc: try to expand in both directions - * malloc: swap order of clean-bin strategy; - * realloc: only conditionally expand backwards - * Try not to scavenge used bins - * Use bin counts as a guide to preallocation - * Occasionally bin return list chunks in first scan - * Add a few optimizations from colin@nyx10.cs.du.edu - - V2.5.1 Sat Aug 14 15:40:43 1993 Doug Lea (dl at g) - * faster bin computation & slightly different binning - * merged all consolidations to one part of malloc proper - (eliminating old malloc_find_space & malloc_clean_bin) - * Scan 2 returns chunks (not just 1) - * Propagate failure in realloc if malloc returns 0 - * Add stuff to allow compilation on non-ANSI compilers - from kpv@research.att.com - - V2.5 Sat Aug 7 07:41:59 1993 Doug Lea (dl at g.oswego.edu) - * removed potential for odd address access in prev_chunk - * removed dependency on getpagesize.h - * misc cosmetics and a bit more internal documentation - * anticosmetics: mangled names in macros to evade debugger strangeness - * tested on sparc, hp-700, dec-mips, rs6000 - with gcc & native cc (hp, dec only) allowing - Detlefs & Zorn comparison study (in SIGPLAN Notices.) - - Trial version Fri Aug 28 13:14:29 1992 Doug Lea (dl at g.oswego.edu) - * Based loosely on libg++-1.2X malloc. (It retains some of the overall - structure of old version, but most details differ.) - -*/ diff --git a/Src/AmrTask/rts_impls/Utils/dl_malloc.h b/Src/AmrTask/rts_impls/Utils/dl_malloc.h deleted file mode 100644 index 5c047ced436..00000000000 --- a/Src/AmrTask/rts_impls/Utils/dl_malloc.h +++ /dev/null @@ -1,622 +0,0 @@ -/* - Default header file for malloc-2.8.x, written by Doug Lea - and released to the public domain, as explained at - http://creativecommons.org/publicdomain/zero/1.0/ - - This header is for ANSI C/C++ only. You can set any of - the following #defines before including: - - * If USE_DL_PREFIX is defined, it is assumed that malloc.c - was also compiled with this option, so all routines - have names starting with "dl". - - * If HAVE_USR_INCLUDE_MALLOC_H is defined, it is assumed that this - file will be #included AFTER <malloc.h>. This is needed only if - your system defines a struct mallinfo that is incompatible with the - standard one declared here. Otherwise, you can include this file - INSTEAD of your system <malloc.h>. At least on ANSI, all - declarations should be compatible with system versions - - * If MSPACES is defined, declarations for mspace versions are included. -*/ - -#ifndef MALLOC_280_H -#define MALLOC_280_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include <stddef.h> /* for size_t */ - -#ifndef ONLY_MSPACES -#define ONLY_MSPACES 0 /* define to a value */ -#elif ONLY_MSPACES != 0 -#define ONLY_MSPACES 1 -#endif /* ONLY_MSPACES */ -#ifndef NO_MALLINFO -#define NO_MALLINFO 0 -#endif /* NO_MALLINFO */ - -#ifndef MSPACES -#if ONLY_MSPACES -#define MSPACES 1 -#else /* ONLY_MSPACES */ -#define MSPACES 0 -#endif /* ONLY_MSPACES */ -#endif /* MSPACES */ - -// YZ: moved mallinfo before "#if !ONLY_MSPACES" because mspace may use mallinfo too! 
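The configuration block above only declares intent; the matching malloc.c must be compiled with identical settings or the prefixed names and the mallinfo layout will not line up. Below is a minimal, hypothetical sketch of how a client translation unit might configure this header; the flag choices and the grab_and_release() helper are illustrative assumptions, not code taken from AMReX:

/* Hypothetical configuration sketch; compile the matching malloc.c with the same flags. */
#define USE_DL_PREFIX        /* keep the dl-prefixed names; the libc malloc stays visible */
#define MSPACES 1            /* also declare the mspace_* API below */
#include "dl_malloc.h"

void* grab_and_release(void) /* illustrative helper, not part of the header */
{
    void* p = dlmalloc(256); /* dl-prefixed because USE_DL_PREFIX is defined */
    if (p != 0)
        dlfree(p);
    return 0;
}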
-#if !NO_MALLINFO -#ifndef HAVE_USR_INCLUDE_MALLOC_H -#ifndef _MALLOC_H -#ifndef MALLINFO_FIELD_TYPE -#define MALLINFO_FIELD_TYPE size_t -#endif /* MALLINFO_FIELD_TYPE */ -#ifndef STRUCT_MALLINFO_DECLARED -#define STRUCT_MALLINFO_DECLARED 1 - struct mallinfo { - MALLINFO_FIELD_TYPE arena; /* non-mmapped space allocated from system */ - MALLINFO_FIELD_TYPE ordblks; /* number of free chunks */ - MALLINFO_FIELD_TYPE smblks; /* always 0 */ - MALLINFO_FIELD_TYPE hblks; /* always 0 */ - MALLINFO_FIELD_TYPE hblkhd; /* space in mmapped regions */ - MALLINFO_FIELD_TYPE usmblks; /* maximum total allocated space */ - MALLINFO_FIELD_TYPE fsmblks; /* always 0 */ - MALLINFO_FIELD_TYPE uordblks; /* total allocated space */ - MALLINFO_FIELD_TYPE fordblks; /* total free space */ - MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */ - }; -#endif /* STRUCT_MALLINFO_DECLARED */ -#endif /* _MALLOC_H */ -#endif /* HAVE_USR_INCLUDE_MALLOC_H */ -#endif /* !NO_MALLINFO */ - -#if !ONLY_MSPACES - -#ifndef USE_DL_PREFIX -#define dlcalloc calloc -#define dlfree free -#define dlmalloc malloc -#define dlmemalign memalign -#define dlposix_memalign posix_memalign -#define dlrealloc realloc -#define dlvalloc valloc -#define dlpvalloc pvalloc -#define dlmallinfo mallinfo -#define dlmallopt mallopt -#define dlmalloc_trim malloc_trim -#define dlmalloc_stats malloc_stats -#define dlmalloc_usable_size malloc_usable_size -#define dlmalloc_footprint malloc_footprint -#define dlmalloc_max_footprint malloc_max_footprint -#define dlmalloc_footprint_limit malloc_footprint_limit -#define dlmalloc_set_footprint_limit malloc_set_footprint_limit -#define dlmalloc_inspect_all malloc_inspect_all -#define dlindependent_calloc independent_calloc -#define dlindependent_comalloc independent_comalloc -#define dlbulk_free bulk_free -#endif /* USE_DL_PREFIX */ - -/* - malloc(size_t n) - Returns a pointer to a newly allocated chunk of at least n bytes, or - null if no space is available, in which case errno is set to ENOMEM - on ANSI C systems. - - If n is zero, malloc returns a minimum-sized chunk. (The minimum - size is 16 bytes on most 32bit systems, and 32 bytes on 64bit - systems.) Note that size_t is an unsigned type, so calls with - arguments that would be negative if signed are interpreted as - requests for huge amounts of space, which will often fail. The - maximum supported value of n differs across systems, but is in all - cases less than the maximum representable value of a size_t. -*/ -void* dlmalloc(size_t); - -/* - free(void* p) - Releases the chunk of memory pointed to by p, that had been previously - allocated using malloc or a related routine such as realloc. - It has no effect if p is null. If p was not malloced or already - freed, free(p) will by default cause the current program to abort. -*/ -void dlfree(void*); - -/* - calloc(size_t n_elements, size_t element_size); - Returns a pointer to n_elements * element_size bytes, with all locations - set to zero. -*/ -void* dlcalloc(size_t, size_t); - -/* - realloc(void* p, size_t n) - Returns a pointer to a chunk of size n that contains the same data - as does chunk p up to the minimum of (n, p's size) bytes, or null - if no space is available. - - The returned pointer may or may not be the same as p. The algorithm - prefers extending p in most cases when possible, otherwise it - employs the equivalent of a malloc-copy-free sequence. - - If p is null, realloc is equivalent to malloc. 
- - If space is not available, realloc returns null, errno is set (if on - ANSI) and p is NOT freed. - - if n is for fewer bytes than already held by p, the newly unused - space is lopped off and freed if possible. realloc with a size - argument of zero (re)allocates a minimum-sized chunk. - - The old unix realloc convention of allowing the last-free'd chunk - to be used as an argument to realloc is not supported. -*/ -void* dlrealloc(void*, size_t); - -/* - realloc_in_place(void* p, size_t n) - Resizes the space allocated for p to size n, only if this can be - done without moving p (i.e., only if there is adjacent space - available if n is greater than p's current allocated size, or n is - less than or equal to p's size). This may be used instead of plain - realloc if an alternative allocation strategy is needed upon failure - to expand space; for example, reallocation of a buffer that must be - memory-aligned or cleared. You can use realloc_in_place to trigger - these alternatives only when needed. - - Returns p if successful; otherwise null. -*/ -void* dlrealloc_in_place(void*, size_t); - -/* - memalign(size_t alignment, size_t n); - Returns a pointer to a newly allocated chunk of n bytes, aligned - in accord with the alignment argument. - - The alignment argument should be a power of two. If the argument is - not a power of two, the nearest greater power is used. - 8-byte alignment is guaranteed by normal malloc calls, so don't - bother calling memalign with an argument of 8 or less. - - Overreliance on memalign is a sure way to fragment space. -*/ -void* dlmemalign(size_t, size_t); - -/* - int posix_memalign(void** pp, size_t alignment, size_t n); - Allocates a chunk of n bytes, aligned in accord with the alignment - argument. Differs from memalign only in that it (1) assigns the - allocated memory to *pp rather than returning it, (2) fails and - returns EINVAL if the alignment is not a power of two, (3) fails and - returns ENOMEM if memory cannot be allocated. -*/ -int dlposix_memalign(void**, size_t, size_t); - -/* - valloc(size_t n); - Equivalent to memalign(pagesize, n), where pagesize is the page - size of the system. If the pagesize is unknown, 4096 is used. -*/ -void* dlvalloc(size_t); - -/* - mallopt(int parameter_number, int parameter_value) - Sets tunable parameters. The format is to provide a - (parameter-number, parameter-value) pair. mallopt then sets the - corresponding parameter to the argument value if it can (i.e., so - long as the value is meaningful), and returns 1 if successful else - 0. SVID/XPG/ANSI defines four standard param numbers for mallopt, - normally defined in malloc.h. None of these are used in this malloc, - so setting them has no effect. But this malloc also supports other - options in mallopt: - - Symbol param # default allowed param values - M_TRIM_THRESHOLD -1 2*1024*1024 any (-1U disables trimming) - M_GRANULARITY -2 page size any power of 2 >= page size - M_MMAP_THRESHOLD -3 256*1024 any (or 0 if no MMAP support) -*/ -int dlmallopt(int, int); - -#define M_TRIM_THRESHOLD (-1) -#define M_GRANULARITY (-2) -#define M_MMAP_THRESHOLD (-3) - - -/* - malloc_footprint(); - Returns the number of bytes obtained from the system. The total - number of bytes allocated by malloc, realloc etc., is less than this - value. Unlike mallinfo, this function returns only a precomputed - result, so can be called frequently to monitor memory consumption. - Even if locks are otherwise defined, this function does not use them, - so results might not be up to date. 
-*/ -size_t dlmalloc_footprint(void); - -/* - malloc_max_footprint(); - Returns the maximum number of bytes obtained from the system. This - value will be greater than current footprint if deallocated space - has been reclaimed by the system. The peak number of bytes allocated - by malloc, realloc etc., is less than this value. Unlike mallinfo, - this function returns only a precomputed result, so can be called - frequently to monitor memory consumption. Even if locks are - otherwise defined, this function does not use them, so results might - not be up to date. -*/ -size_t dlmalloc_max_footprint(void); - -/* - malloc_footprint_limit(); - Returns the number of bytes that the heap is allowed to obtain from - the system, returning the last value returned by - malloc_set_footprint_limit, or the maximum size_t value if - never set. The returned value reflects a permission. There is no - guarantee that this number of bytes can actually be obtained from - the system. -*/ -size_t dlmalloc_footprint_limit(void); - -/* - malloc_set_footprint_limit(); - Sets the maximum number of bytes to obtain from the system, causing - failure returns from malloc and related functions upon attempts to - exceed this value. The argument value may be subject to page - rounding to an enforceable limit; this actual value is returned. - Using an argument of the maximum possible size_t effectively - disables checks. If the argument is less than or equal to the - current malloc_footprint, then all future allocations that require - additional system memory will fail. However, invocation cannot - retroactively deallocate existing used memory. -*/ -size_t dlmalloc_set_footprint_limit(size_t bytes); - -/* - malloc_inspect_all(void(*handler)(void *start, - void *end, - size_t used_bytes, - void* callback_arg), - void* arg); - Traverses the heap and calls the given handler for each managed - region, skipping all bytes that are (or may be) used for bookkeeping - purposes. Traversal does not include chunks that have been - directly memory mapped. Each reported region begins at the start - address, and continues up to but not including the end address. The - first used_bytes of the region contain allocated data. If - used_bytes is zero, the region is unallocated. The handler is - invoked with the given callback argument. If locks are defined, they - are held during the entire traversal. It is a bad idea to invoke - other malloc functions from within the handler. - - For example, to count the number of in-use chunks with size greater - than 1000, you could write: - static int count = 0; - void count_chunks(void* start, void* end, size_t used, void* arg) { - if (used >= 1000) ++count; - } - then: - malloc_inspect_all(count_chunks, NULL); - - malloc_inspect_all is compiled only if MALLOC_INSPECT_ALL is defined. -*/ -void dlmalloc_inspect_all(void(*handler)(void*, void *, size_t, void*), - void* arg); - -#if !NO_MALLINFO -/* - mallinfo() - Returns (by copy) a struct containing various summary statistics: - - arena: current total non-mmapped bytes allocated from system - ordblks: the number of free chunks - smblks: always zero. - hblks: current number of mmapped regions - hblkhd: total bytes held in mmapped regions - usmblks: the maximum total allocated space. This will be greater - than current total if trimming has occurred. 
- fsmblks: always zero - uordblks: current total allocated space (normal or mmapped) - fordblks: total free space - keepcost: the maximum number of bytes that could ideally be released - back to system via malloc_trim. ("ideally" means that - it ignores page restrictions etc.) - - Because these fields are ints, but internal bookkeeping may - be kept as longs, the reported values may wrap around zero and - thus be inaccurate. -*/ - -struct mallinfo dlmallinfo(void); -#endif /* NO_MALLINFO */ - -/* - independent_calloc(size_t n_elements, size_t element_size, void* chunks[]); - - independent_calloc is similar to calloc, but instead of returning a - single cleared space, it returns an array of pointers to n_elements - independent elements that can hold contents of size elem_size, each - of which starts out cleared, and can be independently freed, - realloc'ed etc. The elements are guaranteed to be adjacently - allocated (this is not guaranteed to occur with multiple callocs or - mallocs), which may also improve cache locality in some - applications. - - The "chunks" argument is optional (i.e., may be null, which is - probably the most typical usage). If it is null, the returned array - is itself dynamically allocated and should also be freed when it is - no longer needed. Otherwise, the chunks array must be of at least - n_elements in length. It is filled in with the pointers to the - chunks. - - In either case, independent_calloc returns this pointer array, or - null if the allocation failed. If n_elements is zero and "chunks" - is null, it returns a chunk representing an array with zero elements - (which should be freed if not wanted). - - Each element must be freed when it is no longer needed. This can be - done all at once using bulk_free. - - independent_calloc simplifies and speeds up implementations of many - kinds of pools. It may also be useful when constructing large data - structures that initially have a fixed number of fixed-sized nodes, - but the number is not known at compile time, and some of the nodes - may later need to be freed. For example: - - struct Node { int item; struct Node* next; }; - - struct Node* build_list() { - struct Node** pool; - int n = read_number_of_nodes_needed(); - if (n <= 0) return 0; - pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0)); - if (pool == 0) die(); - // organize into a linked list... - struct Node* first = pool[0]; - for (int i = 0; i < n-1; ++i) - pool[i]->next = pool[i+1]; - free(pool); // Can now free the array (or not, if it is needed later) - return first; - } -*/ -void** dlindependent_calloc(size_t, size_t, void**); - -/* - independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]); - - independent_comalloc allocates, all at once, a set of n_elements - chunks with sizes indicated in the "sizes" array. It returns - an array of pointers to these elements, each of which can be - independently freed, realloc'ed etc. The elements are guaranteed to - be adjacently allocated (this is not guaranteed to occur with - multiple callocs or mallocs), which may also improve cache locality - in some applications. - - The "chunks" argument is optional (i.e., may be null). If it is null - the returned array is itself dynamically allocated and should also - be freed when it is no longer needed. Otherwise, the chunks array - must be of at least n_elements in length. It is filled in with the - pointers to the chunks. - - In either case, independent_comalloc returns this pointer array, or - null if the allocation failed. 
If n_elements is zero and chunks is - null, it returns a chunk representing an array with zero elements - (which should be freed if not wanted). - - Each element must be freed when it is no longer needed. This can be - done all at once using bulk_free. - - independent_comalloc differs from independent_calloc in that each - element may have a different size, and also that it does not - automatically clear elements. - - independent_comalloc can be used to speed up allocation in cases - where several structs or objects must always be allocated at the - same time. For example: - - struct Head { ... }; - struct Foot { ... }; - - void send_message(char* msg) { - int msglen = strlen(msg); - size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) }; - void* chunks[3]; - if (independent_comalloc(3, sizes, chunks) == 0) - die(); - struct Head* head = (struct Head*)(chunks[0]); - char* body = (char*)(chunks[1]); - struct Foot* foot = (struct Foot*)(chunks[2]); - // ... - } - - In general though, independent_comalloc is worth using only for - larger values of n_elements. For small values, you probably won't - detect enough difference from series of malloc calls to bother. - - Overuse of independent_comalloc can increase overall memory usage, - since it cannot reuse existing noncontiguous small chunks that - might be available for some of the elements. -*/ -void** dlindependent_comalloc(size_t, size_t*, void**); - -/* - bulk_free(void* array[], size_t n_elements) - Frees and clears (sets to null) each non-null pointer in the given - array. This is likely to be faster than freeing them one-by-one. - If footers are used, pointers that have been allocated in different - mspaces are not freed or cleared, and the count of all such pointers - is returned. For large arrays of pointers with poor locality, it - may be worthwhile to sort this array before calling bulk_free. -*/ -size_t dlbulk_free(void**, size_t n_elements); - -/* - pvalloc(size_t n); - Equivalent to valloc(minimum-page-that-holds(n)), that is, - round up n to nearest pagesize. - */ -void* dlpvalloc(size_t); - -/* - malloc_trim(size_t pad); - - If possible, gives memory back to the system (via negative arguments - to sbrk) if there is unused memory at the `high' end of the malloc - pool or in unused MMAP segments. You can call this after freeing - large blocks of memory to potentially reduce the system-level memory - requirements of a program. However, it cannot guarantee to reduce - memory. Under some allocation patterns, some large free blocks of - memory will be locked between two used chunks, so they cannot be - given back to the system. - - The `pad' argument to malloc_trim represents the amount of free - trailing space to leave untrimmed. If this argument is zero, only - the minimum amount of memory to maintain internal data structures - will be left. Non-zero arguments can be supplied to maintain enough - trailing space to service future expected allocations without having - to re-obtain memory from the system. - - Malloc_trim returns 1 if it actually released any memory, else 0. -*/ -int dlmalloc_trim(size_t); - -/* - malloc_stats(); - Prints on stderr the amount of space obtained from the system (both - via sbrk and mmap), the maximum amount (which may be more than - current if malloc_trim and/or munmap got called), and the current - number of bytes allocated via malloc (or realloc, etc) but not yet - freed. Note that this is the number of bytes allocated, not the - number requested. 
It will be larger than the number requested - because of alignment and bookkeeping overhead. Because it includes - alignment wastage as being in use, this figure may be greater than - zero even when no user-level chunks are allocated. - - The reported current and maximum system memory can be inaccurate if - a program makes other calls to system memory allocation functions - (normally sbrk) outside of malloc. - - malloc_stats prints only the most commonly interesting statistics. - More information can be obtained by calling mallinfo. - - malloc_stats is not compiled if NO_MALLOC_STATS is defined. -*/ -void dlmalloc_stats(void); - -#endif /* !ONLY_MSPACES */ - -/* - malloc_usable_size(void* p); - - Returns the number of bytes you can actually use in - an allocated chunk, which may be more than you requested (although - often not) due to alignment and minimum size constraints. - You can use this many bytes without worrying about - overwriting other allocated objects. This is not a particularly great - programming practice. malloc_usable_size can be more useful in - debugging and assertions, for example: - - p = malloc(n); - assert(malloc_usable_size(p) >= 256); -*/ -size_t dlmalloc_usable_size(const void*); - -#if MSPACES - -/* - mspace is an opaque type representing an independent - region of space that supports mspace_malloc, etc. -*/ -typedef void* mspace; - -/* - create_mspace creates and returns a new independent space with the - given initial capacity, or, if 0, the default granularity size. It - returns null if there is no system memory available to create the - space. If argument locked is non-zero, the space uses a separate - lock to control access. The capacity of the space will grow - dynamically as needed to service mspace_malloc requests. You can - control the sizes of incremental increases of this space by - compiling with a different DEFAULT_GRANULARITY or dynamically - setting with mallopt(M_GRANULARITY, value). -*/ -mspace create_mspace(size_t capacity, int locked); - -/* - destroy_mspace destroys the given space, and attempts to return all - of its memory back to the system, returning the total number of - bytes freed. After destruction, the results of access to all memory - used by the space become undefined. -*/ -size_t destroy_mspace(mspace msp); - -/* - create_mspace_with_base uses the memory supplied as the initial base - of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this - space is used for bookkeeping, so the capacity must be at least this - large. (Otherwise 0 is returned.) When this initial space is - exhausted, additional memory will be obtained from the system. - Destroying this space will deallocate all additionally allocated - space (if possible) but not the initial base. -*/ -mspace create_mspace_with_base(void* base, size_t capacity, int locked); -mspace create_device_mspace_with_base(void* base, size_t capacity, int locked); - -/* - mspace_track_large_chunks controls whether requests for large chunks - are allocated in their own untracked mmapped regions, separate from - others in this mspace. By default large chunks are not tracked, - which reduces fragmentation. However, such chunks are not - necessarily released to the system upon destroy_mspace. Enabling - tracking by setting to true may increase fragmentation, but avoids - leakage when relying on destroy_mspace to release all memory - allocated using this space. The function returns the previous - setting. 
-*/ -int mspace_track_large_chunks(mspace msp, int enable); - -#if !NO_MALLINFO -/* - mspace_mallinfo behaves as mallinfo, but reports properties of - the given space. -*/ -struct mallinfo mspace_mallinfo(mspace msp); -#endif /* NO_MALLINFO */ - -/* - An alias for mallopt. -*/ -int mspace_mallopt(int, int); - -/* - The following operate identically to their malloc counterparts - but operate only for the given mspace argument -*/ -void* mspace_malloc(mspace msp, size_t bytes); -void mspace_free(mspace msp, void* mem); -void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size); -void* mspace_realloc(mspace msp, void* mem, size_t newsize); -void* mspace_realloc_in_place(mspace msp, void* mem, size_t newsize); -void* mspace_memalign(mspace msp, size_t alignment, size_t bytes); -void** mspace_independent_calloc(mspace msp, size_t n_elements, - size_t elem_size, void* chunks[]); -void** mspace_independent_comalloc(mspace msp, size_t n_elements, - size_t sizes[], void* chunks[]); -size_t mspace_bulk_free(mspace msp, void**, size_t n_elements); -size_t mspace_usable_size(const void* mem); -void mspace_malloc_stats(mspace msp); -int mspace_trim(mspace msp, size_t pad); -size_t mspace_footprint(mspace msp); -size_t mspace_max_footprint(mspace msp); -size_t mspace_footprint_limit(mspace msp); -size_t mspace_set_footprint_limit(mspace msp, size_t bytes); -void mspace_inspect_all(mspace msp, - void(*handler)(void *, void *, size_t, void*), - void* arg); -#endif /* MSPACES */ - -#ifdef __cplusplus -}; /* end of extern "C" */ -#endif - -#endif /* MALLOC_280_H */ diff --git a/Src/AmrTask/rts_impls/Utils/sysInfo.C b/Src/AmrTask/rts_impls/Utils/sysInfo.C deleted file mode 100644 index 3f4974f3f5d..00000000000 --- a/Src/AmrTask/rts_impls/Utils/sysInfo.C +++ /dev/null @@ -1,138 +0,0 @@ -//This code was developed in Rambutan to parse machine information - -#include "sysInfo.H" -#include -#include -#include -#include -using namespace std; - - -namespace { - bool read_line_prefix(istream &is, string &prefix) { - int spaces = 0; - - prefix.clear(); - - while(true) { - int c = is.get(); - - if(c == EOF) - return false; - - if(c == '\n') { - prefix.clear(); - spaces = 0; - continue; - } - - if(c == ':') - return true; - - if(c == ' ') - spaces += 1; - else if(c == '\t') - ; - else { - while(spaces > 0) { - prefix.push_back(' '); - spaces -= 1; - } - prefix.push_back((char)c); - } - } - } -} - -NodeHardware query_node_hardware() { - NodeHardware ans; - - struct Entry { - int proc; - int phys; - int core; - int sib_n; - int core_n; - }; - - vector<Entry> procs; - - { // parse /proc/cpuinfo - ifstream f("/proc/cpuinfo", ios::in); - - string pre; - - Entry e; - int n = 5; - - while(read_line_prefix(f, pre)) { - if(pre == "processor") { - assert(n == 5); - f >> e.proc; - } - else if(pre == "physical id") - f >> e.phys; - else if(pre == "siblings") - f >> e.sib_n; - else if(pre == "core id") - f >> e.core; - else if(pre == "cpu cores") - f >> e.core_n; - else - continue; - - n -= 1; - if(n == 0) { - procs.push_back(e); - n = 5; - } - } - } - - // sort procs by "proc id", weird if wasn't already sorted by OS - std::sort(procs.begin(), procs.end(), [](Entry a, Entry b) { return a.proc < b.proc; }); - - // assert OS numbers procs contiguously from zero - bool ok = true; - for(int i=0; i < (int)procs.size(); i++) - ok = ok && procs[i].proc == i; - assert(ok); - - ans.numa_per_node = procs.size() / procs[0].sib_n; - ans.core_per_numa = procs[0].core_n; - ans.thread_per_core = procs[0].sib_n / procs[0].core_n; - - 
if(procs.size() == 1) { - ans.thread_stride = 0; - ans.core_stride = 0; - ans.numa_stride = 0; - } - else { - ans.thread_stride = procs.size(); - for(int i=1; i < (int)procs.size(); i++) { - if(procs[i].core == procs[0].core && procs[i].phys == procs[0].phys) { - ans.thread_stride = i; - break; - } - } - - ans.core_stride = procs.size(); - for(int i=1; i < (int)procs.size(); i++) { - if(procs[i].core != procs[0].core) { - ans.core_stride = i; - break; - } - } - - ans.numa_stride = procs.size(); - for(int i=1; i < (int)procs.size(); i++) { - if(procs[i].phys != procs[0].phys) { - ans.numa_stride = i; - break; - } - } - } - - return ans; -} - diff --git a/Src/AmrTask/rts_impls/Utils/sysInfo.H b/Src/AmrTask/rts_impls/Utils/sysInfo.H deleted file mode 100644 index 19979e7187c..00000000000 --- a/Src/AmrTask/rts_impls/Utils/sysInfo.H +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef NODE_INFO -#define NODE_INFO - -# include -# include -# include -# include -# include - -// Implementation provided machine primitives. - - - struct NodeHardware { - int thread_per_core; - int core_per_numa; - int numa_per_node; - - // cpuid indexing dimension order - int thread_stride; - int core_stride; - int numa_stride; - - int thread_per_numa() const { return thread_per_core*core_per_numa; } - int thread_per_node() const { return thread_per_core*core_per_numa*numa_per_node; } - int core_per_node() const { return core_per_numa*numa_per_node; } - - int numa_of_core(int core) const { return core/core_per_numa; } - int numa_of_thread(int thread) const { return thread/(thread_per_core*core_per_numa); } - - int core_of_thread(int thread) const { return thread/thread_per_core; } - - int cpuid_of(int thread, int core, int numa) const { - return thread*thread_stride + core*core_stride + numa*numa_stride; - } - }; - - - NodeHardware query_node_hardware(); -#endif diff --git a/Src/AmrTask/rts_impls/mpi/Make.package b/Src/AmrTask/rts_impls/mpi/Make.package deleted file mode 100644 index 21973fa39b2..00000000000 --- a/Src/AmrTask/rts_impls/mpi/Make.package +++ /dev/null @@ -1,12 +0,0 @@ -PERILLA_LIB=EXE -COMMON_DIR=$(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common - -C$(PERILLA_LIB)_sources += PackageQueue.cpp Perilla.cpp PerillaRts.cpp - -C$(PERILLA_LIB)_headers += $(COMMON_DIR)/Barrier.H Config.H $(COMMON_DIR)/LocalConnection.H PackageQueue.H $(COMMON_DIR)/RegionGraph.H $(COMMON_DIR)/RGIter.H $(COMMON_DIR)/RegionQueue.H $(COMMON_DIR)/RemoteConnection.H $(COMMON_DIR)/WorkerThread.H $(COMMON_DIR)/AsyncMultiFabUtil.H PerillaRts.H - -include $(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common/perilla.mak -VPATH_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common -INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common -VPATH_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/mpi -INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/mpi diff --git a/Src/AmrTask/rts_impls/mpi/PackageQueue.H b/Src/AmrTask/rts_impls/mpi/PackageQueue.H deleted file mode 100755 index 362f58c5037..00000000000 --- a/Src/AmrTask/rts_impls/mpi/PackageQueue.H +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef P_PACKAGEQUEUE_H -#define P_PACKAGEQUEUE_H - -#include -#include -#include - -class Package -{ -private: - int source, destination; -public: - double *databuf; - pthread_mutex_t packageLock; - volatile int bufSize; - volatile bool completed; //message transfer is done - volatile bool served; //message transfer request has been served but may have not completed - MPI_Request request; //!for MPI - Package(); - ~Package(); - Package(int size); 
- Package(int src, int dest); - Package(int src, int dest, int size); - void setPackageSource(int src); - void setPackageDestination(int dest); - void completeRequest(void); - void completeRequest(bool canAvoidLock); - bool checkRequest(void); - void generatePackage(int size); -}; - -class PackageQueue -{ -private: - Package *buffer[perilla::MSG_QUEUE_DEFAULT_MAXSIZE]; - volatile int n; - volatile int front; - volatile int rear; - volatile int prear; - int max_size; -public: - pthread_mutex_t queueLock; - PackageQueue(); - ~PackageQueue(); - int queueSize(void); - int queueSize(bool canAvoidLock); - void enqueue(Package* package); - void enqueue(Package* package, bool canAvoidLock); - Package* dequeue(void); - Package* dequeue(bool canAvoidLock); - Package* getRear(void); - Package* getRear(bool canAvoidLock); - Package* getFront(void); - Package* getFront(bool canAvoidLock); - void emptyQueue(bool canAvoidLock); -}; -#endif - diff --git a/Src/AmrTask/rts_impls/mpi/PackageQueue.cpp b/Src/AmrTask/rts_impls/mpi/PackageQueue.cpp deleted file mode 100755 index 00fd736b63b..00000000000 --- a/Src/AmrTask/rts_impls/mpi/PackageQueue.cpp +++ /dev/null @@ -1,261 +0,0 @@ -#include -#include -using namespace perilla; -#ifdef PERILLA_DEBUG -#include -extern PerillaMemCheck memcheck; -#endif - -Package::Package() -{ - databuf = 0; - bufSize = 0; - source = 0; - destination = 0; - completed = false; - served = false; - request = MPI_REQUEST_NULL; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::~Package() -{ - if(databuf) delete[] databuf; //databuf is allocated with new double[], so free() would be undefined behavior -#ifdef PERILLA_DEBUG - memcheck.remove(memcheck.genKey(this)); -#endif -} - -Package::Package(int size) -{ - databuf = new double[size]; - bufSize = size; - source = 0; - destination = 0; - completed = false; - served = false; - request = MPI_REQUEST_NULL; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::Package(int src, int dest) -{ - databuf = 0; //no payload buffer yet; keeps the destructor safe - bufSize = 0; - source = src; - destination = dest; - completed = false; - served = false; - request = MPI_REQUEST_NULL; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::Package(int src, int dest, int size) -{ - databuf = new double[size]; - bufSize = size; - source = src; - destination = dest; - completed = false; - served = false; - request = MPI_REQUEST_NULL; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -void Package::setPackageSource(int src) -{ - source = src; -} - -void Package::setPackageDestination(int dest) -{ - destination = dest; -} - -void Package::completeRequest(void) -{ - pthread_mutex_lock(&packageLock); - completed = true; - pthread_mutex_unlock(&packageLock); -} - -void Package::completeRequest(bool canAvoidLock) -{ - if(!canAvoidLock)pthread_mutex_lock(&packageLock); - completed = true; - if(!canAvoidLock)pthread_mutex_unlock(&packageLock); -} - -bool Package::checkRequest(void) -{ - return completed; -} - -void Package::generatePackage(int size) -{ - databuf = new double[size]; - bufSize = size; - source = 0; - destination = 0; - completed = false; - served = false; - request = MPI_REQUEST_NULL; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -}
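Taken together, Package and PackageQueue implement a fixed-capacity, mutex-protected circular FIFO of message buffers; the canAvoidLock overloads let a caller that already holds queueLock skip the extra lock/unlock round trip. The following minimal usage sketch of this deleted API is hypothetical: the demo() function, the rank numbers, and the 128-double payload are assumptions for illustration, and an MPI build is required because Package embeds an MPI_Request.

// Hypothetical usage sketch of the deleted Package/PackageQueue API.
#include <PackageQueue.H>   // assumed include path into the Perilla sources

void demo()
{
    PackageQueue q;                       // empty queue, capacity perilla::MSG_QUEUE_DEFAULT_MAXSIZE
    Package* p = new Package(0, 1, 128);  // src rank 0, dest rank 1, 128-double payload
    q.enqueue(p);                         // locking variant; safe from any thread
    if (q.queueSize() > 0) {
        Package* out = q.dequeue();       // FIFO: the same package comes back out
        out->completeRequest();           // mark the (pretend) transfer finished under packageLock
        delete out;                       // ~Package releases databuf
    }
}                                         // ~PackageQueue drains anything left over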
- -PackageQueue::PackageQueue() -{ - n = 0; - front = 0; - rear = 0; - prear = -1; - queueLock = PTHREAD_MUTEX_INITIALIZER; -} - -int PackageQueue::queueSize(void) -{ - int size; - pthread_mutex_lock(&queueLock); - size = n; - pthread_mutex_unlock(&queueLock); - return size; -} - -int PackageQueue::queueSize(bool canAvoidLock) -{ - int size; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - size = n; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return size; -} - -void PackageQueue::enqueue(Package* package) -{ - pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n==perilla::MSG_QUEUE_DEFAULT_MAXSIZE){ - printf("Failed to Enqueue: Queue Overflow\n"); - exit(1); - } -#endif - buffer[rear] = package; - prear = rear; - rear = (rear+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n++; - pthread_mutex_unlock(&queueLock); -} - -void PackageQueue::enqueue(Package* package, bool canAvoidLock) -{ - if(!canAvoidLock)pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n==perilla::MSG_QUEUE_DEFAULT_MAXSIZE){ - printf("Failed to Enqueue: Queue Overflow\n"); - exit(1); - } -#endif - buffer[rear] = package; - prear = rear; - rear = (rear+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n++; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); -} - -Package* PackageQueue::dequeue(void) -{ - Package* package = 0; - pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n<=0){ //catch the first underflow, not the second - printf("Failed to Dequeue: Queue Empty\n"); - exit(1); - } -#endif - package = buffer[front]; - front = (front+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n--; - pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::dequeue(bool canAvoidLock) -{ - Package* package = 0; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n<=0){ - printf("Failed to Dequeue: Queue Empty\n"); - exit(1); - } -#endif - package = buffer[front]; - front = (front+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n--; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getRear(void) -{ - Package* package = 0; - pthread_mutex_lock(&queueLock); - if(n) package = buffer[prear]; - pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getRear(bool canAvoidLock) -{ - Package* package = 0; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - if(n) package = buffer[prear]; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getFront(void) -{ - Package* package = 0; - pthread_mutex_lock(&queueLock); - if(n) package = buffer[front]; - pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getFront(bool canAvoidLock) -{ - Package* package = 0; - if(!canAvoidLock) pthread_mutex_lock(&queueLock); - if(n) package = buffer[front]; - if(!canAvoidLock) pthread_mutex_unlock(&queueLock); - return package; -} - -void PackageQueue::emptyQueue(bool canAvoidLock){ - if(!canAvoidLock) pthread_mutex_lock(&queueLock); - while(n){ - Package* p = dequeue(true); - delete p; - } - if(!canAvoidLock) pthread_mutex_unlock(&queueLock); -} - -PackageQueue::~PackageQueue() -{ - emptyQueue(true); -} diff --git a/Src/AmrTask/rts_impls/mpi/Perilla.H b/Src/AmrTask/rts_impls/mpi/Perilla.H deleted file mode 100755 index 2bc3301ae9e..00000000000 --- a/Src/AmrTask/rts_impls/mpi/Perilla.H +++ /dev/null @@ -1,92 +0,0 @@ -#ifndef _PERILLA_ -#define _PERILLA_ - -#include -#include -#include -#include -#include - -#include - -//#define USE_PERILLA_PTHREADS - -using namespace std; - -namespace amrex{ - class 
Perilla - { - static int tagGen(int src, int dest, int channelID, int nFabs, int nChannels); - - public: - static int uTags; - static bool genTags; - static int max_step; - - static std::map> pTagCnt; - static std::map>>>> tagMap; - static std::map>>> myTagMap; - static void clearTagMap(); - static void clearMyTagMap(); - static void communicateTags(); - static void registerId(int tid); - static int tid(); - static volatile int numTeamsFinished; - static volatile int updateMetadata_request; - static volatile int updateMetadata_noticed; - static volatile int updateMetadata_done; - static Barrier * globalBarrier; - static void syncProcesses(); - static void multifabBuildFabCon(RegionGraph* graph, const MultiFab& mf, const Periodicity& period); - static void serviceLocalRequests(RegionGraph *graph, int tg); - static void serviceRemoteRequests(RegionGraph *graph, int graphID, int nGraphs); - static void serviceRemoteRequests(RegionGraph *graph); - static void serviceSingleGraphComm(RegionGraph* graph, int tid); - static void serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, bool cpyAcross, int tid); - static void serviceMultipleGraphCommDynamic(std::vectorgraphArray, bool cpyAcross, int tid); - static void flattenGraphHierarchy(std::vector >graphArray, std::vector &flattenedGraphArray); - static void serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, int tid); - static void fillBoundaryPush(RegionGraph* graph, MultiFab* mf, int f); - static void fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f, bool singleT); - - static void serviceLocalGridCopyRequests(std::vector graphArray, int g, int tg); - static void serviceRemoteGridCopyRequests(std::vector graphArray, int g, int nGraph, int tg); - static void resetRemoteGridCopyRequests(std::vector graphArray, int g, int nGraph, int tg); - - static void fillBoundaryPush(amrex::RGIter& rgi, amrex::MultiFab& mf); - static void fillBoundaryPull(amrex::RGIter& rgi, amrex::MultiFab& mf, bool singleT); - static void fillBoundaryPush(amrex::RGIter& rgi, RegionGraph *graph, amrex::MultiFab& mf); - static void fillBoundaryPull(amrex::RGIter& rgi, RegionGraph *graph, amrex::MultiFab& mf, bool singleT); - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - - void multifabExtractCopyAssoc(void* threadInfo); - static void multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& dmf, const MultiFab& smf, int nc, int ng, int ngSrc, const Periodicity& period); - static void multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& dmf, const MultiFab& smf, const Periodicity& period); - static void multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, bool singleT); - - - static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, bool singleT); - - static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - 
static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, bool singleT); - - //static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT, bool mTeams=true); - - //static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT, bool mTeams=true); - - //static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, bool singleT, bool mTeams=true); - //static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, bool singleT, bool mTeams=true); - - - static void multifabCopyPush_1Team(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - //static void fillBoundaryPush_1Team(RegionGraph *graph, amrex::MultiFab& mf, int f, bool mOneThread=false); - - //static void multifabCopyPull_1Team(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void fillBoundaryPull_1Team(RegionGraph *graph, amrex::MultiFab& mf, int f); - }; // class Perilla -} -#endif diff --git a/Src/AmrTask/rts_impls/mpi/Perilla.cpp b/Src/AmrTask/rts_impls/mpi/Perilla.cpp deleted file mode 100755 index 8a9fb76038d..00000000000 --- a/Src/AmrTask/rts_impls/mpi/Perilla.cpp +++ /dev/null @@ -1,976 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; -using namespace amrex; -using namespace perilla; - -#ifdef PERILLA_DEBUG -#include -#include - -const double kMicro = 1.0e-6; -double getTime() -{ - struct timeval TV; - const int RC = gettimeofday(&TV, NULL); - if(RC == -1) - { - printf("ERROR: Bad call to gettimeofday\n"); - return(-1); - } - return( ((double)TV.tv_sec) + kMicro * ((double)TV.tv_usec) ); -} - -double isendDelay=0.0; -double irecvDelay=0.0; -double isendTestDelay=0.0; -double irecvTestDelay=0.0; -double localScheDelay=0.0; -#endif - -void Perilla::syncProcesses(){ - MPI_Barrier(MPI_COMM_WORLD); -} - -void Perilla::serviceLocalRequests(RegionGraph* rg, int tg) -{ - int numfabs = rg->lMap.size(); - for(int f=0; flMap[f]->l_con.nscpy; i++) - if(rg->lMap[f]->l_con.scpy[i].pQueue.queueSize(true)>0){ - anyReq=true; - break; - } - if(anyReq){ - pthread_mutex_lock(&(rg->lMap[f]->l_con.sLock)); - for(int i=0; ilMap[f]->l_con.nscpy; i++){ - if(rg->lMap[f]->l_con.scpy[i].pQueue.queueSize(true)>0) - { - Package *sPackage = rg->lMap[f]->l_con.scpy[i].pQueue.dequeue(true); - pthread_mutex_lock(&(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - int dPartner = rg->lMap[f]->l_con.scpy[i].dPartner; - Package *dPackage = rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - std::memcpy(dPackage->databuf, sPackage->databuf, dPackage->bufSize * sizeof(double)); - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.queueSize(true)==1) - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.firingRuleCnt++; - 
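-                            // The firing-rule counter is the dependency count that releases a
-                            // region's task: it is bumped only when a destination queue goes from
-                            // empty to non-empty (queueSize(true)==1 right after the enqueue), so
-                            // a dependency is counted when data first becomes available, not once
-                            // per queued package.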
pthread_mutex_unlock(&(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - rg->lMap[f]->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - } - } - pthread_mutex_unlock(&(rg->lMap[f]->l_con.sLock)); - }//if there is any local send request - }// if my region - }// for(frMap.size(); - int tg = WorkerThread::perilla_wid(); - - // !we first pre-post receive - for(int f=0; flMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed - nextrReq = true; - else - { - //we buffer at most 2 packages per send task - recv task pair, but 1 must be completed before we buffer the next to allow for tag reuse - Package *rearPackage = rg->rMap[f]->r_con.rcv[i].pQueue.getRear(true); - if(rearPackage) - if(rearPackage->completed && rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) //!latest receive request has been completed - nextrReq = true; - else //!expected message is still on the way - nextrReq = false; - } - if(nextrReq) //!take a message from recycle pool and post a receive - { - pthread_mutex_lock(&(rg->rMap[f]->r_con.rcvLock)); - pthread_mutex_lock(&(rg->lMap[f]->r_con.rcvLock)); - int ns = rg->rMap[f]->r_con.rcv[i].ns; - int nd = rg->rMap[f]->r_con.rcv[i].nd; - int lnd = rg->rMap[f]->r_con.rcv[i].lnd; - int r_grids = rg->rMap[f]->r_con.rcv[i].r_grids; - //!create a package to keep track of receive requests - Package *rMetaPackage = rg->rMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - //!extract a package from the recycle pool at the destination NUMA node to buffer incoming data - Package *rPackage = rg->lMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - int tag = tagMap[rg->rMap[f]->r_con.rcv[i].pr][graphID][nd][ns][rg->rMap[f]->r_con.rcv[i].sz]; - - rMetaPackage->request = MPI_REQUEST_NULL; - rg->lMap[f]->r_con.rcv[i].pQueue.enqueue(rPackage,true); //!this is not done yet - rg->rMap[f]->r_con.rcv[i].pQueue.enqueue(rMetaPackage,true); //!this is not done yet - rMetaPackage->request = ParallelDescriptor::Arecv(rPackage->databuf, - rg->rMap[f]->r_con.rcv[i].sz, - rg->rMap[f]->r_con.rcv[i].pr, tag).req(); // tag == SeqNum in c++ ver - pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - pthread_mutex_unlock(&(rg->rMap[f]->r_con.rcvLock)); - } - }//for num messages in each Fab - }// for(fsMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) == 0) //then !no message has been issued or all send requests have been fulfilled - nextsReq = false; - else - nextsReq = true; - - if(nextsReq) - { - Package *sMetaPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(!sMetaPackage->served) - { - Package *sPackage = rg->lMap[f]->r_con.snd[i].pQueue.getFront(true); - sMetaPackage->completed = false; - sMetaPackage->served = true; - sMetaPackage->request = MPI_REQUEST_NULL; - int ns = rg->sMap[f]->r_con.snd[i].ns; - int nd = rg->sMap[f]->r_con.snd[i].nd; - int r_gid = rg->sMap[f]->r_con.snd[i].r_gid; - int r_grids = rg->sMap[f]->r_con.snd[i].r_grids; - int tag = Perilla::myTagMap[r_gid][nd][ns][rg->sMap[f]->r_con.snd[i].sz]; - sMetaPackage->request = ParallelDescriptor::Asend(sPackage->databuf, - rg->sMap[f]->r_con.snd[i].sz, - rg->sMap[f]->r_con.snd[i].pr, tag).req(); - } - } - } // for(irMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) > 0) //by our policy, we can assume that all messages before rear have completed - { - //we dont need to lock the queue, because other consumers just take front messages. 
A circular buffer guarantees that the rear of the queue can be safely accessed when other queue data is modified - Package *rearPackage = rg->rMap[f]->r_con.rcv[i].pQueue.getRear(true); - if(rearPackage) - if(!(rearPackage->completed)) - { - bool flag = false; - int ret_flag; - MPI_Status status; - ParallelDescriptor::Test(rearPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true; - if(flag) - { - pthread_mutex_lock(&(rg->lMap[f]->r_con.rcvLock)); - rearPackage->completeRequest(true); - rg->lMap[f]->r_con.rcv[i].pQueue.getRear()->completeRequest(true); - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) - rg->lMap[f]->r_con.firingRuleCnt++; - pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - } - } - } // if(queueSize > 0) - } // for(ilMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) > 0) - { - Package *frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(frontPackage->served && !frontPackage->completed) //!latest receive request has NOT been completed - { - bool flag = false; - int ret_flag; - MPI_Status status; - ParallelDescriptor::Test(frontPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true; - if(flag) - { - pthread_mutex_lock(&(rg->sMap[f]->r_con.sndLock)); - frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - rg->sMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(rg->sMap[f]->r_con.sndLock)); - pthread_mutex_lock(&(rg->lMap[f]->r_con.sndLock)); - frontPackage = rg->lMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - rg->lMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(rg->lMap[f]->r_con.sndLock)); - } - } - } // if(queueSize > 0) - } // for(i graphArray, bool cpyAcross, int tid) -{ - int tg = WorkerThread::perilla_wid(); - int np = ParallelDescriptor::NProcs(); - int nGraphs = graphArray.size(); - - for(int g=0; g 1) - { - if(tg==0) - { - serviceRemoteRequests(graphArray[g],g,nGraphs); - if(cpyAcross) - serviceRemoteGridCopyRequests(graphArray,g,nGraphs,tg); - } - } - } -}//serviceMultipleGraphCommDynamic - -void Perilla::multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT) -{ - int myProc = ParallelDescriptor::MyProc(); - - int ntid = WorkerThread::perilla_wtid(); - int tg = WorkerThread::perilla_wid(); - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - FabCopyAssoc* cpDst = destGraph->task[f]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->graphPartner == srcGraph) - break; - cpDst = cpDst->next; - } - if(cpDst == 0) cout <<"Metadata for across grid copy not found"<< endl; - if(singleT) - { - pthread_mutex_lock(&(cpDst->l_con.dLock)); - for(int i=0; il_con.ndcpy; i++) - { - Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf); - } - for(int i=0; il_con.ndcpy; i++) - 
cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue()); // corrected from pQ to recycleQ and from recycleQ to pQ - cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - } - else - { - if(ntid==0){ - pthread_mutex_lock(&(cpDst->l_con.dLock)); - for(int i=0; il_con.ndcpy; i++) - { - Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf); - } - for(int i=0; il_con.ndcpy; i++) - cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue()); // corrected from pQ to recycleQ and from recycleQ to pQ - cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - } - } - - int np = ParallelDescriptor::NProcs(); - if(np == 1) - return; - - if(singleT) - { - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - for(int i=0; ir_con.nrcv; i++) - { - ///* - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - rcvPackage->completed = false; - rcvPackage->served = false; - rcvPackage->request = MPI_REQUEST_NULL; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage, true); // corrected from pQ to recycleQ - } - cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv; - - cpDst->r_con.remotePullDone = true; - ///* - for(int i=0; ir_con.nrcv; i++) - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - //*/ - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } - else - { - if(ntid==0) - { - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - for(int i=0; ir_con.nrcv; i++) - { - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - rcvPackage->completed = false; - rcvPackage->served = false; - rcvPackage->request = MPI_REQUEST_NULL; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage, true); // corrected from pQ to recycleQ - } - cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv; - - cpDst->r_con.remotePullDone = true; - for(int i=0; ir_con.nrcv; i++) - if(cpDst->r_con.rcv[i].pQueue.queueSize() >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront()->checkRequest()) - cpDst->r_con.firingRuleCnt++; - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } - } - } // if(!(*mfDst == *mfSrc)) -} // multifabCopyPull - - -void Perilla::fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f, bool singleT) -{ - - int nComp = mf->nComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - if(ntid==0) - pthread_mutex_lock(&(graph->lMap[f]->l_con.dLock)); - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *dPackage = graph->lMap[f]->l_con.dcpy[i].pQueue.getFront(true); - mf->m_fabs_v[f]->copyFromMem(graph->lMap[f]->l_con.dcpy[i].dbx,0,nComp,dPackage->databuf); - } - - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - if(ntid==0) - { - 
for(int i=0; ilMap[f]->l_con.ndcpy; i++) - graph->lMap[f]->l_con.dcpy[i].recycleQueue.enqueue( graph->lMap[f]->l_con.dcpy[i].pQueue.dequeue(true),true ); - - graph->lMap[f]->l_con.firingRuleCnt = graph->lMap[f]->l_con.firingRuleCnt - graph->lMap[f]->l_con.ndcpy; - - graph->lMap[f]->l_con.scpyCnt = 0; - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if(graph->lMap[f]->l_con.dcpy[i].pQueue.queueSize(true) >= 1) - graph->lMap[f]->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(graph->lMap[f]->l_con.dLock)); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - { - pthread_mutex_lock(&(graph->rMap[f]->r_con.rcvLock)); - pthread_mutex_lock(&(graph->lMap[f]->r_con.rcvLock)); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->r_con.nrcv; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *rcvMetaPackage = graph->rMap[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = 0; - graph->rMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - Package *rcvPackage = graph->lMap[f]->r_con.rcv[i].pQueue.dequeue(true); - mf->m_fabs_v[f]->copyFromMem(graph->lMap[f]->r_con.rcv[i].dbx,0,nComp,rcvPackage->databuf); - rcvPackage->completed = false; - graph->lMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - if(ntid==0) - { - graph->lMap[f]->r_con.firingRuleCnt = graph->lMap[f]->r_con.firingRuleCnt - graph->lMap[f]->r_con.nrcv; - for(int i=0; ilMap[f]->r_con.nrcv; i++) - if(graph->lMap[f]->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(graph->lMap[f]->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - graph->lMap[f]->r_con.firingRuleCnt++; - pthread_mutex_unlock(&(graph->lMap[f]->r_con.rcvLock)); - pthread_mutex_unlock(&(graph->rMap[f]->r_con.rcvLock)); - } - -} // fillBoundaryPull - - - -void Perilla::fillBoundaryPull_1Team(RegionGraph* graph, amrex::MultiFab& mf, int f) -{ - exit(0); -} // fillBoundaryPull - - -void Perilla::multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT) -{ - int ntid = WorkerThread::perilla_wtid(); - int tg = WorkerThread::perilla_wid(); - int myProc = ParallelDescriptor::MyProc(); - // MultiFab* mfDst = destGraph->assocMF; - // MultiFab* mfSrc = srcGraph->assocMF; - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpSrc = srcGraph->task[f]->cpAsc_srcHead; - - while(cpSrc != 0) - { - if(cpSrc->graphPartner == destGraph) - break; - cpSrc = cpSrc->next; - } - if(cpSrc == 0) cout <<"Metadata for across grid copy not found"<< endl; - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - 
Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - else - { - if(ntid == 0) - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - for(int i=0; il_con.nscpy; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(ntid==0) - { - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - int np = ParallelDescriptor::NProcs(); - if(np == 1) - return; - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - { - - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - } - - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - - cpSrc->r_con.remotePushReady = true; - ///* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - } - else - { - if(ntid == 0) - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - for(int i=0; ir_con.nsnd; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - } - - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(ntid==0) - { - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - - cpSrc->r_con.remotePushReady = true; - ///* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - //*/ - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } // if(!(*mfDst == *mfSrc)) -} // multifabCopyPushAsync - - -void Perilla::fillBoundaryPush(RegionGraph* graph, MultiFab* mf, int f) -{ - - int nComp = mf->nComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - if(ntid == 0) - pthread_mutex_lock(&(graph->lMap[f]->l_con.sLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->l_con.nscpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *sPackage = graph->lMap[f]->l_con.scpy[i].recycleQueue.getFront(true); - 
mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->l_con.scpy[i].sbx,0,nComp,sPackage->databuf); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - for(int i=0; ilMap[f]->l_con.nscpy; i++) - { - graph->lMap[f]->l_con.scpy[i].pQueue.enqueue( graph->lMap[f]->l_con.scpy[i].recycleQueue.dequeue(true),true ); - } - pthread_mutex_unlock(&(graph->lMap[f]->l_con.sLock)); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - pthread_mutex_lock(&(graph->lMap[f]->r_con.sndLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->r_con.nsnd; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1))==ntid) - { - Package *sndPackage = graph->lMap[f]->r_con.snd[i].recycleQueue.dequeue(true); - mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->r_con.snd[i].sbx,0,nComp,sndPackage->databuf); - graph->lMap[f]->r_con.snd[i].pQueue.enqueue( sndPackage,true ); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - if(ntid==0) - { - pthread_mutex_unlock(&(graph->lMap[f]->r_con.sndLock)); - pthread_mutex_lock(&(graph->sMap[f]->r_con.sndLock)); - for(int i=0; ilMap[f]->r_con.nsnd; i++) - graph->sMap[f]->r_con.snd[i].pQueue.enqueue( graph->sMap[f]->r_con.snd[i].recycleQueue.dequeue(true),true ); - pthread_mutex_unlock(&(graph->sMap[f]->r_con.sndLock)); - } -} // fillBoundaryPush - - - -void Perilla::fillBoundaryPush(amrex::RGIter& rgi, amrex::MultiFab& mf) -{ - if(rgi.currentItr != rgi.totalItr) - return; - - int f = rgi.currentRegion; - fillBoundaryPush(rgi.itrGraph, &mf, f); -} - -void Perilla::fillBoundaryPush(amrex::RGIter& rgi, RegionGraph* rg, amrex::MultiFab& mf) -{ - if(rgi.currentItr != rgi.totalItr) - return; - - int f = rgi.currentRegion; - fillBoundaryPush(rg, &mf, f); -} - - -void Perilla::multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT) -{ - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - multifabCopyPush_1Team(destGraph,srcGraph,mfDst,mfSrc,f,dstcomp,srccomp,nc,ng,ngsrc,singleT); -} - - -void Perilla::multifabCopyPush_1Team(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT) -{ - int ntid = perilla::wtid();// - perilla::NUM_COMM_THREADS; - int tg = perilla::wid(); - int myProc = amrex::ParallelDescriptor::MyProc(); - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpSrc = srcGraph->task[f]->cpAsc_srcHead; - - while(cpSrc != 0) - { - if(cpSrc->graphPartner == destGraph) - break; - cpSrc = cpSrc->next; - } - if(cpSrc == 0) cout <<"Metadata for across grid copy not found"<< endl; - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = 
cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true)); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - else - { - if(ntid == 0) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true)); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - } - - int np = amrex::ParallelDescriptor::NProcs(); - if(np == 1) - return; - if(singleT) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - { - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - sndPackage->served = false; - sndPackage->completed = false; - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage, true); - } - cpSrc->r_con.remotePushReady = true; - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - } - else - { - if(ntid == 0) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - { - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - sndPackage->served = false; - sndPackage->completed = false; - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage, true); - } - cpSrc->r_con.remotePushReady = true; - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - } - } - } // if(!(*mfDst == *mfSrc)) -} // multifabCopyPush - - -void Perilla::serviceLocalGridCopyRequests(std::vector graphArray, int g, int tg) -{ - int nfabs = graphArray[g]->numTasks; - - for(int f=0; ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - bool anyReq=false; - for(int i=0; il_con.nscpy; i++) - if(cpSrc->l_con.scpy[i].pQueue.queueSize(true)>0){ - anyReq=true; - break; - } - if(anyReq) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - if(cpSrc->l_con.scpy[i].pQueue.queueSize(true)>0) - { - FabCopyAssoc* cpDst = cpSrc->graphPartner->task[cpSrc->l_con.scpy[i].nd]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->graphPartner == graphArray[g]) - break; - cpDst = cpDst->next; - } - Package* sPackage = cpSrc->l_con.scpy[i].pQueue.dequeue(true); - pthread_mutex_lock(&(cpDst->l_con.dLock)); - int dPartner = cpSrc->l_con.scpy[i].dPartner; - Package* dPackage = cpDst->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - std::memcpy(dPackage->databuf, sPackage->databuf, dPackage->bufSize * sizeof(double)); - cpDst->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(cpDst->l_con.dcpy[dPartner].pQueue.queueSize(true) == 1) - cpDst->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - cpSrc->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - } - } // for - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - }//anyReq - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // if(tg==fg) - } // for(f graphArray, int g, int nGraphs, int tg) -{ - bool nextsReq, nextrReq; - int np = ParallelDescriptor::NProcs(); - int myProc = ParallelDescriptor::MyProc(); - int numfabs = graphArray[g]->numTasks; - int graphID = graphArray[g]->graphID; - -#ifdef 
PERILLA_DEBUG - double time= -getTime(); -#endif - for(int f=0; ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - for(int i=0; ir_con.nrcv; i++) - { - if(cpDst->r_con.rcv[i].pQueue.queueSize(true)==0) - { - nextrReq = true; - } - else - { - Package *rearPackage = cpDst->r_con.rcv[i].pQueue.getRear(true); - if(rearPackage) - if(rearPackage->completed && cpDst->r_con.rcv[i].pQueue.queueSize(true) == 1) //!latest receive request has been completed - { - nextrReq = true; - } - else //!expected message is still on the way - nextrReq = false; - } - if(nextrReq) //!take a message from recycle pool and post a receive - { - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - int ns = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].ns; - int nd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].nd; - int lnd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].lnd; - int r_grids = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].r_grids; - Package *rPackage = cpDst->r_con.rcv[i].recycleQueue.dequeue(true); - int tag = tagMap[graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr][g][nd][ns][graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz]; - rPackage->request = MPI_REQUEST_NULL; - rPackage->completed=false; - cpDst->r_con.rcv[i].pQueue.enqueue(rPackage, true); //!this is not done yet - rPackage->request = ParallelDescriptor::Arecv(rPackage->databuf, - graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz, - graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr, tag).req(); // tag == SeqNum in c++ ver - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } - } // for (ir_con.nrcv) - cpDst = cpDst->next; - } // while(cpDst != 0) - } // for(ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - for(int i=0; ir_con.nsnd; i++) - { - if(cpSrc->r_con.snd[i].pQueue.queueSize(true) == 0) - nextsReq = false; - else - nextsReq = true; - - if(nextsReq) - { - //there is no need to lock the queue because we only touch the front to initialize the send - //During this time, workers can produce more messages into the queue, but a circular queue ensures that the front of the queue will not be modified - Package *sPackage = cpSrc->r_con.snd[i].pQueue.getFront(true); - if(!sPackage->served) - { - sPackage->completed = false; - sPackage->served = true; - sPackage->request = MPI_REQUEST_NULL; - int ns = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].ns; - int nd = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].nd; - int r_gid = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_gid; - int r_grids = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_grids; - int tag = Perilla::myTagMap[r_gid][nd][ns][graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz]; - sPackage->request = ParallelDescriptor::Asend(sPackage->databuf, - graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz, - graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pr, tag).req(); // tag == SeqNum in c++ ver - } - } - } // for (ir_con.nsnd) - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // for(ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - for(int i=0; ir_con.nrcv; i++) - { - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) > 0) - { - Package *rearPackage = cpDst->r_con.rcv[i].pQueue.getRear(true); - //Note: all messages before rear have completed - if(rearPackage) - if(!rearPackage->completed) - { - bool flag = false; - int ret_flag=0; - MPI_Status status; - ParallelDescriptor::Test(rearPackage->request, ret_flag, status); - - flag = (ret_flag == 0) ? 
false : true; - if(flag) - { - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - rearPackage->completeRequest(true); - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) == 1) - { - cpDst->r_con.firingRuleCnt++; - } - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } - } - } // if(pQueue.queueSize(true) > 0) - } // for (ir_con.nrcv) - cpDst = cpDst->next; - } // while(cpDst != 0) - } // for(ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - for(int i=0; ir_con.nsnd; i++) - { - if(cpSrc->r_con.snd[i].pQueue.queueSize(true) >0) - { - Package *frontPackage = cpSrc->r_con.snd[i].pQueue.getFront(true); - if(frontPackage->served) - { - bool flag = false; - int ret_flag; - MPI_Status status; - ParallelDescriptor::Test(frontPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true; - if(flag) - { - //we have to lock the queue before removing the front - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - frontPackage = cpSrc->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - cpSrc->r_con.snd[i].recycleQueue.enqueue(frontPackage, true); - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - } - } - } // if(queueSize > 0) - } // for (ir_con.nsnd) - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // for(f -#include -#include - -namespace perilla{ - -struct _workerThreadInfo{ - int _tid; //thread id in local group - int _size; //number of threads in the group -}; - -struct _threadInfo{ - bool _isComm; //whether this thread handles communication - int _wtid; //worker thread id (-1 if this thread is decicated to communication) - int _nWts; //number of thread groups -}; - -class RTS -{ - private: - int _nWrks; - void RTS_Init(); - int _rank, _nProcs; - - public: - RTS(){ - _nWrks=1; - char* nWrks= getenv("NWORKERS"); - if(nWrks) _nWrks= atoi(nWrks); - } - RTS(int nWrks):_nWrks(nWrks){} - int ProcCount(); - int MyProc(); - int WorkerThreadCount(); - int MyWorkerThread(); - void Init(); //Build the runtime system from scratch - void Init(int rank, int nProcs);//Build the runtime system on pre-existing MPI processes - void Iterate(void *graph, int max_step, Real stop_time); - void Finalize(); - void Barrier(); - void runAMR(Amr* amrptr, int max_step, Real stop_time); - void invokeOnDemand(std::vector rg, RGIter *rgi); -}; - -} -#endif diff --git a/Src/AmrTask/rts_impls/mpi/PerillaRts.cpp b/Src/AmrTask/rts_impls/mpi/PerillaRts.cpp deleted file mode 100644 index f101bd5e4b0..00000000000 --- a/Src/AmrTask/rts_impls/mpi/PerillaRts.cpp +++ /dev/null @@ -1,195 +0,0 @@ -//Question? 
email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 08-14-2017 -#include -#include -#include -#include -#include -#include -#include "PerillaRts.H" - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include -PerillaMemCheck memcheck; -#endif - -#include -#include -using namespace std; -#include - -namespace perilla{ - Amr* amrptr; - struct RtsDomain{ - pthread_t *_threads; - int _size; - MyLock _lock; - RtsDomain():_threads(NULL), _size(0){}; - ~RtsDomain(){ - free(_threads); - } - }; - int numa_nodes; - RtsDomain *dom; - MyLock _l; - volatile char startSignal=0; - pthread_mutex_t startLock= PTHREAD_MUTEX_INITIALIZER; - - int RTS::ProcCount(){ - return _nProcs; - } - - int RTS::MyProc(){ - return _rank; - } - - int RTS::WorkerThreadCount(){ - return _nWrks; - } - - int RTS::MyWorkerThread(){ - return 0; - } - - struct argT { - int numaID; - int tid; - int g_tid; - int nThreads; - int nTotalThreads; - int max_step; - Real stop_time; - RTS* thisRTS; - }; - - void RTS::runAMR(Amr* amr, int max_step, Real stop_time){ - while (amr->okToContinue() && - (amr->levelSteps(0) < max_step || max_step < 0) && - (amr->cumTime() < stop_time || stop_time < 0.0) ) - - { - // Do a coarse timestep, which calls one or multiple timestep updates (i.e. timeStep()) at each AMR level - amr->coarseTimeStep(stop_time); - } - } - -#ifdef USE_PERILLA_PTHREADS - void run(void* threadInfo){ - argT *args= (argT*)threadInfo; - int numaID= args->numaID; - int tid= args->tid; - int g_tid= args->g_tid; - int nThreads= args->nThreads; - int nTotalThreads= args->nTotalThreads; - int max_step= args->max_step; - Real stop_time= args->stop_time; - RTS* rts= args->thisRTS; - Perilla::registerId(g_tid); - //done with thread id setup, now wait for the start signal from master - pthread_mutex_lock(&startLock); - startSignal++; - pthread_mutex_unlock(&startLock); - while(startSignal!= nTotalThreads){} - rts->runAMR(amrptr, max_step, stop_time); - } -#endif - - void InitializeMPI(){ - int provided; - MPI_Init_thread(0, 0, MPI_THREAD_FUNNELED, &provided); - if(provided == MPI_THREAD_SINGLE){//with this MPI, process can't spawn threads - cerr << "Spawning threads is not allowed by the MPI implementation" << std::endl;; - } - } - - void RTS::RTS_Init(){ - amrptr= NULL; - } - - void RTS::Init(){ - InitializeMPI(); - MPI_Comm_rank(MPI_COMM_WORLD, &_rank); - MPI_Comm_size(MPI_COMM_WORLD, &_nProcs); - RTS_Init(); - } - - void RTS::Init(int rank, int nProcs){ - _rank= rank; - _nProcs= nProcs; - RTS_Init(); - } - - void RTS::Finalize(){ -#ifdef PERILLA_DEBUG - memcheck.report(); -#endif - } - - void RTS::Iterate(void* amrGraph, int max_step, Real stop_time){ - assert(amrGraph); - Perilla::max_step= max_step; - amrptr= (Amr*)amrGraph; - WorkerThread::init(); -#ifndef USE_PERILLA_PTHREADS - runAMR(amrptr, max_step, stop_time); -#else - int numa_nodes= perilla::NUM_THREAD_TEAMS; - int worker_per_numa = perilla::NUM_THREADS_PER_TEAM; - int _nWrks= numa_nodes*worker_per_numa; - int base=0; - int localID=-1; - //create a list of persistent threads for each NUMA node - cpu_set_t cpuset; - pthread_attr_t attr; - pthread_attr_init(&attr); - dom= new RtsDomain[numa_nodes]; - for(int i=0; inumaID= domNo; - arg->tid= localID; - arg->g_tid= domNo*worker_per_numa+localID; - arg->nThreads= worker_per_numa; - arg->nTotalThreads= _nWrks; - arg->thisRTS= this; - arg->max_step= max_step; - arg->stop_time= stop_time; - int err = pthread_create(&(dom[domNo]._threads[localID]), &attr, (void*(*)(void*))run, arg); - }else{ //master thread - 
dom[domNo]._threads[localID]= pthread_self(); - Perilla::registerId(0); - //enable worker threads to start computing - pthread_mutex_lock(&startLock); - startSignal++; - pthread_mutex_unlock(&startLock); - } - dom[domNo]._size++; - if(localID == (worker_per_numa-1)){ - localID=-1; - base+= worker_per_numa; - } - } - while(startSignal!= _nWrks){}//wait until all threads have done the setup phase - runAMR(amrptr, max_step, stop_time); - for(int i=1; i<_nWrks; i++) pthread_join(dom[i/worker_per_numa]._threads[i%worker_per_numa], NULL); -#endif - } - - void RTS::Barrier(){ - MPI_Barrier(MPI_COMM_WORLD); - } - -}//end namespace - diff --git a/Src/AmrTask/rts_impls/mpi/perilla.mak b/Src/AmrTask/rts_impls/mpi/perilla.mak deleted file mode 100755 index 9e1c88ce260..00000000000 --- a/Src/AmrTask/rts_impls/mpi/perilla.mak +++ /dev/null @@ -1,10 +0,0 @@ -CEXE_sources += PackageQueue.cpp -CEXE_sources += Perilla.cpp -CEXE_sources += WorkerThread.cpp - - -CEXE_headers += Config.H -CEXE_headers += PackageQueue.H - - - diff --git a/Src/AmrTask/rts_impls/mpi_omp/AsyncMultiFabUtil.H b/Src/AmrTask/rts_impls/mpi_omp/AsyncMultiFabUtil.H deleted file mode 100755 index 66c239f0f2f..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/AsyncMultiFabUtil.H +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _AsyncMultiFabUtil_H_ -#define _AsyncMultiFabUtil_H_ - -#include -#include -#include -#include - -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const int ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const int ratio, int f); - -// Average fine cell-based MultiFab onto crse cell-centered MultiFab without volume weighting. -// This routine DOES NOT assume that the crse BoxArray is a coarsened version of the fine BoxArray. 
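-// The push/pull average_down variants below split the restriction into a local
-// averaging kernel plus an asynchronous graph copy. A self-contained sketch of
-// the averaging arithmetic itself (flat 2-D arrays stand in for FArrayBox data;
-// the real kernel is amrex_avgdown over a tilebox):
-//
-//   // Average each ratio x ratio block of fine cells into one coarse cell.
-//   void avgdown2d(const double* fine, int fnx, int fny, double* crse, int ratio)
-//   {
-//       int cnx = fnx / ratio;
-//       double vinv = 1.0 / (ratio * ratio);
-//       for (int c = 0; c < cnx * (fny / ratio); ++c) crse[c] = 0.0;
-//       for (int j = 0; j < fny; ++j)
-//           for (int i = 0; i < fnx; ++i)
-//               crse[(j / ratio) * cnx + (i / ratio)] += vinv * fine[j * fnx + i];
-//   }
-//
-// Averaging a constant field returns the same constant, which is a quick sanity
-// check on the weighting.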
- -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, int ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, int ratio, int f); - -void average_down_push (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, MultiFab* crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - amrex::Geometry&, amrex::Geometry&,int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_pull (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, - amrex::Geometry&, amrex::Geometry&, int scomp, int ncomp, const IntVect& ratio, int f); - -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/AsyncMultiFabUtil.cpp b/Src/AmrTask/rts_impls/mpi_omp/AsyncMultiFabUtil.cpp deleted file mode 100755 index e86ff112537..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/AsyncMultiFabUtil.cpp +++ /dev/null @@ -1,175 +0,0 @@ -#include -//#include -#include -#include -#include -#include - -using namespace amrex; -using namespace perilla; - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, int rr, int f) -{ - average_down_push(amr,S_fine,S_crse,crse_S_fine,RG_fine,RG_crse,fgeom,cgeom,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} - -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, -const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, int rr, int f) -{ - average_down_pull(S_fine,S_crse,RG_fine,RG_crse,fgeom,cgeom,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, const IntVect& ratio, int f) -{ - if (S_fine.is_nodal() || S_crse.is_nodal()) - { - amrex::Error("Can't use amrex::average_down for nodal MultiFab!"); - } - -#if (BL_SPACEDIM == 3) - average_down_push(amr, S_fine, S_crse, crse_S_fine, RG_fine, RG_crse, scomp, ncomp, ratio, f); - return; -#else - - assert(S_crse.nComp() == S_fine.nComp()); - - - MultiFab fvolume; - fgeom.GetVolume(fvolume, fine_BA, 0); - - int lfi = crse_S_fine.IndexArray()[f]; - const Box& tbx = crse_S_fine[ lfi ].box(); - - amrex_avgdown_with_vol(tbx,crse_S_fine[lfi].array(),S_fine[lfi].array(),fvolume[mfi].array(), - 0,scomp,ncomp,ratio); - - Perilla::multifabCopyPushAsync(RG_crse, RG_fine, &S_crse, &crse_S_fine, f, scomp, 0, ncomp, 0, 0, false); -#endif -} - -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fgeom, const Geometry& cgeom, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - - if (S_fine.is_nodal() || S_crse.is_nodal()) - { - amrex::Error("Can't use amrex::average_down for nodal MultiFab!"); - } - -#if (BL_SPACEDIM == 3) - average_down_pull(S_fine, S_crse, RG_fine, RG_crse, scomp, ncomp, ratio, f); - 
return; -#else - assert(S_crse.nComp() == S_fine.nComp()); - Perilla::multifabCopyPull(RG_crse, RG_fine, &S_crse, &S_fine, f, scomp, 0, ncomp, 0, 0, false); -#endif -} - - -// ************************************************************************************************************* - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - int scomp, int ncomp, int rr, int f) -{ - average_down_push(amr,S_fine,S_crse,crse_S_fine,RG_fine,RG_crse,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, int rr, int f) -{ - average_down_pull(S_fine,S_crse,RG_fine,RG_crse,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - assert(S_crse.nComp() == S_fine.nComp()); - - // NOTE: The tilebox is defined at the coarse level. - int lfi = crse_S_fine.IndexArray()[f]; - int tg = WorkerThread::perilla_wid(); - int nt = WorkerThread::perilla_wtid(); - - for(int t=0; tfabTiles[f]->numTiles; t++) - if(t % (perilla::NUM_THREADS_PER_TEAM-1) == nt) - { - const Box& tbx = *(RG_fine->fabTiles[f]->tileBx[t]); - amrex_avgdown(tbx,crse_S_fine[lfi].array(),S_fine[lfi].array(),0,scomp,ncomp,ratio); - } - RG_fine->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - Perilla::multifabCopyPushAsync(RG_crse, RG_fine, &S_crse, &crse_S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - assert(S_crse.nComp() == S_fine.nComp()); - Perilla::multifabCopyPull(RG_crse, RG_fine, &S_crse, &S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - -void average_down_push (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, MultiFab* crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse,amrex::Geometry& geom, amrex::Geometry& geom1, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - if(rgi.currentItr != rgi.totalItr) - return; - - f = rgi.currentRegion; - // NOTE: The tilebox is defined at the coarse level. - int lfi = crse_S_fine->IndexArray()[f]; - - // NOTE: We copy from component scomp of the fine fab into component 0 of the crse fab - // because the crse fab is a temporary which was made starting at comp 0, it is - // not part of the actual crse multifab which came in. 
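-    // Hence srccomp = 0 in the multifabCopyPush call below: crse_S_fine is a
-    // scratch MultiFab whose data starts at component 0, and S_crse receives it
-    // back at component scomp.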
- - perilla::syncWorkerThreads(); - int nThreads= perilla::nWorkerThreads(); - for(int t=0; tfabTiles[f]->numTiles; t+= nThreads) - { - const Box& tbx = *(RG_fine->fabTiles[f]->tileBx[t]); - amrex_avgdown(tbx,(*crse_S_fine)[lfi].array(),(*S_fine)[lfi].array(),0,scomp,ncomp,ratio); - } - perilla::syncWorkerThreads(); - Perilla::multifabCopyPush(RG_crse, RG_fine, S_crse, crse_S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - -void average_down_pull (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, amrex::Geometry& geom, amrex::Geometry& geom1, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - if(rgi.currentItr != 1) - return; - f = rgi.currentRegion; - - Perilla::multifabCopyPull(RG_crse, RG_fine, S_crse, S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - - -// ************************************************************************************************************* - -#if 0 -// Average fine face-based MultiFab onto crse fine-centered MultiFab. -// This routine assumes that the crse BoxArray is a coarsened version of the fine BoxArray. -void average_down_faces (PArray& fine, PArray& crse, IntVect& ratio) -{ - BL_ASSERT(crse.size() == BL_SPACEDIM); - BL_ASSERT(fine.size() == BL_SPACEDIM); - BL_ASSERT(crse[0].nComp() == fine[0].nComp()); - - int ncomp = crse[0].nComp(); - -#ifdef _OPENMP -#pragma omp parallel -#endif - for (int n=0; n - -class Barrier -{ -private: - volatile int counter; - int maxThreads; - volatile bool globalSense; -public: - Barrier(); - Barrier(int maxThreads); - void init(int maxThreads); - void sync(); - void sync(int numthreads); -}; -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/Barrier.cpp b/Src/AmrTask/rts_impls/mpi_omp/Barrier.cpp deleted file mode 100755 index 5409a7e43c1..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/Barrier.cpp +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include - -Barrier::Barrier() -{ - //With this intializer, numthreads has to be specified when syncing, i.e. 
sync(numthreads) - counter = INT_MAX; - globalSense = false; - maxThreads=INT_MAX; -} - -Barrier::Barrier(int numthreads) -{ -//With this initializer, both sync() and sync(numthreads) can be used -#pragma omp critical -{ - counter = numthreads; - maxThreads= numthreads; - globalSense = false; -} -} - -void Barrier::init(int numthreads) -{ -//Similar to Barrier(int numthreads) - counter = numthreads; - maxThreads= numthreads; - globalSense = false; -} - -void Barrier::sync() //sync all threads associated with this barrier -{ - assert(maxThreads -#include -#include -#include - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include "PerillaMemCheck.H" -extern PerillaMemCheck memcheck; -#endif -using namespace amrex; - -class LocalCopyDescriptor -{ - public: - int ns; //Source box in layout - int nd; //Destination box in layout - int sz; - Box sbx; //Sub-box for this copy - Box dbx; //Sub-box for this copy - PackageQueue pQueue; //store incoming or outgoing messages, both fab and the runtime can access this queue - PackageQueue recycleQueue; //just for now, I'll replace this with a NUMA aware package allocator - int sPartner, dPartner; - int dcpyCnt,scpyCnt; - omp_lock_t ghostLock; - LocalCopyDescriptor() : ns(-1), nd(-1), scpyCnt(0), dcpyCnt(0), sz(0), sPartner(-1), dPartner(-1) { - omp_init_lock(&ghostLock); - } -}; - -class LocalConnection -{ - public: - int nscpy; //Number of cpy chunks - int ndcpy; //Number of cpy chunks - omp_lock_t sLock, dLock, ghostLock; - int firingRuleCnt; - int scpyCnt, dcpyCnt; - Barrier *localBarrier; - LocalCopyDescriptor *scpy; - LocalCopyDescriptor *dcpy; - LocalConnection() : nscpy(0), ndcpy(0), firingRuleCnt(0), scpy(NULL), dcpy(NULL), scpyCnt(0), dcpyCnt(0), localBarrier(NULL){ - omp_init_lock(&sLock); - omp_init_lock(&dLock); - omp_init_lock(&ghostLock); - } - - ~LocalConnection() - { - if(localBarrier) free(localBarrier); - if(scpy) free(scpy); - if(dcpy) free(dcpy); - } -}; -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/Make.package b/Src/AmrTask/rts_impls/mpi_omp/Make.package deleted file mode 100644 index a9cf8caa34d..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/Make.package +++ /dev/null @@ -1,11 +0,0 @@ -PERILLA_LIB=EXE - -C$(PERILLA_LIB)_sources += Barrier.cpp RGIter.cpp RegionQueue.cpp PackageQueue.cpp Perilla.cpp RegionGraph.cpp WorkerThread.cpp AsyncMultiFabUtil.cpp PerillaRts.cpp - -C$(PERILLA_LIB)_headers += Barrier.H Config.H LocalConnection.H PackageQueue.H RegionGraph.H RegionQueue.H RemoteConnection.H WorkerThread.H AsyncMultiFabUtil.H PerillaRts.H - -include $(AMREX_HOME)/Src/AmrTask/rts_impls/mpi_omp/perilla.mak -VPATH_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/mpi_omp -INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/mpi_omp - - diff --git a/Src/AmrTask/rts_impls/mpi_omp/PackageQueue.H b/Src/AmrTask/rts_impls/mpi_omp/PackageQueue.H deleted file mode 100755 index d521a4eff25..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/PackageQueue.H +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef P_PACKAGEQUEUE_H -#define P_PACKAGEQUEUE_H - -#include -#include -#include - -class Package -{ -private: - int source, destination; -public: - double *databuf; - int bufSize; - omp_lock_t packageLock; - bool completed; //message transfer is done - bool served; //message transfer request has been served but may have not completed - bool notified; - MPI_Request request; //!for MPI - Package(); - ~Package(); - Package(int size); - Package(int src, int dest); - Package(int src, int dest, int size); - void setPackageSource(int src); - void 
setPackageDestination(int dest); - void completeRequest(void); - void completeRequest(bool lockIgnore); - bool checkRequest(void); - void generatePackage(int size); -}; - -class PackageQueue -{ -private: - Package *buffer[perilla::MSG_QUEUE_DEFAULT_MAXSIZE]; - int n; - int front; - int rear; - int prear; -public: - omp_lock_t queueLock; - PackageQueue(); - ~PackageQueue(); - int queueSize(void); - int queueSize(bool lockIgnore); - void enqueue(Package* package); - void enqueue(Package* package, bool lockIgnore); - Package* dequeue(void); - Package* dequeue(bool lockIgnore); - Package* getRear(void); - Package* getRear(bool lockIgnore); - Package* getFront(void); - Package* getFront(bool lockIgnore); - void emptyQueue(); -}; -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/PackageQueue.cpp b/Src/AmrTask/rts_impls/mpi_omp/PackageQueue.cpp deleted file mode 100755 index 83009415007..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/PackageQueue.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include -#include -using namespace perilla; -#ifdef PERILLA_DEBUG -#include -extern PerillaMemCheck memcheck; -#endif - -Package::Package() -{ - databuf = 0; - bufSize = 0; - source = 0; - destination = 0; - completed = false; - notified = false; - served = false; - request = MPI_REQUEST_NULL; - omp_init_lock(&packageLock); -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::~Package() -{ - if(databuf) free(databuf); -#ifdef PERILLA_DEBUG - memcheck.remove(memcheck.genKey(this)); -#endif -} - -Package::Package(int size) -{ - databuf = new double[size]; - bufSize = size; - source = 0; - destination = 0; - completed = false; - notified = false; - served = false; - request = MPI_REQUEST_NULL; - omp_init_lock(&packageLock); -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::Package(int src, int dest) -{ - bufSize = 0; - source = src; - destination = dest; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::Package(int src, int dest, int size) -{ - source = src; - destination = dest; - databuf = new double[size]; - bufSize = size; - source = 0; - destination = 0; - completed = false; - notified = false; - served = false; - request = MPI_REQUEST_NULL; - omp_init_lock(&packageLock); -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -void Package::setPackageSource(int src) -{ - source = src; -} - -void Package::setPackageDestination(int dest) -{ - destination = dest; -} - -void Package::completeRequest(void) -{ - omp_set_lock(&packageLock); - completed = true; - omp_unset_lock(&packageLock); -} - -void Package::completeRequest(bool lockIgnore) -{ - if(!lockIgnore)omp_set_lock(&packageLock); - completed = true; - if(!lockIgnore)omp_unset_lock(&packageLock); -} - -bool Package::checkRequest(void) -{ - return completed; -} - -void Package::generatePackage(int size) -{ - databuf = new double[size]; - bufSize = size; - source = 0; - destination = 0; - completed = false; - notified = false; - served = false; - request = MPI_REQUEST_NULL; - omp_init_lock(&packageLock); -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -PackageQueue::PackageQueue() -{ - n = 0; - front = 0; - rear = 0; - prear = -1; - omp_init_lock(&queueLock); -} - -int PackageQueue::queueSize(void) -{ - int size; - omp_set_lock(&queueLock); - size = n; - omp_unset_lock(&queueLock); - return size; -} - 
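
The lockIgnore / canAvoidLock flags that recur in these queue methods let a caller that already holds queueLock (for example, a drain loop dequeuing under its own lock) skip re-acquiring a non-recursive lock. A minimal self-contained sketch of the pattern (MiniQueue, its capacity, and the int payload are illustrative stand-ins for Perilla's Package queue, not its real types; compile with -fopenmp):

    #include <omp.h>
    #include <cstdio>

    struct MiniQueue {
        enum { CAP = 16 };            // stand-in for MSG_QUEUE_DEFAULT_MAXSIZE
        int buf[CAP];
        int n = 0, front = 0, rear = 0;
        omp_lock_t lock;
        MiniQueue()  { omp_init_lock(&lock); }
        ~MiniQueue() { omp_destroy_lock(&lock); }
        void enqueue(int v, bool lockIgnore = false) {
            if (!lockIgnore) omp_set_lock(&lock);
            buf[rear] = v;
            rear = (rear + 1) % CAP;  // circular buffer: wrap instead of shifting
            ++n;
            if (!lockIgnore) omp_unset_lock(&lock);
        }
        int dequeue(bool lockIgnore = false) {
            if (!lockIgnore) omp_set_lock(&lock);
            int v = buf[front];
            front = (front + 1) % CAP;
            --n;
            if (!lockIgnore) omp_unset_lock(&lock);
            return v;
        }
        void drain() {                // takes the lock once, elides it inside
            omp_set_lock(&lock);
            while (n > 0) std::printf("%d ", dequeue(true));
            omp_unset_lock(&lock);
        }
    };

    int main() {
        MiniQueue q;
        q.enqueue(1); q.enqueue(2); q.enqueue(3);
        q.drain();                    // prints: 1 2 3 (FIFO order preserved)
        std::printf("\n");
        return 0;
    }
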
-int PackageQueue::queueSize(bool lockIgnore)
-{
-    int size;
-    if(!lockIgnore) omp_set_lock(&queueLock);
-    size = n;
-    if(!lockIgnore) omp_unset_lock(&queueLock);
-    return size;
-}
-
-void PackageQueue::enqueue(Package* package)
-{
-    omp_set_lock(&queueLock);
-    buffer[rear] = package;
-    prear = rear;
-    rear = (rear+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE;
-    n++;
-    omp_unset_lock(&queueLock);
-}
-
-void PackageQueue::enqueue(Package* package, bool lockIgnore)
-{
-    if(!lockIgnore) omp_set_lock(&queueLock);
-    buffer[rear] = package;
-    prear = rear;
-    rear = (rear+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE;
-    n++;
-    if(!lockIgnore) omp_unset_lock(&queueLock);
-}
-
-Package* PackageQueue::dequeue(void)
-{
-    Package* package = 0;
-    omp_set_lock(&queueLock);
-    package = buffer[front];
-    front = (front+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE;
-    n--;
-    omp_unset_lock(&queueLock);
-    return package;
-}
-
-Package* PackageQueue::dequeue(bool lockIgnore)
-{
-    Package* package = 0;
-    if(!lockIgnore) omp_set_lock(&queueLock);
-    if(n==0)  // underflow is about to happen; report the queue state
-        std::cout << "Q size " << n << " front " << front << std::endl;
-    package = buffer[front];
-    front = (front+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE;
-    n--;
-    if(!lockIgnore) omp_unset_lock(&queueLock);
-    return package;
-}
-#include
-#include
-#include
-#include
-#include
-
-using namespace std;
-
-namespace amrex
-{
-
-class Perilla
-{
-    static int tagGen(int src, int dest, int channelID, int nFabs, int nChannels);
-
-    public:
-    static int uTags;
-    static bool genTags;
-    static int max_step;
-
-    static std::map> pTagCnt;
-    static std::map>>>> tagMap;
-    static std::map>>> myTagMap;
-    static void clearTagMap();
-    static void clearMyTagMap();
-    static void communicateTags();
-
-    static volatile int numTeamsFinished;
-    static volatile int updateMetadata_request;
-    static volatile int updateMetadata_noticed;
-    static volatile int updateMetadata_done;
-    static Barrier * globalBarrier;
-    static void multifabBuildFabCon(RegionGraph* graph, const MultiFab& mf, const Periodicity& period);
-    static void serviceLocalRequests(RegionGraph *graph, int tg);
-    static void serviceRemoteRequests(RegionGraph *graph, int graphID, int nGraphs);
-    static void serviceRemoteRequests(RegionGraph *graph);
-    static void serviceSingleGraphComm(RegionGraph* graph, int tid);
-    static void serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, bool cpyAcross, int tid);
-    static void serviceMultipleGraphCommDynamic(std::vector graphArray, bool cpyAcross, int tid);
-    static void flattenGraphHierarchy(std::vector >graphArray, std::vector &flattenedGraphArray);
-    static void serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, int tid);
-    static void fillBoundaryPush(RegionGraph* graph, MultiFab* mf, int f);
-    static void fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f);
-    static void fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f, bool singleT);
-
-    static void multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& dmf, const MultiFab& smf, int nc, int ng, int ngSrc, const Periodicity& period);
-    static void multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& dmf, const MultiFab& smf, const Periodicity& period);
-    static void multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT);
-    static void multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, bool singleT);
-
-    static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int 
nc, int ng, int ngsrc, bool singleT); - static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, bool singleT); - - static void multifabCopyPush_1Team(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - - static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, bool singleT); - - static void serviceLocalGridCopyRequests(std::vector graphArray, int g, int tg); - static void serviceRemoteGridCopyRequests(std::vector graphArray, int g, int nGraph, int tg); - static void resetRemoteGridCopyRequests(std::vector graphArray, int g, int nGraph, int tg); - - static void fillBoundaryPush(amrex::RGIter& rgi, amrex::MultiFab& mf); - static void fillBoundaryPull(amrex::RGIter& rgi, amrex::MultiFab& mf, bool singleT); - static void fillBoundaryPush(amrex::RGIter& rgi, RegionGraph *graph, amrex::MultiFab& mf); - static void fillBoundaryPull(amrex::RGIter& rgi, RegionGraph *graph, amrex::MultiFab& mf, bool singleT); - -}; // class Perilla - - -}//end namespace - -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/Perilla.cpp b/Src/AmrTask/rts_impls/mpi_omp/Perilla.cpp deleted file mode 100755 index ade1cd8c6b3..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/Perilla.cpp +++ /dev/null @@ -1,2943 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; -using namespace amrex; -using namespace perilla; - -volatile int Perilla::numTeamsFinished = 0; -volatile int Perilla::updateMetadata_request = 0; -volatile int Perilla::updateMetadata_noticed = 0; -volatile int Perilla::updateMetadata_done = 0; -int Perilla::max_step=1; -std::map> Perilla::pTagCnt; -int Perilla::uTags=0; -bool Perilla::genTags=true; -std::map > > > > Perilla::tagMap; -std::map > > > Perilla::myTagMap; - -void Perilla::clearTagMap(){ - Perilla::tagMap.clear(); -} - -void Perilla::clearMyTagMap(){ - Perilla::myTagMap.clear(); -} - -void Perilla::communicateTags() -{ - int myProc = ParallelDescriptor::MyProc(); - int nPs = ParallelDescriptor::NProcs(); - typedef std::map tags_t; - typedef std::map> stags_t; - typedef std::map>> dstags_t; - typedef std::map>>> gdstags_t; - typedef std::map>>>> pgdstags_t; - - int** tags = new int*[nPs]; - int** rtags = new int*[nPs]; - int* rTagCnt = new int[nPs*2]; - int* sTagCnt = new int[nPs*2]; - - MPI_Request *srrequest; - srrequest = new MPI_Request[nPs]; - MPI_Request *ssrequest; - ssrequest = new MPI_Request[nPs]; - MPI_Request *trrequest; - trrequest = new MPI_Request[nPs]; - MPI_Request *tsrequest; - tsrequest = new MPI_Request[nPs]; - - std::vector proc_communicated; - - proc_communicated.resize(nPs); - for(int p=0; psecond.begin(); it2 != it1->second.end(); it2++) - { - tac++; - tac++; - ng++; - for(dstags_t::iterator it3 = it2->second.begin(); it3 != it2->second.end(); it3++) - for(stags_t::iterator it4 = it3->second.begin(); it4 != it3->second.end(); it4++) - for(tags_t::iterator it5 = it4->second.begin(); it5 != it4->second.end(); it5++) - { - tac+=4; - } - } - sTagCnt[it1->first*2] = tac; - sTagCnt[it1->first*2+1] = ng; - tags[it1->first] = new int[sTagCnt[it1->first*2]]; - 
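// The Isend just below ships a 2-int header (total ints to follow, number
// of graphs) on tag 0; the packed 4-int records follow on tag 1.  A minimal
// standalone sketch of that two-phase count-then-payload exchange (names
// here are illustrative, not the Perilla API):
#include <mpi.h>
#include <vector>

// Post both sends for one destination rank.  lenStorage must stay alive
// until the requests complete, since MPI_Isend does not copy the buffer.
inline void postTagSends(const std::vector<int>& payload, int dest,
                         int& lenStorage, MPI_Request reqs[2])
{
    lenStorage = static_cast<int>(payload.size());
    MPI_Isend(&lenStorage, 1, MPI_INT, dest, /*tag*/0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Isend(const_cast<int*>(payload.data()), lenStorage, MPI_INT, dest,
              /*tag*/1, MPI_COMM_WORLD, &reqs[1]);
}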
MPI_Isend(&sTagCnt[it1->first*2], 2, MPI_INT, it1->first, 0, MPI_COMM_WORLD, &ssrequest[it1->first]); - proc_communicated[it1->first]=true; - } - - for(int p=0; psecond.begin(); it2 != it1->second.end(); it2++) - { - tags[it1->first][tac++] = it2->first; - tags[it1->first][tac++] = pTagCnt[it1->first][it2->first]; - int gtagc = 0; - for(dstags_t::iterator it3 = it2->second.begin(); it3 != it2->second.end(); it3++) - for(stags_t::iterator it4 = it3->second.begin(); it4 != it3->second.end(); it4++) - for(tags_t::iterator it5 = it4->second.begin(); it5 != it4->second.end(); it5++) - { - tags[it1->first][tac++] = it3->first; - tags[it1->first][tac++] = it4->first; - tags[it1->first][tac++] = it5->first; - tags[it1->first][tac++] = it5->second; - gtagc++; - } - BL_ASSERT(pTagCnt[it1->first][it2->first] == gtagc); - } - MPI_Isend(tags[it1->first], tac, MPI_INT, it1->first, 1, MPI_COMM_WORLD, &tsrequest[it1->first]); - } - - - MPI_Status status; - for(int p=0; p 0) - { - rtags[p] = new int[rTagCnt[p*2]]; - MPI_Irecv(rtags[p], rTagCnt[p*2], MPI_INT, p , 1, MPI_COMM_WORLD, &trrequest[p]); - } - } - } - - // //MPI_Irecv(size) Wait - - - //MPI_recive tags arra - for(int p=0; p 0) - { - MPI_Wait( &trrequest[p], &status ); - int tCnt=0; - for(int g=0; g 0) - { - delete[] rtags[p]; - } - } - - - for(int p=0; p perilla::MAX_SQRT_TAG) maxRange= 1024; - return (src%maxRange)*maxRange + (dest%maxRange) + channelID*(perilla::MAX_SQRT_TAG*(perilla::MAX_SQRT_TAG+1)/nChannels); - - //int nfabs = 256; - - //if(src >= nfabs || dest>=nfabs) - // std::cout<<"Warnig Tag" << src << " " << dest << " "<= MPI_TAG_UB ) - std::cout << "Out of Bound tag " << (src%maxRange)*maxRange + (dest%maxRange) + channelID*(maxSR*(maxSR+1)/nChannels) << " " <size(); - const int n_snds_mf = TheFB.m_SndTags->size(); - const int n_rcvs_mf = TheFB.m_RcvTags->size(); - - Vector send_cctc; - Vector send_pr; - send_cctc.reserve(n_snds_mf); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheFB.m_SndTags->begin(), - m_End = TheFB.m_SndTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // Not destined to me. 
- { - send_pr.push_back(m_it->first); - send_cctc.push_back(&(m_it->second)); - } - } - - Vector recv_cctc; - Vector recv_pr; - recv_cctc.reserve(n_rcvs_mf); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheFB.m_RcvTags->begin(), - m_End = TheFB.m_RcvTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // I am not the source for this receipt - { - recv_pr.push_back(m_it->first); - recv_cctc.push_back(&(m_it->second)); - } - } -#pragma omp parallel shared(rg, mf, numfabs, np, TheFB, recv_cctc, send_cctc) - { - //int tg = WorkerThread::perilla_wid(); - int fg; - //if(WorkerThread::perilla_isCommunicationThread()) -#pragma omp single - { - //bool cc = !mf->is_nodal(); // cc = multifab_cell_centered_q(mf) - //mf->sMap.reserve(numfabs); - //mf->rMap.reserve(numfabs); - //std::cout<< "Allocating sMap and rMap" <alloc_lMap(mf); - rg->alloc_sMap(mf); - rg->alloc_rMap(mf); - } -#pragma omp barrier - //if(tid==0) - { - //bool cc = !mf->is_nodal(); // cc = multifab_cell_centered_q(mf) - //mf->sMap.reserve(numfabs); - //mf->rMap.reserve(numfabs); -#pragma omp for - for(int f=0; flMap[f]->l_con.nscpy = 0; - - //for(int i=0; il_con.ncpy; i++) - for(int i=0; il_con.cpy[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == tag.srcIndex) - rg->lMap[f]->l_con.nscpy++; - //if(f == local_index(mf,bxasc->l_con.cpy[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == tag.dstIndex) - rg->lMap[f]->l_con.ndcpy++; - } - /* - if(rg->lMap[f]->l_con.nscpy+rg->lMap[f]->l_con.ndcpy != n_loc_mf) - std::cout<< "Diff in Sum " << rg->lMap[f]->l_con.nscpy << " " <lMap[f]->l_con.ndcpy << " " << n_loc_mf <lMap[f]->l_con.nscpy+rg->lMap[f]->l_con.ndcpy == n_loc_mf); - */ - } - } - } -#pragma omp barrier - //now we know how many copying segments each fab owns as source and destination allocate memory for metadata -#pragma omp for - for(int f=0; flMap[f]->l_con.sLock)); - //omp_init_lock(&(rg->lMap[f]->l_con.dLock)); - //omp_init_lock(&(rg->lMap[f]->l_con.ghostLock)); - - //std::cout<< "MF l_con nscpy " <lMap[f]->l_con.nscpy << " ndcpy " << rg->lMap[f]->l_con.ndcpy <lMap[f]->l_con.scpy = new LocalCopyDescriptor[rg->lMap[f]->l_con.nscpy]; - rg->lMap[f]->l_con.dcpy = new LocalCopyDescriptor[rg->lMap[f]->l_con.ndcpy]; - rg->lMap[f]->l_con.scpyCnt = 0; - rg->lMap[f]->l_con.dcpyCnt = 0; - } - } -#pragma omp barrier - if(np > 1) - { -#pragma omp for - for(int f=0; flMap[f]->r_con.nrcv = 0; - rg->lMap[f]->r_con.nsnd = 0; - rg->lMap[f]->r_con.firingRuleCnt = 0; - - //for(int i=0; ir_con.nsnd; i++) - for(int i=0; ir_con.snd[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == it->srcIndex) - { - rg->lMap[f]->r_con.nsnd++; - } - } - } - //for(int i=0; ir_con.nrcv; i++) - for(int i=0; ir_con.rcv[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == it->dstIndex) - { - rg->lMap[f]->r_con.nrcv++; - } - } - } - //rg->sMap[f]->r_con.sndLock = new omp_lock_t; - //rg->rMap[f]->r_con.rcvLock = new omp_lock_t; - //omp_init_lock(rg->sMap[f]->r_con.sndLock); - //omp_init_lock(rg->rMap[f]->r_con.rcvLock); - rg->lMap[f]->r_con.snd = new RemoteCommDescriptor[rg->lMap[f]->r_con.nsnd]; - rg->lMap[f]->r_con.rcv = new RemoteCommDescriptor[rg->lMap[f]->r_con.nrcv]; - } - } - //if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - { -#pragma omp for - for(int f=0; frMap[f]->r_con.nrcv = 0; - rg->sMap[f]->r_con.nsnd = 0; - - //for(int i=0; ir_con.nsnd; i++) - for(int i=0; ir_con.snd[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == it->srcIndex) - { - rg->sMap[f]->r_con.nsnd++; - } - } - } - //for(int i=0; ir_con.nrcv; i++) - for(int 
i=0; ir_con.rcv[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == it->dstIndex) - { - rg->rMap[f]->r_con.nrcv++; - } - } - } - //rg->sMap[f]->r_con.sndLock = new omp_lock_t; - //rg->rMap[f]->r_con.rcvLock = new omp_lock_t; - //omp_init_lock(rg->sMap[f]->r_con.sndLock); - //omp_init_lock(rg->rMap[f]->r_con.rcvLock); - rg->sMap[f]->r_con.snd = new RemoteCommDescriptor[rg->sMap[f]->r_con.nsnd]; - rg->rMap[f]->r_con.rcv = new RemoteCommDescriptor[rg->rMap[f]->r_con.nrcv]; - } - } - } - } // omp parallel - //std::cout<< "counting done " <lMap[f]->l_con.localBarrier = new Barrier(perilla::NUM_THREADS_PER_TEAM-1); - // !create local communication meta data for sources and destinations - scnt = -1; - dcnt = -1; - //for(int i=0; il_con.ncpy; i++) - for(int i=0; il_con.cpy[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == tag.srcIndex) - { - scnt++; - //omp_init_lock(&(rg->lMap[f]->l_con.scpy[scnt].ghostLock)); - rg->lMap[f]->l_con.scpy[scnt].ns = mf.localindex(tag.srcIndex); //local_index(mf,bxasc->l_con.cpy[i].ns); //LocalIndex - rg->lMap[f]->l_con.scpy[scnt].nd = mf.localindex(tag.dstIndex); //local_index(mf,bxasc->l_con.cpy[i].nd); //LocalIndex - rg->lMap[f]->l_con.scpy[scnt].sbx = tag.sbox; //bxasc->l_con.cpy[i].sbx; - rg->lMap[f]->l_con.scpy[scnt].dbx = tag.dbox; //bxasc->l_con.cpy[i].dbx; - // !create queues for ghost cells - //call queue_init(mf%fbs(f)%l_con%scpy(scnt)%pQueue) - //call queue_init(mf%fbs(f)%l_con%scpy(scnt)%recycleQueue) - int psize = tag.sbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? - /* - p => dataptr(mf%fbs(f), mf%fbs(f)%l_con%scpy(scnt)%sbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - for(int p=0; pdatabuf[j] = 0; - rg->lMap[f]->l_con.scpy[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->l_con.scpy[scnt].recycleQueue.enqueue(rg->lMap[f]->l_con.scpy[scnt].pQueue.dequeue()); - } - //if(f == local_index(mf,bxasc->l_con.cpy[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == tag.dstIndex) - { - dcnt++; - rg->lMap[f]->l_con.dcpy[dcnt].ns = mf.localindex(tag.srcIndex); //local_index(mf,bxasc->l_con.cpy[i].ns); //LocalIndex - rg->lMap[f]->l_con.dcpy[dcnt].nd = mf.localindex(tag.dstIndex); //local_index(mf,bxasc->l_con.cpy[i].nd); //LocalIndex - rg->lMap[f]->l_con.dcpy[dcnt].sbx = tag.sbox; //bxasc->l_con.cpy[i].sbx; - rg->lMap[f]->l_con.dcpy[dcnt].dbx = tag.dbox; //bxasc->l_con.cpy[i].dbx; - //call queue_init(mf%fbs(f)%l_con%dcpy(dcnt)%pQueue) - //call queue_init(mf%fbs(f)%l_con%dcpy(dcnt)%recycleQueue) - int psize = tag.dbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? 
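// Package sizing rule used throughout: one Package buffers
// box.numPts() * nComp() doubles, so a 32x32x32 box with 2 components needs
// 32*32*32*2 = 65,536 doubles (512 KB).  The pool-priming loops above lost
// their bounds in extraction; a sketch of the intended fill-then-park
// pattern, with the pool depth as an assumed parameter:
static void primePackagePool(PackageQueue& pQ, PackageQueue& recycleQ,
                             int psize, int poolDepth /* assumed constant */)
{
    for (int p = 0; p < poolDepth; p++) {
        Package* pkg = new Package(psize);              // psize doubles
        for (int j = 0; j < psize; j++) pkg->databuf[j] = 0;
        pQ.enqueue(pkg);                                // live queue first
    }
    for (int p = 0; p < poolDepth; p++)                 // park all in recycle
        recycleQ.enqueue(pQ.dequeue());
}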
- /* - p => dataptr(mf%fbs(f), mf%fbs(f)%l_con%dcpy(dcnt)%dbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - - for(int p=0; pdatabuf[j] = 0; - rg->lMap[f]->l_con.dcpy[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->l_con.dcpy[dcnt].recycleQueue.enqueue(rg->lMap[f]->l_con.dcpy[dcnt].pQueue.dequeue()); - } - } // for(ilMap[f]->l_con.nscpy; i++) - for(int j=0; jlMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.ndcpy; j++) - if(rg->lMap[f]->l_con.scpy[i].dbx == rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[j].dbx) - rg->lMap[f]->l_con.scpy[i].dPartner = j; - - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - for(int j=0; jlMap[rg->lMap[f]->l_con.dcpy[i].ns]->l_con.nscpy; j++) - if(rg->lMap[f]->l_con.dcpy[i].dbx == rg->lMap[rg->lMap[f]->l_con.dcpy[i].ns]->l_con.scpy[j].dbx) - rg->lMap[f]->l_con.dcpy[i].sPartner = j; - } - } - - if(np == 1) return; - - //std::cout<< "local init done" <lMap[f]->r_con.sndLock = new omp_lock_t; - //rg->lMap[f]->r_con.rcvLock = new omp_lock_t; - //omp_init_lock(rg->lMap[f]->r_con.sndLock); - //omp_init_lock(rg->lMap[f]->r_con.rcvLock); - //rg->lMap[f]->r_con.snd = new RemoteCommDescriptor[rg->lMap[f]->r_con.nsnd]; - //rg->lMap[f]->r_con.rcv = new RemoteCommDescriptor[rg->lMap[f]->r_con.nrcv]; - nrcv= -1; - //for(int i=0; ir_con.nrcv; i++) - for(int i=0; ir_con.rcv[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == it->dstIndex) - { - nrcv++; - rg->lMap[f]->r_con.rcv[nrcv].ns = it->srcIndex; //bxasc->r_con.rcv[i].ns; - //rg->lMap[f]->r_con.rcv[nrcv].lnd = ; //local_index(mf,bxasc->r_con.rcv[i].nd); // not used anywhere so deferred ---------???????? - //rg->lMap[f]->r_con.rcv[nrcv].lns = -1; //undefined - rg->lMap[f]->r_con.rcv[nrcv].nd = it->dstIndex; //bxasc->r_con.rcv[i].nd; - rg->lMap[f]->r_con.rcv[nrcv].lnd = mf.localindex(it->dstIndex); - rg->lMap[f]->r_con.rcv[nrcv].lns = mf.localindex(it->srcIndex); - rg->lMap[f]->r_con.rcv[nrcv].sbx = it->sbox; //bxasc->r_con.rcv[i].sbx; - rg->lMap[f]->r_con.rcv[nrcv].dbx = it->dbox; //bxasc->r_con.rcv[i].dbx; - rg->lMap[f]->r_con.rcv[nrcv].pr = pr; //bxasc->r_con.rcv[i].pr; - rg->lMap[f]->r_con.rcv[nrcv].cnt = 0; - //!create queues for ghost cells - //call queue_init(mf%fbs(f)%r_con%rcv(nrcv)%pQueue) - //call queue_init(mf%fbs(f)%r_con%rcv(nrcv)%recycleQueue) - int psize = it->sbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? - /* - p => dataptr(mf%fbs(f), mf%fbs(f)%r_con%rcv(nrcv)%dbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - rg->lMap[f]->r_con.rcv[nrcv].sz = psize; - for(int p=0; pdatabuf[j] = 0; - rg->lMap[f]->r_con.rcv[nrcv].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->r_con.rcv[nrcv].recycleQueue.enqueue(rg->lMap[f]->r_con.rcv[nrcv].pQueue.dequeue()); - } - } - } // for(ir_con.nsnd; i++) - for(int i=0; ir_con.snd[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == it->srcIndex ) - { - nsnd++; - rg->lMap[f]->r_con.snd[nsnd].ns = it->srcIndex; //bxasc->r_con.snd[i].ns; - rg->lMap[f]->r_con.snd[nsnd].nd = it->dstIndex; //bxasc->r_con.snd[i].nd; - //rg->lMap[f]->r_con.snd[nsnd].lns = ; //local_index(mf,bxasc->r_con.snd[i].ns); //not used anywhere so deferred ------????????? 
- //rg->lMap[f]->r_con.snd[nsnd].lnd = -1; //undefined - rg->lMap[f]->r_con.snd[nsnd].lns = mf.localindex(it->srcIndex); - rg->lMap[f]->r_con.snd[nsnd].lnd = mf.localindex(it->dstIndex); - rg->lMap[f]->r_con.snd[nsnd].sbx = it->sbox; //bxasc->r_con.snd[i].sbx; - rg->lMap[f]->r_con.snd[nsnd].dbx = it->dbox; //bxasc->r_con.snd[i].dbx; - rg->lMap[f]->r_con.snd[nsnd].pr = pr; //bxasc->r_con.snd[i].pr; - rg->lMap[f]->r_con.snd[nsnd].cnt = 0; - //!create queues for ghost cells - //call queue_init(mf%fbs(f)%r_con%snd(nsnd)%pQueue) - //call queue_init(mf%fbs(f)%r_con%snd(nsnd)%recycleQueue) - int psize = it->sbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? - /* - p => dataptr(mf%fbs(f), mf%fbs(f)%r_con%snd(nsnd)%sbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - rg->lMap[f]->r_con.snd[nsnd].sz = psize; - for(int p=0; pdatabuf[j] = 0; - rg->lMap[f]->r_con.snd[nsnd].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->r_con.snd[nsnd].recycleQueue.enqueue(rg->lMap[f]->r_con.snd[nsnd].pQueue.dequeue()); - - //std::cout<< "RQ f "<< f << " i "<< nsnd <lMap[f]->r_con.nsnd; i++) - { - rg->sMap[f]->r_con.snd[i].ns = rg->lMap[f]->r_con.snd[i].ns; - rg->sMap[f]->r_con.snd[i].nd = rg->lMap[f]->r_con.snd[i].nd; - rg->sMap[f]->r_con.snd[i].lns = rg->lMap[f]->r_con.snd[i].lns; - rg->sMap[f]->r_con.snd[i].lnd = rg->lMap[f]->r_con.snd[i].lnd; - rg->sMap[f]->r_con.snd[i].r_gid = rg->graphID-1; - rg->sMap[f]->r_con.snd[i].r_grids = rg->numFabs; - rg->sMap[f]->r_con.snd[i].sbx = rg->lMap[f]->r_con.snd[i].sbx; - rg->sMap[f]->r_con.snd[i].dbx = rg->lMap[f]->r_con.snd[i].dbx; - rg->sMap[f]->r_con.snd[i].pr = rg->lMap[f]->r_con.snd[i].pr; - rg->sMap[f]->r_con.snd[i].sz = rg->lMap[f]->r_con.snd[i].sz; - rg->sMap[f]->r_con.snd[i].cnt = 0; - rg->lMap[f]->r_con.snd[i].cnt = 0; - - for(int p=0; plMap[f]->r_con.snd[i].sz); - for(int j=0; jlMap[f]->r_con.snd[i].sz; j++) - tmpPkg->databuf[j] = 0; - rg->sMap[f]->r_con.snd[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; psMap[f]->r_con.snd[i].recycleQueue.enqueue(rg->sMap[f]->r_con.snd[i].pQueue.dequeue()); - } - for(int i=0; ilMap[f]->r_con.nrcv; i++) - { - rg->rMap[f]->r_con.rcv[i].ns = rg->lMap[f]->r_con.rcv[i].ns; - rg->rMap[f]->r_con.rcv[i].nd = rg->lMap[f]->r_con.rcv[i].nd; - rg->rMap[f]->r_con.rcv[i].lns = rg->lMap[f]->r_con.rcv[i].lns; - rg->rMap[f]->r_con.rcv[i].lnd = rg->lMap[f]->r_con.rcv[i].lnd; - rg->rMap[f]->r_con.rcv[i].r_gid = rg->graphID-1; - rg->rMap[f]->r_con.rcv[i].r_grids = rg->numFabs; - rg->rMap[f]->r_con.rcv[i].sbx = rg->lMap[f]->r_con.rcv[i].sbx; - rg->rMap[f]->r_con.rcv[i].dbx = rg->lMap[f]->r_con.rcv[i].dbx; - rg->rMap[f]->r_con.rcv[i].pr = rg->lMap[f]->r_con.rcv[i].pr; - rg->rMap[f]->r_con.rcv[i].sz = rg->lMap[f]->r_con.rcv[i].sz; - rg->rMap[f]->r_con.rcv[i].cnt = 0; - rg->lMap[f]->r_con.rcv[i].cnt = 0; - - if(Perilla::genTags) - { - try - { - int rcv_pr = rg->rMap[f]->r_con.rcv[i].pr; - int dstIndex = rg->rMap[f]->r_con.rcv[i].nd; - int srcIndex = rg->rMap[f]->r_con.rcv[i].ns; - int psize = rg->rMap[f]->r_con.rcv[i].sz; - std::map::iterator itr = tagMap[rcv_pr][rg->graphID-1][dstIndex][srcIndex].find(psize); - if( itr != tagMap[rcv_pr][rg->graphID-1][dstIndex][srcIndex].end()) - { - //rg->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = itr->second; - } - else - { - tagMap[rcv_pr][rg->graphID-1][dstIndex][srcIndex][psize] = Perilla::uTags++; - //rg->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = Perilla::uTags++; - std::map::iterator itr2 = 
pTagCnt[rcv_pr].find(rg->graphID-1); - if(itr2 != pTagCnt[rcv_pr].end()) - pTagCnt[rcv_pr][rg->graphID-1] = pTagCnt[rcv_pr][rg->graphID-1] + 1; - else - pTagCnt[rcv_pr][rg->graphID-1] = 1; - } - } - catch(std::exception& e) - { - std::cout <<"Inside tagGeneration gID "<< rg->graphID <<" "<< e.what() << '\n'; - } - } - //tagMap[rcv_pr][rg->graphID][it->dstIndex][it->srcIndex] = pTagCnt[rcv_pr]; - - for(int p=0; plMap[f]->r_con.rcv[i].sz); - for(int j=0; jlMap[f]->r_con.rcv[i].sz; j++) - tmpPkg->databuf[j] = 0; - rg->rMap[f]->r_con.rcv[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; prMap[f]->r_con.rcv[i].recycleQueue.enqueue(rg->rMap[f]->r_con.rcv[i].pQueue.dequeue()); - } - } - }// if(tid==0) - - }// omp parallel -}// multifabBuildFabCon - -void Perilla::serviceLocalRequests(RegionGraph* rg, int tg) -{ - int numfabs = rg->lMap.size(); - - for(int f=0; fgraphID <lMap[f]->l_con.sLock)); - //if(lockSucceeded != 0) // 0-Fail, otherwise-Succeed - { - //if(graph->graphID == 1) - //if(tg == 0) - //std::cout<<"I am tg " << tg << " processing " << f << " in Graph " << graph->graphID <graphID == 1 && (f == 2 || f == 1) ) - { - std::cout<< "serviceLR for gID 1 f " << f << " nscpy "<< rg->lMap[f]->l_con.nscpy << std::endl; - for(int i=0; ilMap[f]->l_con.nscpy; i++) - std::cout<< " " << rg->lMap[f]->l_con.scpy[i].nd << " " << rg->lMap[f]->l_con.scpy[i].dPartner << " " << rg->lMap[f]->l_con.scpy[i].pQueue.queueSize(); - std::cout<< std::endl; - }*/ - for(int i=0; ilMap[f]->l_con.nscpy; i++){ - - //std::cout<< "serviceLR nscpy " << rg->lMap[f]->l_con.nscpy <graphID == 1 && rg->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout<< "Processing gID 1 nd 1 from f " << f << " i " << i << std::endl; - - if(rg->lMap[f]->l_con.scpy[i].pQueue.queueSize()>0) - { - omp_set_lock(&(rg->lMap[f]->l_con.sLock)); - Package *sPackage = rg->lMap[f]->l_con.scpy[i].pQueue.dequeue(); - if(perilla::LAZY_PUSH) - { - // Implemetation deffered. Currently not required - } - //if(graph->graphID == 1 && rg->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout<< "Processing gID 1 nd 1 from f " << f << " i " << i << std::endl; - omp_set_lock(&(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - int dPartner = rg->lMap[f]->l_con.scpy[i].dPartner; - - //if(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.queueSize() == 0 ) - if(dPartner == -1) - std::cout<< " Caution rQ size dPrtn "<< rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.ndcpy << " " << dPartner <<" graph ID " <graphID<lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.queueSize() <lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - - //for(int j=0; jbufSize; j++) - //dPackage->databuf[j] = sPackage->databuf[j]; //copy data------------------------------??????????????? 
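// The commented-out element loop above was replaced by the single
// std::memcpy on the next line.  The source and destination boxes of one
// copy tag span the same number of points, so
// dPackage->bufSize * sizeof(double) is the right byte count for either
// buffer; a defensive variant would first
//     assert(dPackage->bufSize == sPackage->bufSize);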
- - std::memcpy(dPackage->databuf, sPackage->databuf, dPackage->bufSize * sizeof(double)); - - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.queueSize(true)==1) - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.firingRuleCnt++; - //if(graph->graphID == 1 && rg->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout << "gID 1 frc " << rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.firingRuleCnt << " df " << rg->lMap[f]->l_con.scpy[i].nd <lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - - //if(graph->graphID == 1) - //std::cout<< "Processed gID 1 f " << rg->lMap[f]->l_con.scpy[i].nd << std::endl; - - rg->lMap[f]->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - }} - omp_unset_lock(&(rg->lMap[f]->l_con.sLock)); -#pragma omp flush - }// if(!lock succeedded) - if(perilla::LAZY_PUSH) - { - // Implemetation deffered. Currently not required - } - }// if(tg==fg) - }// for(frMap.size(); - - // !we first post send and receive - for(int f=0; frMap[f]->r_con.rcvLock); - //if(lockSucceeded != 0) - { - //if(omp_test_lock(rg->lMap[f]->r_con.rcvLock) != 0) - { - for(int i=0; ilMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed - nextsReq = true; - else - { - Package *rearPackage = rg->rMap[f]->r_con.rcv[i].pQueue.getRear(true);//!CHECK THIS POINT LATER - if(rearPackage->completed && rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) //!latest receive request has been completed - nextsReq = true; - else //!expected message is still on the way - nextsReq = false; - } - if(nextsReq) //!take a message from recycle pool and post a receive - { - omp_set_lock((rg->rMap[f]->r_con.rcvLock)); - omp_set_lock((rg->lMap[f]->r_con.rcvLock)); - int ns = rg->rMap[f]->r_con.rcv[i].ns; - int nd = rg->rMap[f]->r_con.rcv[i].nd; - int lnd = rg->rMap[f]->r_con.rcv[i].lnd; - int r_grids = rg->rMap[f]->r_con.rcv[i].r_grids; - //!create a package to keep track of receive requests - Package *rMetaPackage = rg->rMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - //!extract a package from the recycle pool at the destination NUMA node to buffer incoming data - Package *rPackage = rg->lMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - //tag = tagGen(mf%rMap(f)%r_con%rcv(i)%ns, mf%rMap(f)%r_con%rcv(i)%nd, gid, parallel_nprocs()*nfabs(mf), ngr)---------?????? - //int tag = tagGen(rg->rMap[f]->r_con.rcv[i].ns, rg->rMap[f]->r_con.rcv[i].nd, graphID-1, np*numfabs, nGraphs); - int tag = tagMap[rg->rMap[f]->r_con.rcv[i].pr][graphID][nd][ns][rg->rMap[f]->r_con.rcv[i].sz]; - - rMetaPackage->request = MPI_REQUEST_NULL; - rg->lMap[f]->r_con.rcv[i].pQueue.enqueue(rPackage,true); //!this is not done yet - rg->rMap[f]->r_con.rcv[i].pQueue.enqueue(rMetaPackage,true); //!this is not done yet - //rMetaPackage->request = parallel_irecv_dv(rpackage%ptr%dataBuf,mf%rMap(f)%r_con%rcv(i)%sz, mf%rMap(f)%r_con%rcv(i)%pr, tag) --------- ???? 
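// ParallelDescriptor::Arecv below is AMReX's wrapper around MPI_Irecv; the
// raw-MPI equivalent of this post would be (sketch; the databuf elements
// are doubles, as the memcpy * sizeof(double) usage elsewhere implies):
//     MPI_Irecv(rPackage->databuf, rg->rMap[f]->r_con.rcv[i].sz, MPI_DOUBLE,
//               rg->rMap[f]->r_con.rcv[i].pr, tag, MPI_COMM_WORLD,
//               &rMetaPackage->request);
// The meta package tracks the request; the data package owns the bytes.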
- rMetaPackage->request = ParallelDescriptor::Arecv(rPackage->databuf, - rg->rMap[f]->r_con.rcv[i].sz, - rg->rMap[f]->r_con.rcv[i].pr, tag).req(); // tag == SeqNum in c++ ver - omp_unset_lock((rg->lMap[f]->r_con.rcvLock)); - omp_unset_lock((rg->rMap[f]->r_con.rcvLock)); - } - } - //omp_unset_lock(rg->lMap[f]->r_con.rcvLock); - }// if(omp_test_lock) - //omp_unset_lock(rg->rMap[f]->r_con.rcvLock); - }// if(lockSucceeded) - }// for(fsMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) == 0) //then !no message has been issued or all send requests have been fulfilled - nextrReq = false; - else - nextrReq = true; - - if(nextrReq) - { - Package *sMetaPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(!sMetaPackage->served) - { - Package *sPackage = rg->lMap[f]->r_con.snd[i].pQueue.getFront(true); - sMetaPackage->completed = false; - sMetaPackage->served = true; - sMetaPackage->request = MPI_REQUEST_NULL; - int ns = rg->sMap[f]->r_con.snd[i].ns; - int nd = rg->sMap[f]->r_con.snd[i].nd; - int r_gid = rg->sMap[f]->r_con.snd[i].r_gid; - int r_grids = rg->sMap[f]->r_con.snd[i].r_grids; - //tag = tagGen(mf%sMap(f)%r_con%snd(i)%ns, mf%sMap(f)%r_con%snd(i)%nd, gid, parallel_nprocs()*nfabs(mf), ngr) -??????? - //int tag = tagGen(rg->sMap[f]->r_con.snd[i].ns, rg->sMap[f]->r_con.snd[i].nd, graphID-1, np*numfabs, nGraphs); - int tag = Perilla::myTagMap[r_gid][nd][ns][rg->sMap[f]->r_con.snd[i].sz]; - //int tag = myTagMap[graphID-1][rg->sMap[f]->r_con.snd[i].nd][rg->sMap[f]->r_con.snd[i].ns]; - //sMetaPackage%ptr%request = parallel_isend_dv(spackage%ptr%dataBuf,mf%sMap(f)%r_con%snd(i)%sz, mf%sMap(f)%r_con%snd(i)%pr, tag) --????? - sMetaPackage->request = ParallelDescriptor::Asend(sPackage->databuf, - rg->sMap[f]->r_con.snd[i].sz, - rg->sMap[f]->r_con.snd[i].pr, tag).req(); // tag == SeqNum in c++ ver - } - } - } // for(irMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) > 0) //!all messages before rear have completed - { - //if(omp_test_lock(rg->lMap[f]->r_con.rcvLock) != 0) // 0-Fail, otherwise-Succeed - { - Package *rearPackage = rg->rMap[f]->r_con.rcv[i].pQueue.getRear(true); - if(!rearPackage->completed) - { - bool flag = false; - int ret_flag; - MPI_Status status; - - std::cout<< "myP "<< myProc << " f "<< f << " i "<< i<< " Req "<request << std::endl; - - ParallelDescriptor::Test(rearPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true;//parallel_test_one(rearPackage%ptr%request) -------??????? - if(flag) - { - omp_set_lock((rg->lMap[f]->r_con.rcvLock)); - rearPackage->completeRequest(); - rg->lMap[f]->r_con.rcv[i].pQueue.getRear()->completeRequest(); - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) - rg->lMap[f]->r_con.firingRuleCnt++; - omp_unset_lock((rg->lMap[f]->r_con.rcvLock)); -#pragma omp flush - } - } - //omp_unset_lock(rg->lMap[f]->r_con.rcvLock); - } // if(omp_test_lock) - } // if(queueSize > 0) - } // for(ilMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) > 0) - { - Package *frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(frontPackage->served && !frontPackage->completed) //!latest receive request has NOT been completed - { - bool flag = false; - int ret_flag; - MPI_Status status; - ParallelDescriptor::Test(frontPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true;//parallel_test_one(frontPackage%ptr%request) -------??????? 
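// ParallelDescriptor::Test is a thin wrapper over MPI_Test: ret_flag comes
// back nonzero exactly when the nonblocking operation has completed, i.e.
//     MPI_Test(&frontPackage->request, &ret_flag, &status);
// Only a completed *front* package is dequeued and recycled below, which
// keeps the send queue strictly FIFO.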
- if(flag) - { - omp_set_lock(rg->sMap[f]->r_con.sndLock); - frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - frontPackage->notified = false; - rg->sMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - omp_unset_lock(rg->sMap[f]->r_con.sndLock); -#pragma omp flush - omp_set_lock(rg->lMap[f]->r_con.sndLock); - frontPackage = rg->lMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - rg->lMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - omp_unset_lock(rg->lMap[f]->r_con.sndLock); - } - } - } // if(queueSize > 0) - } // for(itotalFinishes < perilla::NUM_THREAD_TEAMS) - { - serviceLocalRequests(graph, tg); - if((np>1) & (tg==0)) - serviceRemoteRequests(graph); - } - else - { - if(tg==0) - { - while(graph->totalFinishes < perilla::NUM_THREAD_TEAMS) - { -#pragma omp flush (graph) - } - //call parallel_barrier() ---???????? - ParallelDescriptor::Barrier("serviceSingleGraph-1"); - graph->graphTeardown(); - graph->workerTeardown(); - //call parallel_barrier() ------????????? - ParallelDescriptor::Barrier("serviceSingleGraph-2"); - } - break; - } - } // while(true) - -} //serviceSingleGraphComm - -void Perilla::serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, bool cpyAcross, int tid) -{ - int tg = WorkerThread::perilla_wid(); - int np = ParallelDescriptor::NProcs(); - int graphFinishCnt = 0; - while(true) - { - for(int g=0; g 1) - if(tg==0) - { - serviceRemoteRequests(&graphArray[g],g,nGraphs); - //if(cpyAcross) - //serviceRemoteGridCopyRequests(graphArray,g,nGraphs,tg); - } - } - } - //!check if we have finished all the graph execution - bool noMoreWork = true; - for(int g=0; g > graphArrayHierarchy, std::vector &graphArray){ - int gCnt=0; - for(int l=0; l graphArray, bool cpyAcross, int tid) -{ - int tg = WorkerThread::perilla_wid(); - int np = ParallelDescriptor::NProcs(); - int myProc = ParallelDescriptor::MyProc(); - int graphFinishCnt = 0; - int nGraphs; - bool doublechecked = false; - - double maxltime=0; - double minltime=10; - double avgltime=0; - double numloops=0; - double ltime,lstime,letime; - - - //while(true) - { - //lstime = omp_get_wtime(); - for(int g=0; ggraphID==13) - //std::cout<<"Processing Local GridCopy Req Graph "<< g+1 << " tg " << tg <totalFinishes < perilla::NUM_THREAD_TEAMS) - { - /*try{ - if(graphArray[g]->assocMF == 0) - std::cout<<"Processing Graph with NULL MF "<graphID==1) - //std::cout<<"Processing Local Req Graph "<graphID==13) - //std::cout<<"Processing Local GridCopy Req Graph "<< g+1 << " tg " << tg < 1) - //if(tg==0) - { - serviceRemoteRequests(graphArray[g],g,nGraphs); - if(cpyAcross) - { - //resetRemoteGridCopyRequests(graphArray,g,nGraphs,tg); - if(tg==0) - serviceRemoteGridCopyRequests(graphArray,g,nGraphs,tg); - } - } - } - } - /* - //!check if we have finished all the graph execution - bool noMoreWork = true; - //std::cout<<"Graph Not Completed "; - for(int g=0; gtotalFinishes < perilla::NUM_THREAD_TEAMS) - { - noMoreWork = false; - //if(tg==0) - //std::cout<< g << " tfs " << graphArray[g]->totalFinishes << std::endl; - } - //else - // std::cout<<"Graph Completed "<< g < ::max(), '\n' ); - - //for(int g=0; gtotalFinishes << " | "; - - - //f( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - //{ - // if(doublechecked) // double check if there are still something to send - // break; 
- // else - // doublechecked = true; - //} - - //std::cout<<"Teams Completed "<< Perilla::numTeamsFinished << " tid "<< tid << " myProc " << myProc < maxltime) - // maxltime = ltime; - - } // while(true) - - //if(myProc==0) - //std::cout<< std::endl << "COMM HANDLER TIMES tg" << tg << " avg " << avgltime/numloops << " min " << minltime << " max " << maxltime <graphTeardown(tg); - //graphArray[g]->workerTeardown(tg); - //ParallelDescriptor::Barrier("serviceMultipleGraph-2"); - //} - -} // serviceMultipleGraphCommDynamic - - -void Perilla::serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, int tid) -{ - serviceMultipleGraphComm(graphArray,nGraphs,false,tid); -} // serviceMultipleGraphComm - -void Perilla::fillBoundaryPush(RegionGraph* graph, MultiFab* mf, int f) -{ - - int nComp = mf->nComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - //if(graph->graphID == 1 && f == 1) - //std::cout << "fillBPush for gID 1 f 1 ntid "<< ntid <lMap[f]->l_con.sLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(perilla::PACKING_FINEGRAIN) - {} - else - { - for(int i=0; ilMap[f]->l_con.nscpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - - //if(graph->graphID == 1 && graph->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout << "fillBPush for gID 1 nd 1 pQenQ f " << f << " i " << i <lMap[f]->l_con.scpy[i].recycleQueue.getFront(true); - mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->l_con.scpy[i].sbx,0,nComp,sPackage->databuf); - - for(int d=0; dbufSize; d++) - if(sPackage->databuf[d] == 0) - { - //std::cout<< "in fbPush Sending 0 from f "<< f <databuf[d] != 0); - } - //if(graph->lMap[f]->l_con.scpy[i].sbx.smallEnd() == graph->lMap[f]->l_con.scpy[i].sbx.bigEnd()) - //if(graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(0)==7 && graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(1)==7 && graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(2)==4) - // std::cout<< "Corner Push for f "<< f << " data0 " <databuf[0]<< " size " <bufSize << " se "<< graph->lMap[f]->l_con.scpy[i].sbx.smallEnd() <worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - //if(graph->graphID == 1 && f == 1) - //std::cout << "fillBPush for gID 1 f 1 pQ enQ" <lMap[f]->l_con.nscpy; i++) - { - //if(graph->graphID == 1 && graph->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout << "fillBPush for gID 1 nd 1 pQ enQ from f "<< f <lMap[f]->l_con.scpy[i].pQueue.enqueue( graph->lMap[f]->l_con.scpy[i].recycleQueue.dequeue(true),true ); - } - omp_unset_lock(&(graph->lMap[f]->l_con.sLock)); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - } // if(LAZY_PUSH) - else - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - omp_set_lock(graph->lMap[f]->r_con.sndLock); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->r_con.nsnd; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1))==ntid) - { - //std::cout << "RQS " << graph->lMap[f]->r_con.snd[i].recycleQueue.queueSize() << std::endl; - - Package *sndPackage = graph->lMap[f]->r_con.snd[i].recycleQueue.dequeue(true); - mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->r_con.snd[i].sbx,0,nComp,sndPackage->databuf); - sndPackage->notified = false; - graph->lMap[f]->r_con.snd[i].pQueue.enqueue( sndPackage,true ); - //!the local message handler will detect the change and notify the remote message 
handler =>read access - //!the remote message handler first modifies the front item of this queue, then it push this item back to the message pool - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - if(ntid==0) - { - omp_unset_lock(graph->lMap[f]->r_con.sndLock); - omp_set_lock(graph->sMap[f]->r_con.sndLock); - for(int i=0; ilMap[f]->r_con.nsnd; i++) - graph->sMap[f]->r_con.snd[i].pQueue.enqueue( graph->sMap[f]->r_con.snd[i].recycleQueue.dequeue(true),true ); - omp_unset_lock(graph->sMap[f]->r_con.sndLock); - } - -} // fillBoundaryPush - -void Perilla::fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f) -{ - - int nComp = mf->nComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - if(ntid==0) - omp_set_lock(&(graph->lMap[f]->l_con.dLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(perilla::LAZY_PUSH) - { } - else - { - if(perilla::UNPACKING_FINEGRAIN) - {} - else - { - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *dPackage = graph->lMap[f]->l_con.dcpy[i].pQueue.getFront(true); - - for(int d=0; dbufSize; d++) - if(dPackage->databuf[d] == 0) - { - //std::cout<< "in fbPull Reciving 0 for f "<< f <databuf[d] != 0); - } - /* - if(f==0) - //if(graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd() == graph->lMap[f]->l_con.dcpy[i].dbx.bigEnd()) - //if(graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(0)==-1 && graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(1)==-1 && graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(2)==4) - std::cout<< "Corner Pull for f "<< f << " data0 " <databuf[0]<< " size " <bufSize <<" se " <lMap[f]->l_con.dcpy[i].dbx.smallEnd()<m_fabs_v[f]->copyFromMem(graph->lMap[f]->l_con.dcpy[i].dbx,0,nComp,dPackage->databuf); - } - } // if(UNPACKING_FINEGRAIN) - else - } // if(LAZY_PUSH) - else - - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - graph->lMap[f]->l_con.dcpy[i].recycleQueue.enqueue( graph->lMap[f]->l_con.dcpy[i].pQueue.dequeue(true),true ); - - graph->lMap[f]->l_con.firingRuleCnt = graph->lMap[f]->l_con.firingRuleCnt - graph->lMap[f]->l_con.ndcpy; - - graph->lMap[f]->l_con.scpyCnt = 0; - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if(graph->lMap[f]->l_con.dcpy[i].pQueue.queueSize(true) >= 1) - graph->lMap[f]->l_con.firingRuleCnt++; - omp_unset_lock(&(graph->lMap[f]->l_con.dLock)); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - { - omp_set_lock(graph->rMap[f]->r_con.rcvLock); - omp_set_lock(graph->lMap[f]->r_con.rcvLock); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->r_con.nrcv; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *rcvMetaPackage = graph->rMap[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - graph->rMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - Package *rcvPackage = graph->lMap[f]->r_con.rcv[i].pQueue.dequeue(true); - mf->m_fabs_v[f]->copyFromMem(graph->lMap[f]->r_con.rcv[i].dbx,0,nComp,rcvPackage->databuf); - rcvPackage->completed = false; - 
rcvPackage->notified = false; - graph->lMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - graph->lMap[f]->r_con.firingRuleCnt = graph->lMap[f]->r_con.firingRuleCnt - graph->lMap[f]->r_con.nrcv; - for(int i=0; ilMap[f]->r_con.nrcv; i++) - if(graph->lMap[f]->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(graph->lMap[f]->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - graph->lMap[f]->r_con.firingRuleCnt++; - omp_unset_lock(graph->lMap[f]->r_con.rcvLock); - omp_unset_lock(graph->rMap[f]->r_con.rcvLock); - } - -} // fillBoundaryPull - -void Perilla::fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f, bool singleT) -{ -exit(0); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -void Perilla::multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& mfDst, const MultiFab& mfSrc, int nc, int ng, int ngSrc, const Periodicity& period) -{ - // MultiFab* mfSrc = gSrc->assocMF; - // MultiFab* mfDst = gDst->assocMF; - int myProc = ParallelDescriptor::MyProc(); - int np = ParallelDescriptor::NProcs(); - - try{ - - if(true)//if(!(*mfSrc == *mfDst)) - { - if(ng > mfDst.nGrow()) cout <<"MULTIFAB_COPY_C: ng > mfDst.nGrow not supported in parallel copy"<< endl; - if(ngSrc > mfSrc.nGrow()) cout <<"MULTIFAB_COPY_C: ngSrc > mfSrc.nGrow"<< endl; - if(ngSrc > 0) - { - - // To be implemented - //do i = 1, nboxes(msrc%la) - // call push_back(bl, grow(box_nodalize(get_box(msrc%la,i),msrc%nodal),lngsrc)) - //end do - //call build(batmp, bl, sort = .false.) - //call destroy(bl) - //call build(lasrctmp, batmp, boxarray_bbox(batmp), explicit_mapping = get_proc(msrc%la)) - //call destroy(batmp) - //call build(msrctmp, lasrctmp, nc = lnc, ng = 0) - //pmfsrc => msrctmp - } - if(np > 1) - { - if(gSrc->sCopyMapHead == 0) - gSrc->sCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gSrc->sCopyMapHead; - gSrc->sCopyMapHead = tmpCopyMap; - } - if(gDst->rCopyMapHead == 0) - gDst->rCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gDst->rCopyMapHead; - gDst->rCopyMapHead = tmpCopyMap; - } - //gSrc->sCopyMapHead->map.reserve(mfSrc.size()); - //gDst->rCopyMapHead->map.reserve(mfDst.size()); - gSrc->sCopyMapHead->alloc_CopyMap(mfSrc); - gDst->rCopyMapHead->alloc_CopyMap(mfDst); - } - - if(gSrc->numTasks != mfSrc.IndexArray().size()) - std::cout<< "before " <numTasks << " now " <graphID << std::endl; - - gSrc->numFabs = mfSrc.size(); - gDst->numFabs = mfDst.size(); - - gSrc->numTasks = mfSrc.IndexArray().size(); - gDst->numTasks = mfDst.IndexArray().size(); - - int nfabsSrc = mfSrc.IndexArray().size(); - int nfabsDst = mfDst.IndexArray().size(); - - const FabArrayBase::CPC& TheCPC = mfDst.getCPC(IntVect(ng), mfSrc, IntVect(ngSrc), period); - - const int nloc_cpAsc = TheCPC.m_LocTags->size(); - const int nsnds_cpAsc = TheCPC.m_SndTags->size(); - const int nrcvs_cpAsc = TheCPC.m_RcvTags->size(); - - Vector send_cctc; - Vector send_pr; - send_cctc.reserve(nsnds_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC.m_SndTags->begin(), - m_End = TheCPC.m_SndTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // Not destined to me. 
- { - send_pr.push_back(m_it->first); - send_cctc.push_back(&(m_it->second)); - } - } - - // std::cout<< "Loop 1" < recv_cctc; - Vector recv_pr; - recv_cctc.reserve(nrcvs_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC.m_RcvTags->begin(), - m_End = TheCPC.m_RcvTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // I am not the source for this receipt - { - recv_pr.push_back(m_it->first); - recv_cctc.push_back(&(m_it->second)); - } - } - - //std::cout<< "Before parallel at gID " << gDst->graphID << " numTask " << gDst->numTasks << " numFabs " << gDst->numFabs <graphID > 25) - //std::cout<< "Inside parallel Generating Send at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead == 0) - { - gSrc->task[f]->cpAsc_srcHead = new FabCopyAssoc(); - cpSrc = gSrc->task[f]->cpAsc_srcHead; - } - else - { - cpSrc = new FabCopyAssoc(); - cpSrc->next = gSrc->task[f]->cpAsc_srcHead; - gSrc->task[f]->cpAsc_srcHead = cpSrc; - } - - cpSrc->graphPartner = gDst; - cpSrc->l_con.nscpy = 0; - for(int i=0; il_con.nscpy++; - } - cpSrc->l_con.scpy = new LocalCopyDescriptor[cpSrc->l_con.nscpy]; - int scnt = 0; - - //if(gDst->graphID == 4 && tag.dstIndex == 60 ) - //std::cout<< "Inside parallel Generating Local Copy send at tid " << tid << " f " << f << " gID " << gDst->graphID <graphID == 4 && (tag.dstIndex == 60 || tag.dstIndex == 59) ) - //std::cout <<"myP " <l_con.scpy[scnt].ns = mfSrc.localindex(tag.srcIndex); - cpSrc->l_con.scpy[scnt].nd = mfDst.localindex(tag.dstIndex); - cpSrc->l_con.scpy[scnt].sbx = tag.sbox; - cpSrc->l_con.scpy[scnt].dbx = tag.dbox; - int psize = tag.sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - //std::cout<< " gSrc ID "<< gSrc->graphID << " f "<databuf[j] = 0; - cpSrc->l_con.scpy[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.scpy[scnt].recycleQueue.enqueue(cpSrc->l_con.scpy[scnt].pQueue.dequeue()); - scnt++; - } - } - - if(np > 1) - { - cpSrc->r_con.nsnd = 0; - cpSrc->r_con.remotePushReady = false; - cpSrc->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - cpSrc->r_con.nsnd++; - } - } // for(ir_con.snd = new RemoteCommDescriptor[cpSrc->r_con.nsnd]; - scnt = 0; - for(int i=0; isrcIndex) - { - - //if(gDst->graphID == 17 && (it->srcIndex == 1198 || it->srcIndex == 1198 || it->srcIndex == 978 || it->srcIndex == 978)) - //std::cout <<"myP " <dstIndex << " s "<< it->srcIndex << " f " << f << " i "<< scnt << " tg " <r_con.snd[scnt].ns = it->srcIndex; - cpSrc->r_con.snd[scnt].nd = it->dstIndex; - cpSrc->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - cpSrc->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - cpSrc->r_con.snd[scnt].sbx = it->sbox; - cpSrc->r_con.snd[scnt].dbx = it->dbox; - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? 
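// Every descriptor array in this function is built in two passes: pass 1
// counts the CopyComTags addressed to fab f so a plain array can be sized,
// pass 2 fills it in.  A std::vector would collapse the passes into one,
//     std::vector<RemoteCommDescriptor> snd;
//     for (auto const& tag : *send_cctc[i])
//         if (mfSrc.IndexArray()[f] == tag.srcIndex) snd.push_back(...);
// at the price of reallocation inside the OpenMP parallel region, which
// the counted arrays avoid (a guess at the motive, not stated in the code).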
- - for(int p=0; pdatabuf[j] = 0; - cpSrc->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.snd[scnt].recycleQueue.enqueue(cpSrc->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // if(fg==tg) -#pragma omp barrier - // std::cout<< "Barrier 1" < 1) - { - if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - { - - // std::cout<< "Inside parallel Generating Remote Send tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <sCopyMapHead->map[f]->r_con.nsnd = 0; - gSrc->sCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - gSrc->sCopyMapHead->map[f]->r_con.nsnd++; - } - } // for(isCopyMapHead->map[f]->r_con.snd = new RemoteCommDescriptor[gSrc->sCopyMapHead->map[f]->r_con.nsnd]; - int scnt = 0; - for(int i=0; isrcIndex) - { - - //if(gDst->graphID == 31 && (it->dstIndex == 519)) - //std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " i "<< scnt << " tg " <sCopyMapHead->map[f]->r_con.snd[scnt].ns = it->srcIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].nd = it->dstIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_gid = gDst->graphID-1; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sbx = it->sbox; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].dbx = it->dbox; - - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sz = psize; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pr = send_pr[i]; - - for(int p=0; pdatabuf[j] = 0; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; psCopyMapHead->map[f]->r_con.snd[scnt].recycleQueue.enqueue(gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead == 0) - { - gDst->task[f]->cpAsc_dstHead = new FabCopyAssoc(); - cpDst = gDst->task[f]->cpAsc_dstHead; - } - else - { - cpDst = new FabCopyAssoc(); - cpDst->next = gDst->task[f]->cpAsc_dstHead; - gDst->task[f]->cpAsc_dstHead = cpDst; - } - cpDst->graphPartner = gSrc; - cpDst->l_con.ndcpy = 0; - cpDst->l_con.firingRuleCnt = 0; - cpDst->l_con.dcpyCnt = 0; - for(int i=0; il_con.ndcpy++; - } - cpDst->l_con.dcpy = new LocalCopyDescriptor[cpDst->l_con.ndcpy]; - int dcnt = 0; - - //if(gDst->graphID > 25) - //std::cout<< "Inside parallel Generating Local copy recive at tid " << tid << " f " << f << " gID " << gDst->graphID <graphID ==27 && f == 633) - //std::cout<< "tid " << tid << " f " << f << " gID " << gDst->graphID << " numReciv " << nloc_cpAsc << " ndcpy " << cpDst->l_con.ndcpy <graphID == 4 && (tag.dstIndex == 60 || tag.dstIndex == 59)) - //std::cout<< "dcpy tid " << tid << " f " << f << " i " << i << " dcnt " << dcnt << " ns "<l_con.dcpy[dcnt].ns = mfSrc.localindex(tag.srcIndex); - cpDst->l_con.dcpy[dcnt].nd = mfDst.localindex(tag.dstIndex); - cpDst->l_con.dcpy[dcnt].sbx = tag.sbox; - cpDst->l_con.dcpy[dcnt].dbx = tag.dbox; - - // if(gDst->graphID > 25 && f == 633) - //std::cout<< " Generating Package tid " << tid << " i " << i <l_con.dcpy[dcnt].sz = psize; - - 
if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf[j] = 0; - cpDst->l_con.dcpy[dcnt].pQueue.enqueue(tmpPkg); - } - - // if(gDst->graphID > 25 && f == 633) - //std::cout<< " Generating now in reQ Package tid " << tid << " i " << i <l_con.dcpy[dcnt].recycleQueue.enqueue(cpDst->l_con.dcpy[dcnt].pQueue.dequeue()); - - //if(gDst->graphID > 25 && f == 633) - // std::cout<< " Generated Package tid " << tid << " i " << i <graphID > 25 && f > 630) - //std::cout<< "Safe now tid " << tid << " f " << f << " gID " << gDst->graphID << " numReciv " << nloc_cpAsc <srcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; il_con.ndcpy ; i++) - { - for(int p=0; pl_con.dcpy[i].sz; - Package *tmpPkg = new Package(psize); - for(int j=0; jdatabuf[j] = 0; - cpdDst->l_con.dcpy[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.dcpy[i].recycleQueue.enqueue(cpdDst->l_con.dcpy[i].pQueue.dequeue()); - } - } - - if(np > 1) - { - cpDst->r_con.nrcv = 0; - cpDst->r_con.remotePullDone = false; - cpDst->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - cpDst->r_con.nrcv++; - } - } // for(ir_con.rcv = new RemoteCommDescriptor[cpDst->r_con.nrcv]; - dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - cpDst->r_con.rcv[dcnt].nd = it->dstIndex; - cpDst->r_con.rcv[dcnt].ns = it->srcIndex; - cpDst->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - cpDst->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - cpDst->r_con.rcv[dcnt].sbx = it->sbox; - cpDst->r_con.rcv[dcnt].dbx = it->dbox; - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? 
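// Sizing note: send-side packages above use the source box
// (sbox.numPts() * nComp), receive-side packages here use the destination
// box (dbox.numPts() * nComp).  The two boxes of one copy tag cover the
// same number of points, so the matched send and receive agree on length.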
- cpDst->r_con.rcv[dcnt].sz = psize; - - if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf[j] = 0; - cpDst->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[dcnt].recycleQueue.enqueue(cpDst->r_con.rcv[dcnt].pQueue.dequeue()); - } - - dcnt++; - } - } - }// for(isrcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; ir_con.nrcv ; i++) - { - for(int p=0; pr_con.rcv[i].sz; - Package *tmpPkg = new Package(psize); - for(int j=0; jdatabuf[j] = 0; - cpdDst->r_con.rcv[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[i].recycleQueue.enqueue(cpdDst->r_con.rcv[i].pQueue.dequeue()); - } - } - - - } // if(np > 1) - }// if(fg==tg) - -#pragma omp barrier - if(np > 1) - { - //if(tid==0) - if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - { - - // std::cout<< "Inside parallel Generating Remote Recive tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <rCopyMapHead->map[f]->r_con.nrcv = 0; - gDst->rCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - gDst->rCopyMapHead->map[f]->r_con.nrcv++; - } - } - gDst->rCopyMapHead->map[f]->r_con.rcv = new RemoteCommDescriptor[gDst->rCopyMapHead->map[f]->r_con.nrcv]; - int dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - - // if(myProc==54 && gDst->graphID == 25 && f == 10) - // std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " sgID "<< gSrc->graphID <<" tg "<rCopyMapHead->map[f]->r_con.rcv[dcnt].nd = it->dstIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].ns = it->srcIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sbx = it->sbox; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].dbx = it->dbox; - - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? 
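// The genTags block below mints one unique MPI tag per (receiving rank,
// graph, dstIndex, srcIndex, message size) tuple, find-or-insert style.
// Since std::map's operator[] value-initializes missing ints to 0, the
// explicit find/else bookkeeping is equivalent to the shorter:
//     auto& bySize = tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex];
//     if (bySize.count(psize) == 0) {
//         bySize[psize] = Perilla::uTags++;
//         pTagCnt[recv_pr[i]][gDst->graphID-1]++;
//     }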
- - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sz = psize; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pr = recv_pr[i]; - - BL_ASSERT(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd == f); - - if(Perilla::genTags) - { - try{ - std::map::iterator itr = tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].find(psize); - if( itr != tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].end()) - { - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = itr->second; - } - else - { - tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex][psize] = Perilla::uTags++; - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = Perilla::uTags++; - std::map::iterator itr2 = pTagCnt[recv_pr[i]].find(gDst->graphID-1); - if(itr2 != pTagCnt[recv_pr[i]].end()) - pTagCnt[recv_pr[i]][gDst->graphID-1] = pTagCnt[recv_pr[i]][gDst->graphID-1] + 1; - else - pTagCnt[recv_pr[i]][gDst->graphID-1] = 1; - } - } - catch(std::exception& e) - { - std::cout <<"Inside tagGeneration gID "<< gDst->graphID <<" "<< e.what() << '\n'; - } - } - //tagMap[recv_pr[i]][gDst->graphID][it->dstIndex][it->srcIndex] = pTagCnt[recv_pr[i]]; - - - for(int p=0; pdatabuf[j] = 0; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; prCopyMapHead->map[f]->r_con.rcv[dcnt].recycleQueue.enqueue(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.dequeue()); - dcnt++; - } - } - } // for(i 1) - //} //if(fg==tg) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Send partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead->l_con.nscpy; i++) - { - int lnd = gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].nd; - for(int j=0; jtask[ lnd ]->cpAsc_dstHead->l_con.ndcpy; j++) - if(gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dbx == gDst->task[ lnd ]->cpAsc_dstHead->l_con.dcpy[j].dbx) - gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dPartner = j; - } - } - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead->l_con.ndcpy; i++) - { - int lns = gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].ns; - for(int j=0; jtask[ lns ]->cpAsc_srcHead->l_con.nscpy; j++) - if(gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].dbx == gSrc->task[ lns ]->cpAsc_srcHead->l_con.scpy[j].dbx) - gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].sPartner = j; - } - } - } // for(fgraphID <<" "<< e.what() << '\n'; -} - - -//std::cout<< "All done safely at gID " << gDst->graphID <assocMF; - // MultiFab* mfSrc = srcGraph->assocMF; - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpSrc = srcGraph->task[f]->cpAsc_srcHead; - - //if(srcGraph->graphID==18 && f ==316 && ntid == 0) - //std::cout << "srgG chk see " << srcGraph << " " <graphPartner == destGraph) - break; - cpSrc = cpSrc->next; - } - if(cpSrc == 0) cout <<"Metadata for across grid copy not found"<< endl; - - if(singleT) - { - omp_set_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - 
mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - omp_unset_lock(&(cpSrc->l_con.sLock)); - } - else - { - if(ntid == 0) - omp_set_lock(&(cpSrc->l_con.sLock)); - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - for(int i=0; il_con.nscpy; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(ntid==0) - { - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - omp_unset_lock(&(cpSrc->l_con.sLock)); - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - int np = ParallelDescriptor::NProcs(); - if(np == 1) - return; - - //if(myProc==26 && srcGraph->graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd <graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd <graphID==18 && f ==316) - //BL_ASSERT(cpSrc->r_con.nsnd == 177); - - if(singleT) - { - omp_set_lock(cpSrc->r_con.sndLock); - for(int i=0; ir_con.nsnd; i++) - { - - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - sndPackage->notified = false; - sndPackage->notified = false; - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - } - - omp_unset_lock(cpSrc->r_con.sndLock); - - cpSrc->r_con.remotePushReady = true; - ///* - omp_set_lock(srcGraph->sCopyMapHead->map[f]->r_con.sndLock); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - omp_unset_lock(srcGraph->sCopyMapHead->map[f]->r_con.sndLock); - } - else - { - if(ntid == 0) - omp_set_lock(cpSrc->r_con.sndLock); - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - for(int i=0; ir_con.nsnd; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - - // if(myProc==4 && srcGraph->graphID==2 && (f ==0 || f ==2)) - //std::cout << " Pushing 2 316 164"<r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - sndPackage->notified = false; - sndPackage->notified = false; - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - - } - - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(ntid==0) - { - omp_unset_lock(cpSrc->r_con.sndLock); - - cpSrc->r_con.remotePushReady = true; - ///* - omp_set_lock(srcGraph->sCopyMapHead->map[f]->r_con.sndLock); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - omp_unset_lock(srcGraph->sCopyMapHead->map[f]->r_con.sndLock); - //*/ - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } // if(!(*mfDst == *mfSrc)) -} // multifabCopyPushAsync - -void Perilla::multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, bool singleT) -{ - multifabCopyPushAsync(destGraph, srcGraph, mfDst, mfSrc, 
-
-void Perilla::multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT)
-{
-    if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl;
-    if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl;
-    if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl;
-
-    multifabCopyPush_1Team(destGraph,srcGraph,mfDst,mfSrc,f,dstcomp,srccomp,nc,ng,ngsrc,singleT);
-    if(!singleT)
-        srcGraph->worker[perilla::wid()]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-}
-
-void Perilla::multifabCopyPush_1Team(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT)
-{
-    int ntid = perilla::wtid();// - perilla::NUM_COMM_THREADS;
-    int tg = perilla::wid();
-    int myProc = amrex::ParallelDescriptor::MyProc();
-
-    if(true)//if(!(*mfDst == *mfSrc))
-    {
-    if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl;
-    if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl;
-    FabCopyAssoc* cpSrc = srcGraph->task[f]->cpAsc_srcHead;
-
-    while(cpSrc != 0)
-    {
-        if(cpSrc->graphPartner == destGraph)
-            break;
-        cpSrc = cpSrc->next;
-    }
-    if(cpSrc == 0) cout <<"Metadata for across grid copy not found"<< endl;
-
-    if(singleT)
-    {
-        omp_set_lock(&(cpSrc->l_con.sLock));
-        for(int i=0; i<cpSrc->l_con.nscpy; i++)
-        {
-            Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true);
-            mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf);
-        }
-        for(int i=0; i<cpSrc->l_con.nscpy; i++)
-            cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true));
-        omp_unset_lock(&(cpSrc->l_con.sLock));
-    }
-    else
-    {
-        if(ntid == 0)
-            omp_set_lock(&(cpSrc->l_con.sLock));
-        srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-
-        for(int i=0; i<cpSrc->l_con.nscpy; i++)
-            if((i%(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS)) == ntid)
-            {
-                Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true);
-                mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf);
-                /*
-                for(int ii=0; ii < sndPackage->bufSize; ii++)
-                    if(sndPackage->databuf[ii] == 0)
-                        fout << "MFCPush loc zero at " << f << " i " << i << " ii " << ii << " sbx "<< cpSrc->l_con.scpy[i].sbx << std::endl;
-                */
-            }
-
-        //fout.close();
-
-        srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-        if(ntid==0)
-        {
-            for(int i=0; i<cpSrc->l_con.nscpy; i++)
-                cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true));
-            omp_unset_lock(&(cpSrc->l_con.sLock));
-        }
-        srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-    }
-
-    int np = amrex::ParallelDescriptor::NProcs();
-    if(np == 1)
-        return;
-    if(singleT)
-    {
-        omp_set_lock((cpSrc->r_con.sndLock));
-        for(int i=0; i<cpSrc->r_con.nsnd; i++)
-        {
-            Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true);
-            mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf);
-            sndPackage->notified = false;
-            sndPackage->served = false;
-            sndPackage->completed = false;
-            cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage, true);
-        }
-        cpSrc->r_con.remotePushReady = true;
-        omp_unset_lock((cpSrc->r_con.sndLock));
-    }
-    else
-    {
-        if(ntid == 0)
-        {
-            omp_set_lock((cpSrc->r_con.sndLock));
-        }
-        srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-
-        for(int i=0; i<cpSrc->r_con.nsnd; i++)
-            if((i%(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS)) == ntid)
-            {
-                Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true);
-                mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf);
-                sndPackage->notified = false;
-                sndPackage->served = false;
-                sndPackage->completed = false;
-                cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage, true);
-            }
-
-        //fout.close();
-        srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-        if(ntid==0)
-        {
-            cpSrc->r_con.remotePushReady = true;
-/*
-            for(int i=0; i<cpSrc->r_con.nsnd; i++){
-                Package* sndPackage = srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true);
-                sndPackage->served = false;
-                sndPackage->completed = false;
-                srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(sndPackage, true);
-            }
-*/
-            omp_unset_lock((cpSrc->r_con.sndLock));
-        }
-        srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-    }
-    } // if(!(*mfDst == *mfSrc))
-} // multifabCopyPush
-
-
-void Perilla::multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int f, bool singleT)
-{
-    multifabCopyPush(destGraph, srcGraph, mfDst, mfSrc, f, 1, 1, 1, 0, 0, singleT);
-}
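// Reviewer note: the non-singleT branches above all use the same idiom to
// split a connection loop across the worker team: thread ntid claims every
// iteration i with i % teamSize == ntid, bracketed by team barriers. A
// compact OpenMP illustration (the item count and printf stand in for real
// work; this is not Perilla code):
#include <omp.h>
#include <cstdio>

int main() {
    const int nItems = 16;
    #pragma omp parallel
    {
        int teamSize = omp_get_num_threads(); // role of NUM_THREADS_PER_TEAM-NUM_COMM_THREADS
        int ntid     = omp_get_thread_num();  // role of perilla::wtid()
        for (int i = 0; i < nItems; ++i)
            if (i % teamSize == ntid)         // same round-robin test as the loops above
                std::printf("worker %d handles item %d\n", ntid, i);
    } // the implicit barrier here corresponds to worker[tg]->barr->sync(...)
    return 0;
}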
-void Perilla::multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT)
-{
-    int myProc = ParallelDescriptor::MyProc();
-
-    int ntid = WorkerThread::perilla_wtid();
-    int tg = WorkerThread::perilla_wid();
-    //MultiFab* mfDst = destGraph->assocMF;
-    //MultiFab* mfSrc = srcGraph->assocMF;
-    if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl;
-    if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl;
-    //if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl;
-
-    if(true)//if(!(*mfDst == *mfSrc))
-    {
-    if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl;
-    //if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl;
-    FabCopyAssoc* cpDst = destGraph->task[f]->cpAsc_dstHead;
-    while(cpDst != 0)
-    {
-        if(cpDst->graphPartner == srcGraph)
-            break;
-        cpDst = cpDst->next;
-    }
-    if(cpDst == 0) cout <<"Metadata for across grid copy not found"<< endl;
-    //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-    if(singleT)
-    {
-        omp_set_lock(&(cpDst->l_con.dLock));
-        for(int i=0; i<cpDst->l_con.ndcpy; i++)
-        {
-            Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ
-            mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf);
-        }
-        for(int i=0; i<cpDst->l_con.ndcpy; i++)
-            cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue(true),true); // corrected from pQ to recycleQ and from recycleQ to pQ
-        cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy;
-        omp_unset_lock(&(cpDst->l_con.dLock));
-    }
-    else
-    {
-        if(ntid==0)
-            omp_set_lock(&(cpDst->l_con.dLock));
-        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-        for(int i=0; i<cpDst->l_con.ndcpy; i++)
-            if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid)
-            {
-                Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ
-                mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf);
-            }
-        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-        if(ntid == 0)
-        {
-            for(int i=0; i<cpDst->l_con.ndcpy; i++)
-                cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue(true),true); // corrected from pQ to recycleQ and from recycleQ to pQ
-            cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy;
-            omp_unset_lock(&(cpDst->l_con.dLock));
-        }
-        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-    }
-
-    int np = ParallelDescriptor::NProcs();
-    if(np == 1)
-        return;
-
-    if(singleT)
-    {
-        omp_set_lock(destGraph->rCopyMapHead->map[f]->r_con.rcvLock);
-        omp_set_lock(cpDst->r_con.rcvLock);
-        for(int i=0; i<cpDst->r_con.nrcv; i++)
-        {
-            ///*
-            Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true);
-            rcvMetaPackage->completed = false;
-            rcvMetaPackage->served = false;
-            rcvMetaPackage->request = MPI_REQUEST_NULL;
-            destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true);
-
-            Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ
-            mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf);
-            rcvPackage->notified = false;
-            rcvPackage->completed = false;
-            cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ
-            //*/
-
-            //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ
-            //mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf);
-        }
-        cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv;
-
-        cpDst->r_con.remotePullDone = true;
-        ///*
-        for(int i=0; i<cpDst->r_con.nrcv; i++)
-            if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1)
-                if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest())
-                    cpDst->r_con.firingRuleCnt++;
-        //*/
-        omp_unset_lock(cpDst->r_con.rcvLock);
-        omp_unset_lock(destGraph->rCopyMapHead->map[f]->r_con.rcvLock);
-    }
-    else
-    {
-        if(ntid==0)
-        {
-            omp_set_lock(destGraph->rCopyMapHead->map[f]->r_con.rcvLock);
-            omp_set_lock(cpDst->r_con.rcvLock);
-        }
-        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-        for(int i=0; i<cpDst->r_con.nrcv; i++)
-            if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid)
-            {
-                ///*
-                Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true);
-                rcvMetaPackage->completed = false;
-                rcvMetaPackage->served = false;
-                rcvMetaPackage->request = MPI_REQUEST_NULL;
-                destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true);
-
-                Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ
-                mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf);
-                rcvPackage->notified = false;
-                rcvPackage->completed = false;
-                cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ
-                //*/
-
-                //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ
-                //mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf);
-            }
-        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-        if(ntid==0)
-        {
-            cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv;
-
-            cpDst->r_con.remotePullDone = true;
-            ///*
-            for(int i=0; i<cpDst->r_con.nrcv; i++)
-                if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1)
-                    if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest())
-                        cpDst->r_con.firingRuleCnt++;
-            //*/
-            omp_unset_lock(cpDst->r_con.rcvLock);
-            omp_unset_lock(destGraph->rCopyMapHead->map[f]->r_con.rcvLock);
-        }
-        destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-    }
-    } // if(!(*mfDst == *mfSrc))
-} // multifabCopyPull
-
-void Perilla::multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, bool singleT)
-{
-    multifabCopyPull(destGraph, srcGraph, mfDst, mfSrc, f, 1, 1, 1, 0, 0,singleT);
-}
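// Reviewer note: a hypothetical caller-side pairing of the two entry points,
// inferred from the signatures above rather than taken from an AMReX example.
// The push stages task f of mfSrc into srcGraph's connection queues; once the
// destination graph's firing rules are satisfied, the pull drains them into
// mfDst:
//
//     Perilla::multifabCopyPush(destGraph, srcGraph, &mfDst, &mfSrc, f, singleT);
//     // ... communication threads move and complete the packages ...
//     Perilla::multifabCopyPull(destGraph, srcGraph, &mfDst, &mfSrc, f, singleT);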
-void Perilla::serviceLocalGridCopyRequests(std::vector<RegionGraph*> graphArray, int g, int tg)
-{
-    int nfabs = graphArray[g]->numTasks;
-
-    for(int f=0; f<nfabs; f++)
-    {
-        int fg = f % perilla::NUM_THREAD_TEAMS;
-        if(tg == fg)
-        {
-            bool doublechecked = false;
-            FabCopyAssoc* cpSrc = graphArray[g]->task[f]->cpAsc_srcHead;
-            while(cpSrc != 0)
-            {
-                //std::cout<<" "<<f<<std::endl;
-                int lockSucceeded = omp_test_lock(&(cpSrc->l_con.sLock));
-                if(lockSucceeded != 0)
-                {
-                    for(int i=0; i<cpSrc->l_con.nscpy; i++)
-                    {
-                        if(cpSrc->l_con.scpy[i].pQueue.queueSize()>0)
-                        {
-                            assert(doublechecked==false);
-                            FabCopyAssoc* cpDst = cpSrc->graphPartner->task[cpSrc->l_con.scpy[i].nd]->cpAsc_dstHead;
-                            while(cpDst != 0)
-                            {
-                                if(cpDst->graphPartner == graphArray[g])
-                                    break;
-                                cpDst = cpDst->next;
-                            }
-                            Package* sPackage = cpSrc->l_con.scpy[i].pQueue.dequeue(true);
-                            omp_set_lock(&(cpDst->l_con.dLock));
-                            int dPartner = cpSrc->l_con.scpy[i].dPartner;
-                            Package* dPackage = cpDst->l_con.dcpy[dPartner].recycleQueue.dequeue(true);
-                            /*
-                            for(int j=0; j<dPackage->bufSize; j++)
-                            {
-                                dPackage->databuf[j] = sPackage->databuf[j];
-                            }
-                            */
-                            std::memcpy(dPackage->databuf, sPackage->databuf, dPackage->bufSize * sizeof(double));
-                            //std::swap(dPackage->databuf, sPackage->databuf);
-
-                            cpDst->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true);
-                            if(cpDst->l_con.dcpy[dPartner].pQueue.queueSize(true) == 1)
-                                cpDst->l_con.firingRuleCnt++;
-                            omp_unset_lock(&(cpDst->l_con.dLock));
-                            cpSrc->l_con.scpy[i].recycleQueue.enqueue(sPackage,true);
-                        }
-                    } // for
-                    omp_unset_lock(&(cpSrc->l_con.sLock));
-                } // if(lockSucceeded)
-                cpSrc = cpSrc->next;
-            } // while(cpSrc != 0)
-        } // if(tg==fg)
-    } // for(f<nfabs; f++)
-} // serviceLocalGridCopyRequests
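// Reviewer note: the service loop above is a lock-protected handoff between
// two graphs on the same rank: dequeue a filled package from the source
// connection, dequeue a free package from the destination's recycle pool,
// copy bufSize doubles, enqueue on the destination's pending queue, and bump
// firingRuleCnt when that queue becomes non-empty. A simplified single-pair
// sketch with std::mutex standing in for the omp_lock_t pair (toy types, not
// the Perilla classes):
#include <mutex>
#include <queue>
#include <vector>

struct Pkg { std::vector<double> databuf; };

struct Side {
    std::mutex lock;
    std::queue<Pkg*> pQueue, recycleQueue;
    int firingRuleCnt = 0;
};

void serviceOneLocalCopy(Side& src, Side& dst) {
    std::scoped_lock guard(src.lock, dst.lock); // deadlock-free two-lock acquire
    if (src.pQueue.empty() || dst.recycleQueue.empty()) return;
    Pkg* s = src.pQueue.front();       src.pQueue.pop();
    Pkg* d = dst.recycleQueue.front(); dst.recycleQueue.pop();
    d->databuf = s->databuf;           // the real code memcpy's bufSize doubles
    dst.pQueue.push(d);
    if (dst.pQueue.size() == 1) ++dst.firingRuleCnt; // queue just became non-empty
    src.recycleQueue.push(s);          // return the drained source buffer
}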
-void Perilla::serviceRemoteGridCopyRequests(std::vector<RegionGraph*> graphArray, int g, int nGraphs, int tg)
-{
-    bool nextsReq, nextrReq;
-    int np = ParallelDescriptor::NProcs();
-    int myProc = ParallelDescriptor::MyProc();
-    int numfabs = graphArray[g]->numTasks;
-    //MultiFab* mf = graphArray[g]->assocMF;
-    int graphID = graphArray[g]->graphID;
-
-    for(int f=0; f<numfabs; f++)
-    {
-        FabCopyAssoc* cpDst = graphArray[g]->task[f]->cpAsc_dstHead;
-        while(cpDst != 0)
-        {
-            if(omp_test_lock(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock) != 0)
-            {
-                if(omp_test_lock(cpDst->r_con.rcvLock) != 0)
-                {
-                    for(int i=0; i<cpDst->r_con.nrcv; i++)
-                    {
-                        if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed
-                        {
-                            nextsReq = true;
-                        }
-                        else
-                        {
-                            Package *rearPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.getRear(true);//!CHECK THIS POINT LATER
-                            // Also check the recycle queue because when rear is completed it may cause unlimited recv posts
-                            if(rearPackage->completed && graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.queueSize() > 1) //!latest receive request has been completed
-                            {
-                                nextsReq = true;
-                            }
-                            else //!expected message is still on the way
-                                nextsReq = false;
-                        }
-                        if(nextsReq) //!take a message from recycle pool and post a receive
-                        {
-                            //!create a package to keep track of receive requests
-                            Package *rMetaPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.dequeue(true);
-                            //!extract a package from the recycle pool at the destination NUMA node to buffer incoming data
-                            int ns = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].ns;
-                            int nd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].nd;
-                            int lnd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].lnd;
-                            int r_grids = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].r_grids;
-                            Package *rPackage = cpDst->r_con.rcv[i].recycleQueue.dequeue(true);
-                            //int tag = tagGen(ns, nd, graphID-1, np*r_grids, nGraphs);
-                            //int tag = Perilla::myTagMap[graphID-1][nd][ns];
-                            //int tag = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].lnd;
-                            int tag = tagMap[graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr][g][nd][ns][graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz];
-
-                            // if(graphArray[g]->graphID == 25 && lnd==10 && myProc==54)
-                            //std::cout << "R Posted g " << g << " myP " << myProc << " lnd " << lnd <<" nd "<< nd << " ns "<< ns << " pr " << graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr << std::endl;
-
-                            rMetaPackage->request = MPI_REQUEST_NULL;
-                            cpDst->r_con.rcv[i].pQueue.enqueue(rPackage,true); //!this is not done yet
-                            graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.enqueue(rMetaPackage,true); //!this is not done yet
-                            rMetaPackage->request = ParallelDescriptor::Arecv(rPackage->databuf,
-                                graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz,
-                                graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr, tag).req(); // tag == SeqNum in c++ ver
-                        }
-                    } // for (i<cpDst->r_con.nrcv)
-                    omp_unset_lock(cpDst->r_con.rcvLock);
-                } // if(ga locked)
-                omp_unset_lock(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock);
-            } // if(mf locked)
-            cpDst = cpDst->next;
-        } // while(cpDst != 0)
-    } // for(f<numfabs; f++)
-
-    for(int f=0; f<numfabs; f++)
-    {
-        FabCopyAssoc* cpSrc = graphArray[g]->task[f]->cpAsc_srcHead;
-        while(cpSrc != 0)
-        {
-            for(int i=0; i<cpSrc->r_con.nsnd; i++)
-            {
-                //if(g == 17 && f == 316 && i == 164)
-                //std::cout << "Comm Thread nsnd "<< cpSrc->r_con.nsnd << " " << graphArray[g]<< std::endl;
-                if(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed
-                    nextrReq = false;
-                else
-                    nextrReq = true;
-
-                if(nextrReq) //!take a message from recycle pool and post a receive
-                {
-                    Package *sMetaPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.getFront(true);
-                    if(!sMetaPackage->served)
-                    {
-                        Package *sPackage = cpSrc->r_con.snd[i].pQueue.getFront(true);
-                        sMetaPackage->completed = false;
-                        sMetaPackage->served = true;
-                        sMetaPackage->request = MPI_REQUEST_NULL;
-                        int ns = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].ns;
-                        int nd = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].nd;
-                        int r_gid = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_gid;
-                        int r_grids = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_grids;
-                        //int tag = tagGen(ns, nd, r_gid-1, np*r_grids, nGraphs);
-                        int tag = Perilla::myTagMap[r_gid][nd][ns][graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz];
-                        sMetaPackage->request = ParallelDescriptor::Asend(sPackage->databuf,
-                            graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz,
-                            graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pr, tag).req(); // tag == SeqNum in c++ ver
-                        //if(g == 31 && nd == 519 )
-                        //std::cout << "S Posted r_g " << r_gid << " atP " << myProc << " nd "<< nd << " ns "<< ns << " pr " << graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pr << std::endl;
-                    }
-                }
-            } // for (i<cpSrc->r_con.nsnd)
-            cpSrc = cpSrc->next;
-        } // while(cpSrc != 0)
-    } // for(f<numfabs; f++)
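// Reviewer note: the two loops above post nonblocking transfers
// (ParallelDescriptor::Arecv/Asend store the MPI_Request in a metadata
// package); the two loops that follow poll those requests with
// ParallelDescriptor::Test and recycle completed packages. The underlying
// MPI pattern being wrapped looks roughly like this (buffer, rank, and tag
// are placeholders):
#include <mpi.h>

void postAndPoll(double* buf, int count, int srcRank, int tag) {
    MPI_Request req;
    MPI_Irecv(buf, count, MPI_DOUBLE, srcRank, tag, MPI_COMM_WORLD, &req);

    int done = 0;
    MPI_Status status;
    while (!done) {
        // a comm thread would service other queues between polls;
        // here we simply spin until the message lands
        MPI_Test(&req, &done, &status);
    }
}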
-    for(int f=0; f<numfabs; f++)
-    {
-        FabCopyAssoc* cpDst = graphArray[g]->task[f]->cpAsc_dstHead;
-        while(cpDst != 0)
-        {
-            for(int i=0; i<cpDst->r_con.nrcv; i++)
-            {
-                if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) > 0) //!all messages before rear have completed
-                {
-                    if(omp_test_lock(cpDst->r_con.rcvLock) != 0)
-                    {
-                        Package *rearPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.getRear(true);
-                        if(!rearPackage->completed)
-                        {
-                            bool flag = false;
-                            int ret_flag=0;
-                            MPI_Status status;
-                            ParallelDescriptor::Test(rearPackage->request, ret_flag, status);
-
-                            flag = (ret_flag == 0) ? false : true;//parallel_test_one(rearPackage%ptr%request) -------???????
-                            if(flag)
-                            {
-                                rearPackage->completeRequest();
-                                cpDst->r_con.rcv[i].pQueue.getRear()->completeRequest();
-
-                                if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) == 1)
-                                {
-                                    //if(graphArray[g]->graphID == 25 && f==0 && myProc==1)
-                                    //std::cout<<"Recieved fc++ for f " << f << " fc " << cpDst->r_con.firingRuleCnt << std::endl;
-                                    cpDst->r_con.firingRuleCnt++;
-                                }
-#pragma omp flush
-                            }
-                        }
-                        omp_unset_lock(cpDst->r_con.rcvLock);
-                    } // if(ga locked)
-                } // if(pQueue.queueSize(true) > 0)
-            } // for (i<cpDst->r_con.nrcv)
-            cpDst = cpDst->next;
-        } // while(cpDst != 0)
-    } // for(f<numfabs; f++)
-
-    for(int f=0; f<numfabs; f++)
-    {
-        FabCopyAssoc* cpSrc = graphArray[g]->task[f]->cpAsc_srcHead;
-        while(cpSrc != 0)
-        {
-            for(int i=0; i<cpSrc->r_con.nsnd; i++)
-            {
-                if(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.queueSize(true) > 0)
-                {
-                    Package *frontPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.getFront(true);
-                    if(frontPackage->served && !frontPackage->completed) //!latest receive request has NOT been completed
-                    {
-                        bool flag = false;
-                        int ret_flag;
-                        MPI_Status status;
-                        ParallelDescriptor::Test(frontPackage->request, ret_flag, status);
-                        flag = (ret_flag == 0) ? false : true;//parallel_test_one(frontPackage%ptr%request) -------???????
-                        if(flag)
-                        {
-                            omp_set_lock(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock);
-                            frontPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.dequeue(true);
-                            frontPackage->completed = false;
-                            frontPackage->served = false;
-                            frontPackage->request = MPI_REQUEST_NULL;
-                            frontPackage->notified = false;
-                            graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true);
-                            omp_unset_lock(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock);
-#pragma omp flush
-                            omp_set_lock(cpSrc->r_con.sndLock);
-                            frontPackage = cpSrc->r_con.snd[i].pQueue.dequeue(true);
-                            frontPackage->completed = false;
-                            frontPackage->served = false;
-                            frontPackage->request = MPI_REQUEST_NULL;
-                            cpSrc->r_con.snd[i].recycleQueue.enqueue(frontPackage,true);
-                            omp_unset_lock(cpSrc->r_con.sndLock);
-                        }
-                    }
-                } // if(queueSize > 0)
-            } // for (i<cpSrc->r_con.nsnd)
-            cpSrc = cpSrc->next;
-        } // while(cpSrc != 0)
-    } // for(f<numfabs; f++)
-} // serviceRemoteGridCopyRequests
-
-void Perilla::resetRemoteGridCopyRequests(std::vector<RegionGraph*> graphArray, int g, int nGraphs, int tg)
-{
-    int np = ParallelDescriptor::NProcs();
-    int myProc = ParallelDescriptor::MyProc();
-    int numfabs = graphArray[g]->numTasks;
-    //MultiFab* mf = graphArray[g]->assocMF;
-    int graphID = graphArray[g]->graphID;
-
-    for(int f=0; f<numfabs; f++)
-    {
-        if(WorkerThread::isMyRegion(tg,f))
-        {
-            FabCopyAssoc* cpSrc = graphArray[g]->task[f]->cpAsc_srcHead;
-            while(cpSrc != 0)
-            {
-                if(cpSrc->r_con.remotePushReady)
-                {
-                    omp_set_lock(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock);
-                    for(int i=0; i<cpSrc->r_con.nsnd; i++)
-                    {
-                        graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true);
-                    }
-                    omp_unset_lock(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock);
-                    cpSrc->r_con.remotePushReady = false;
-                }// if remotepushready
-                cpSrc = cpSrc->next;
-            }
-        }// ismyRegion
-    }//for f
-
-    for(int f=0; f<numfabs; f++)
-    {
-        if(WorkerThread::isMyRegion(tg,f))
-        {
-            FabCopyAssoc* cpDst = graphArray[g]->task[f]->cpAsc_dstHead;
-            while(cpDst != 0)
-            {
if(omp_test_lock(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock) != 0) - { - if(omp_test_lock(cpDst->r_con.rcvLock) != 0) - { - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpDst->r_con.remotePullDone <r_con.remotePullDone) - { - for(int i=0; ir_con.nrcv; i++) - { - - Package *rcvMetaPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - rcvPackage->notified = false; - rcvPackage->completed = false; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - - //cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - 1; - - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - - - } // for (ir_con.nrcv) - - cpDst->r_con.remotePullDone = false; - - //if(f==1 && g==26 && myProc == 54) - // std::cout<<"Completed Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpDst->r_con.remotePullDone <r_con.rcvLock); - } // if(ga locked) - omp_unset_lock(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock); - } // if(mf locked) - cpDst = cpDst->next; - } // while(cpDst != 0) - /* - if(false) - for(int id=0; idtask[f]->depTaskIDs.size(); id++) - { - int df = graphArray[g]->task[f]->depTaskIDs[id]; - if(WorkerThread::isMyRegion(0,df)) - { - int lgID = graphArray[g]->srcLinkGraph->graphID-1; - - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Dep Push f " << df << " gID " << lgID+1 << " myP " << myProc <task[df]->cpAsc_dstHead; - while(cpdDst != 0) - { - if(omp_test_lock(graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcvLock) != 0) - { - if(omp_test_lock(cpdDst->r_con.rcvLock) != 0) - { - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpdDst->r_con.remotePullDone <r_con.remotePullDone) - { - for(int i=0; ir_con.nrcv; i++) - { - - Package *rcvMetaPackage = graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpdDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - rcvPackage->notified = false; - rcvPackage->completed = false; - cpdDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - - //cpdDst->r_con.firingRuleCnt = cpdDst->r_con.firingRuleCnt - 1; - - if(cpdDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpdDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpdDst->r_con.firingRuleCnt++; - - - } // for (ir_con.nrcv) - - cpdDst->r_con.remotePullDone = false; - - //if(df==10 && lgID==24 && myProc == 54) - // std::cout<<"Completed Push f " << df << " gID " << lgID+1 << " myP " << myProc << " PDone "<< cpdDst->r_con.remotePullDone <r_con.rcvLock); - } // if(ga locked) - omp_unset_lock(graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcvLock); - } // if(mf locked) - cpdDst = cpdDst->next; - } // while(cpdDst != 0) 
- - - } // if tg==0 region - - - } // for all dependents - */ - - - - } - } // for(f -#include - - -namespace perilla{ - -struct _workerThreadInfo{ - int _tid; //thread id in local group - int _size; //number of threads in the group -}; - -struct _threadInfo{ - bool _isComm; //whether this thread handles communication - int _wtid; //worker thread id (-1 if this thread is decicated to communication) - int _nWts; //number of thread groups -}; - -class RTS -{ - private: - int _nWrks; - void RTS_Init(); - int _rank, _nProcs; - - public: - RTS(){ - _nWrks=1; - char* nWrks= getenv("NWORKERS"); - if(nWrks) _nWrks= atoi(nWrks); - } - RTS(int nWrks):_nWrks(nWrks){} - int ProcCount(); - int MyProc(); - int WorkerThreadCount(); - int MyWorkerThread(); - void Init(); //Build the runtime system from scratch - void Init(int rank, int nProcs);//Build the runtime system on pre-existing MPI processes - void Iterate(void *graph, int max_step, Real stop_time); - void Finalize(); -// double Time(); - void Barrier(); - void runAMR(Amr* amrptr, int max_step, Real stop_time); -}; - -} - -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/PerillaRts.cpp b/Src/AmrTask/rts_impls/mpi_omp/PerillaRts.cpp deleted file mode 100644 index 62048ad4210..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/PerillaRts.cpp +++ /dev/null @@ -1,111 +0,0 @@ -//Question? email tannguyen@lbl.gov -//Created 07-19-2017 -//ompodification 08-14-2017 -#include -#include -#include -#include -#include -#include "PerillaRts.H" - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include -PerillaMemCheck memcheck; -#endif - -#include -#include -using namespace std; -#include - -namespace perilla{ - Amr* amrptr; - - int RTS::ProcCount(){ - return _nProcs; - } - - int RTS::MyProc(){ - return _rank; - } - - int RTS::WorkerThreadCount(){ - return _nWrks; - } - - int RTS::MyWorkerThread(){ - return 0; - } - - void RTS::runAMR(Amr* amr, int max_step, Real stop_time){ - while ( amr->okToContinue() && - (amr->levelSteps(0) < max_step || max_step < 0) && - (amr->cumTime() < stop_time || stop_time < 0.0) ) - - { - // Do a coarse timestep, which calls one or multiple timestep updates (i.e. 
timeStep()) at each AMR level - amr->coarseTimeStep(stop_time); - } - } - - void InitializeMPI(){ - int provided; - MPI_Init_thread(0, 0, MPI_THREAD_FUNNELED, &provided); - if(provided == MPI_THREAD_SINGLE){//with this MPI, process can't spawn threads - cerr << "Spawning threads is not allowed by the MPI implementation" << std::endl;; - } - } - - void RTS::RTS_Init(){ - amrptr= NULL; - } - - void RTS::Init(){ - InitializeMPI(); - MPI_Comm_rank(MPI_COMM_WORLD, &_rank); - MPI_Comm_size(MPI_COMM_WORLD, &_nProcs); - RTS_Init(); - } - - void RTS::Init(int rank, int nProcs){ - _rank= rank; - _nProcs= nProcs; - RTS_Init(); - } - - void RTS::Finalize(){ -#ifdef PERILLA_DEBUG - memcheck.report(); -#endif - } - - void RTS::Iterate(void* amrGraph, int max_step, Real stop_time){ - Perilla::max_step=max_step; - assert(amrGraph); - amrptr= (Amr*)amrGraph; - runAMR(amrptr, max_step, stop_time); - } - -#if 0 - const double kMicro = 1.0e-6; - double RTS::Time() - { - struct timeval TV; - - const int RC = gettimeofday(&TV, NULL); - if(RC == -1) - { - printf("ERROR: Bad call to gettimeofday\n"); - return(-1); - } - return( ((double)TV.tv_sec) + kMicro * ((double)TV.tv_usec) ); - } -#endif - - void RTS::Barrier(){ - //nothing - } - -}//end namespace - diff --git a/Src/AmrTask/rts_impls/mpi_omp/RGIter.H b/Src/AmrTask/rts_impls/mpi_omp/RGIter.H deleted file mode 100755 index 8141fdc1300..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/RGIter.H +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef RGITER_H_ -#define RGITER_H_ - -#include -//#include -//#include -#include - -namespace amrex{ - - class AsyncFillPatchIterator; - - class RGIter - { - public: - int tid; - int ntid; - int tg; - int currentRegion; - int currentTile; - int totalItr; - int currentItr; - bool tiling; - bool implicit; - bool ppteams; - bool haveDepGraph; - RegionGraph* itrGraph; - RegionGraph* depGraph; - int boxGrow, index, scomp, ncomp, iteration; - double time; - double getFireableTime; - amrex::MultiFab *_dest; - - IndexType typ; - - Vector m_level_afpi; - Vector m_upper_level_afpi; - std::ofstream fout; - - public: - RGIter(RegionGraph* rg, bool enableAllTasks=false); - RGIter(RegionGraph* rg, RegionGraph* drg, bool isDep=true); - RGIter(amrex::AsyncFillPatchIterator* afpi, bool enableAllTasks=false); - RGIter(Vector afpi, Vector upper_afpi, - amrex::MultiFab& dest, int bG, double tm, int ind, int sc, int nc, int itr); - ~RGIter(); - - void init(); - void sync_workers(); - //! Increment iterator to the next tile we own. - void operator++ (); - //! Is the iterator valid, are more regions to iterate over? - bool isValid(); - int LocalIndex() const { return currentRegion; } - void exec(); - - amrex::Box tileBox(); - amrex::Box validBox() const; - amrex::Box tilebox(); - amrex::Box growntilebox(); - amrex::Box growntilebox(int ng); - amrex::Box nodaltilebox(int dir); - }; -} -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/RGIter.cpp b/Src/AmrTask/rts_impls/mpi_omp/RGIter.cpp deleted file mode 100755 index 74ffe274bb6..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/RGIter.cpp +++ /dev/null @@ -1,615 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include -using namespace perilla; -#include -#include "RGIter.H" - -namespace amrex{ - - RGIter::RGIter(RegionGraph* rg, bool enableAllTasks): - itrGraph(rg), - implicit(false), - ppteams(true), - //typ(rg->typ), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) 
- { - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); - - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - itrGraph->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(enableAllTasks) - itrGraph->enableAllRegions(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - init(); - } - - RGIter::RGIter(RegionGraph* rg, RegionGraph* drg, bool isDep): - itrGraph(rg), - implicit(false), - ppteams(true), - //typ(rg->typ), - haveDepGraph(isDep), - depGraph(drg), - getFireableTime(0.) - { - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) itrGraph->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - init(); - } - - RGIter::RGIter(amrex::AsyncFillPatchIterator* afpi, bool enableAllTasks): - itrGraph(afpi->destGraph), - implicit(false), - ppteams(true), - //typ(afpi->destGraph->typ), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) - { - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - afpi->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(enableAllTasks) - itrGraph->enableAllRegions(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - init(); - } - -#ifndef USE_PERILLA_ON_DEMAND - RGIter::RGIter(Vector afpi, Vector upper_afpi, - amrex::MultiFab& dest, int bG, double tm, int ind, int sc, int nc, int itr): - itrGraph(afpi[itr-1]->destGraph), - m_level_afpi(afpi), - m_upper_level_afpi(upper_afpi), - boxGrow(bG), - time(tm), - index(ind), - scomp(sc), - ncomp(nc), - iteration(itr), - implicit(true), - ppteams(true), - //typ(afpi[itr-1]->destGraph->typ), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) 
- { - int myProc = amrex::ParallelDescriptor::MyProc(); - bool push = true; - - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); - - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - m_level_afpi[iteration-1]->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - if(ntid == perilla::NUM_THREADS_PER_TEAM-2) - { - int f; - int level = m_level_afpi[iteration-1]->m_amrlevel.level; - double dt = m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level); - this->currentItr = 1; - this->totalItr = 1; - - //////////////////////////////////////Push Pull Thread Start///////////////////////// - while(m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->queueSize(true) != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks || - m_level_afpi[iteration-1]->destGraph->worker[tg]->computedTasks != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks) - { - f = m_level_afpi[iteration-1]->destGraph->getFireableRegion(tg); - if(f != -1) - { - m_level_afpi[iteration-1]->Receive(this,dest,boxGrow,time,index,scomp,ncomp,f,true); - m_level_afpi[iteration-1]->destGraph->setFireableRegion(f); - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->unfireableRegionQueue->queueSize(true) !=0 && - m_level_afpi[iteration-1]->destGraph->worker[tg]->fireableRegionQueue->queueSize(true) < 2) - continue; - } - - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->queueSize() != 0) - { - f = m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->removeRegion(); - - if(push & level == m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel() && iteration < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level)) - m_level_afpi[iteration]->SendIntraLevel(*(this),boxGrow,time+dt,index,scomp,ncomp,iteration,f,true); - - if(push & level < m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel()) - { - for(int i=0; i < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level+1); i++) - { - m_upper_level_afpi[i]->SendInterLevel(this,boxGrow,time+(i*m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level+1)),index,scomp,ncomp,i+1,f,true); - } - } - m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->addRegion(f,true); - } - } - //fout.close(); - ////////////////////////////////////////////////////////Push Pull Thread End//////////////////// - } - else - { - //fout << "Calling init "<< std::endl; - //fout.close(); - init(); - } - } - -#else - - RGIter::RGIter(Vector afpi, Vector upper_afpi, - amrex::MultiFab& dest, int bG, double tm, int ind, int sc, int nc, int itr): - itrGraph(afpi[itr-1]->destGraph), - m_level_afpi(afpi), - m_upper_level_afpi(upper_afpi), - _dest(&dest), - boxGrow(bG), - time(tm), - index(ind), - scomp(sc), - ncomp(nc), - iteration(itr), - implicit(true), - ppteams(true), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) 
- { - int myProc = amrex::ParallelDescriptor::MyProc(); - bool push = true; - - int tid = perilla::tid(); - int tg = perilla::wid(); - int ntid = perilla::wtid(); - - if(perilla::isCommunicationThread()) - { - std::vector flattenedGraphArray; - Perilla::flattenGraphHierarchy(m_level_afpi[iteration-1]->m_amrlevel.parent->graphArray, flattenedGraphArray); - while(true){ - Perilla::serviceMultipleGraphCommDynamic(flattenedGraphArray,true,perilla::tid()); - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - break; - } - } - }else -{ - - AsyncFillPatchIterator::initialSend(m_level_afpi, m_upper_level_afpi, boxGrow, time, index, scomp, ncomp, iteration); - syncAllWorkerThreads(); - - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - m_level_afpi[iteration-1]->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - if(ntid == perilla::NUM_THREADS_PER_TEAM-2) - { - int f; - int level = m_level_afpi[iteration-1]->m_amrlevel.level; - double dt = m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level); - this->currentItr = 1; - this->totalItr = 1; - while(m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->queueSize(true) != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks || - m_level_afpi[iteration-1]->destGraph->worker[tg]->computedTasks != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks) - { - f = m_level_afpi[iteration-1]->destGraph->getFireableRegion(tg); - if(f != -1) - { - m_level_afpi[iteration-1]->Receive(this,*_dest,boxGrow,time,index,scomp,ncomp,f,true); - m_level_afpi[iteration-1]->destGraph->setFireableRegion(f); - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->unfireableRegionQueue->queueSize(true) !=0 && - m_level_afpi[iteration-1]->destGraph->worker[tg]->fireableRegionQueue->queueSize(true) < 2) - continue; - } - - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->queueSize() != 0) - { - f = m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->removeRegion(); - - if(push & level == m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel() && iteration < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level)) - m_level_afpi[iteration]->SendIntraLevel(*(this),boxGrow,time+dt,index,scomp,ncomp,iteration,f,true); - - if(push & level < m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel()) - { - for(int i=0; i < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level+1); i++) - { - m_upper_level_afpi[i]->SendInterLevel(this,boxGrow,time+(i*m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level+1)),index,scomp,ncomp,i+1,f,true); - } - } - m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->addRegion(f,true); - } - } - } - else - { - //fout << "Calling init "<< std::endl; - //fout.close(); - init(); - } -} - - } - -#endif - using namespace perilla; - - RGIter::~RGIter() - { - //fout.close(); - } - - void RGIter::init() - { - if(itrGraph->fabTiles.size() == 0) - tiling = false; - else - tiling = true; - - int myProc = amrex::ParallelDescriptor::MyProc(); - if(implicit) - { - if(!itrGraph->isGraphEmptyV2()) - { - currentRegion = itrGraph->getPulledFireableRegion(); - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) ); - else - totalItr = 1; - - currentItr = 1; - - currentTile = 0; - if(tiling) - for(currentTile = 0; 
currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - else - { - } - } - else - { - if(!itrGraph->isGraphEmpty()) - { - if(haveDepGraph) - currentRegion = itrGraph->getAnyFireableRegion(*depGraph); - else - currentRegion = itrGraph->getAnyFireableRegion(); - - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) ); - else - totalItr = 1; - - currentItr = 1; - - currentTile = 0; - if(tiling) - for(currentTile = 0; currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - else - { - } - } - } - - //! Increment iterator to the next tile we own. - void RGIter::operator++ () - { - - currentItr++; - - if(tiling) - for( (currentTile == itrGraph->fabTiles[currentRegion]->numTiles ? currentTile : ++currentTile); currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - { - if(implicit) - { - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - else - { - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - } - - int myProc = amrex::ParallelDescriptor::MyProc(); - - if( currentItr > totalItr )//&& currentTile == itrGraph->fabTiles[currentRegion]->numTiles) - { - //if(WorkerThread::isTeamMasterThread(tid) ) - //fout << "++B GEmpty " << itrGraph->isGraphEmpty(tg) << std::endl; - - //fout << "++B CmpReg isGE " << (implicit? itrGraph->isGraphEmptyV2(tg) : itrGraph->isGraphEmpty(tg)) << " CompleteQ "<< itrGraph->worker[tg]->nompletedRegionQueue->queueSize(true) << " totTasks " << itrGraph->worker[tg]->totalTasks << " FireQ "<< itrGraph->worker[tg]->fireableRegionQueue->queueSize(true) << " UnfireQ "<< itrGraph->worker[tg]->unfireableRegionQueue->queueSize(true) << std::endl; - - if(implicit) - itrGraph->regionComputed(currentRegion); - else - itrGraph->finalizeRegion(currentRegion); - - //if(WorkerThread::isTeamMasterThread(tid) ) - //fout << "++A GEmpty " << itrGraph->isGraphEmpty(tg) << std::endl; - - //fout << "++A CmpReg isGE " << (implicit? 
itrGraph->isGraphEmptyV2(tg) : itrGraph->isGraphEmpty(tg)) << " CompleteQ "<< itrGraph->worker[tg]->completedRegionQueue->queueSize(true) << " totTasks " << itrGraph->worker[tg]->totalTasks << " FireQ "<< itrGraph->worker[tg]->fireableRegionQueue->queueSize(true) << " UnfireQ "<< itrGraph->worker[tg]->unfireableRegionQueue->queueSize(true) << std::endl; - - if(implicit) - { - if(!itrGraph->isGraphEmptyV2()) - { - currentRegion = itrGraph->getPulledFireableRegion(); - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) ); - else - totalItr = 1; - - currentItr = 1; - - currentTile = 0; - if(tiling) - for(currentTile = 0; currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - else - { - //fout << "Graph is Empty" << std::endl; - //currentRegion = 0; - //currentTile = 0; - } - } - else - { - if(!itrGraph->isGraphEmpty()) - { -// double start_time_wtime = omp_get_wtime(); - - if(haveDepGraph) - currentRegion = itrGraph->getAnyFireableRegion(*depGraph); - else - currentRegion = itrGraph->getAnyFireableRegion(); - -// double end_time_wtime = omp_get_wtime(); -// getFireableTime += end_time_wtime - start_time_wtime; - - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) ); - else - totalItr = 1; - - currentItr = 1; - - currentTile = 0; - if(tiling) - for(currentTile = 0; currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - } - } - - //fout << "++E Region " << currentRegion << " Tile " << currentTile << " numTile "<< itrGraph->fabTiles[currentRegion]->numTiles <<" tid " << tid << " myP " << myProc <isGraphEmptyV2(); - if(valid) - do_remaining = false; - } - - if(do_remaining) - { - bool push = false; - - int f; - int level = m_level_afpi[iteration-1]->m_amrlevel.level; - double dt = m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level); - this->currentItr = 1; - this->totalItr = 1; - - while(!itrGraph->isGraphEmpty()) - { - f = itrGraph->worker[tg]->computedRegionQueue->getFrontRegion(true); - - if(push & level == m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel() && iteration < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level)) - m_level_afpi[iteration]->SendIntraLevel(this,boxGrow,time+dt,index,scomp,ncomp,iteration,f,false); - //else if(level == parent->finestLevel() && iteration == ncycle) - //SborderAFPI[0]->PushOnly(NUM_GROW, time+dt, State_Type, 0, NUM_STATE, f, tid, 0x02, 1); - - if(push & level < m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel()) - { - for(int i=0; i < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level+1); i++) - { - m_upper_level_afpi[i]->SendInterLevel(this,boxGrow,time+(i*m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level+1)),index,scomp,ncomp,i+1,f,false); - //upperLevel.SborderAFPI[i]->PushOnly(NUM_GROW, time+(i*parent->dtLevel(level+1)), State_Type, 0, NUM_STATE, f, tid, tuc, tempf, false); - } - } - - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - { - f = itrGraph->worker[tg]->computedRegionQueue->removeRegion(); - 
itrGraph->worker[tg]->completedRegionQueue->addRegion(f,true); - } - } - - - //m_level_afpi[iteration-1]->destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(perilla::isMasterWorkerThread()) - m_level_afpi[iteration-1]->completeRegionGraphs(); - valid = false; - } - } - else - { - if(itrGraph->isGraphEmpty()) - if(perilla::isMasterWorkerThread()) - { - itrGraph->finalizeRegionGraph(); - } - valid = !(itrGraph->isGraphEmpty()); - } - /* - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(!isV && tg==0 && myProc==0) - if(WorkerThread::isTeamMasterThread(tid)) - fout << " M " <worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - */ - - /* - fout << "isValid Ending " << !(itrGraph->isGraphEmpty(tg)) << " tid " << tid <graphID != -1) - //Perilla::getAnyFRTimeSplit[itrGraph->graphID-1] += getFireableTime; - //if(myProc == 150 && itrGraph->graphID != -1) - //{ - // std::cout << "gID " << itrGraph->graphID << " getFRTime " << getFireableTime << std::endl; - //} - } - - return valid; - } - - amrex::Box RGIter::tileBox() - { - - int myProc = amrex::ParallelDescriptor::MyProc(); - //fout.open(std::to_string(myProc)+ "_" + std::to_string(tid) + ".txt", std::fstream::app); - - //fout << "nTls " << itrGraph->fabTiles[currentRegion]->numTiles << " cT " << currentTile << std::endl; - - if(currentTile == itrGraph->fabTiles[currentRegion]->numTiles) - //if( (currentTile % (perilla::NUM_THREADS_PER_TEAM-1) != ntid-1) ) - { - //fout << "invalidBox " << std::endl; - //fout.close(); - return amrex::Box(); - } - else - { - //fout << "validBox tBxSize " << itrGraph->fabTiles[currentRegion]->tileBx.size() << std::endl; - //fout.close(); - return *(itrGraph->fabTiles[currentRegion]->tileBx[currentTile]); - } - } - - amrex::Box RGIter::validBox() const - { - return *(itrGraph->fabTiles[currentRegion]->validBx); - } - - amrex::Box RGIter::tilebox() - { - return this->tileBox(); - } - - amrex::Box RGIter::growntilebox() - { - - } - - amrex::Box RGIter::growntilebox(int ng) - { - - Box bx = this->tileBox(); - if(currentTile == itrGraph->fabTiles[currentRegion]->numTiles) - return bx; - - if (ng < -100) ng = 0; - const Box& vbx = validBox(); - for (int d=0; dtileBox(); - bx.convert(typ); - const Box& vbx = this->validBox(); - const IntVect& Big = vbx.bigEnd(); - int d0, d1; - if (dir < 0) { - d0 = 0; - d1 = BL_SPACEDIM-1; - } else { - d0 = d1 = dir; - } - for (int d=d0; d<=d1; ++d) { - if (typ.cellCentered(d)) { // validbox should also be cell-centered in d-direction. 
- bx.surroundingNodes(d); - if (bx.bigEnd(d) <= Big[d]) { - bx.growHi(d,-1); - } - } - } - return bx; - } - - void RGIter::sync_workers() - { - - if(implicit) - itrGraph->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1); - else - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - } -} diff --git a/Src/AmrTask/rts_impls/mpi_omp/RegionGraph.H b/Src/AmrTask/rts_impls/mpi_omp/RegionGraph.H deleted file mode 100755 index 2683e7bec57..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/RegionGraph.H +++ /dev/null @@ -1,324 +0,0 @@ -#ifndef P_REGIONGRAPH_H -#define P_REGIONGRAPH_H - -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include "PerillaMemCheck.H" -extern PerillaMemCheck memcheck; -#endif - -namespace amrex{ - - class RegionGraph; - - class FabCopyAssoc - { - public: - LocalConnection l_con; - RemoteConnection r_con; - FabCopyAssoc *next; - FabCopyAssoc *prev; - RegionGraph *graphPartner; - // CopyAssoc *orig_copyAssoc; - FabCopyAssoc() - { - next=0; - prev=0; - graphPartner=0; - } - ~FabCopyAssoc() - { - if(next) delete next; - } - }; - - class pTileArray - { - public: - int numTiles; - std::vector tileBx; - Box* validBx; - pTileArray(): numTiles(0), tileBx(0){} - ~pTileArray() - { - tileBx.clear(); - } - }; - - class Task - { - public: - std::vector state; - bool init; - FabCopyAssoc *cpAsc_srcHead; - FabCopyAssoc *cpAsc_dstHead; - - std::vector depTaskIDs; - int numDepTasksCompleted; - bool depTasksCompleted; - - Task() - { - state.reserve(16); - depTaskIDs.reserve(1024); - depTasksCompleted = true; - numDepTasksCompleted = 0; - cpAsc_srcHead=0; - cpAsc_dstHead=0; - } - - ~Task() - { - state.clear(); - depTaskIDs.clear(); - if(cpAsc_srcHead != 0) - delete[] cpAsc_srcHead; - if(cpAsc_dstHead != 0) - delete[] cpAsc_dstHead; - } - }; - - class Worker - { - public: - int totalTasks; - int computedTasks; - bool init; - Barrier* barr; - Barrier* l_barr; - RegionQueue* fireableRegionQueue; - RegionQueue* unfireableRegionQueue; - RegionQueue* computedRegionQueue; - RegionQueue* completedRegionQueue; - Worker():init(false), l_barr(0), barr(0), totalTasks(0){} - - ~Worker(){ - delete barr; - delete l_barr; - delete fireableRegionQueue; - delete unfireableRegionQueue; - delete computedRegionQueue; - delete completedRegionQueue; - } - }; - - //template - class CopyMap - { - public: - std::vector map; - CopyMap *next; - CopyMap() - : - next(0) - { -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "CopyMap"); -#endif - - } - - void alloc_CopyMap(const MultiFab& mf) - { - - Vector IndArr = mf.IndexArray(); - const int n = IndArr.size(); - //const int n = mf.size(); - map.reserve(n); - //sMap.resize(n); - for (int i = 0; i < n; ++i) - { - int K = IndArr[i]; - const Box& tmp = mf.fabbox(K); - map.push_back(new FArrayBox(tmp, mf.nComp(), false, true)); - } - } - ~CopyMap() - { - for (int i = 0; i < map.size(); ++i) - { - delete map[i]; - } - map.clear(); - if(next !=0 ) - delete next; -#ifdef PERILLA_DEBUG - memcheck.remove(memcheck.genKey(this)); -#endif - } - }; - - class RegionGraph - { - public: - static int graphCnt; - int graphID; - int numTasks; - int numFabs; - int totalFinishes; - bool isDepGraph; - bool* okToReset; - omp_lock_t finishLock; - - std::vector fabTiles; - std::vector fabTiles_gtbx; - - std::vector lMap; - std::vector sMap; - std::vector rMap; - CopyMap *sCopyMapHead; - CopyMap *rCopyMapHead; - - 
std::vector task; - std::vector worker; - - RegionGraph* srcLinkGraph; - - public: - RegionGraph(int numtasks); - void Initialize(); - void Reset(); - bool isGraphEmpty(); - bool isGraphEmptyV2(); - void finalizeGraph(); - void regionGraphReset(int numfabs); - void regionGraphMinReset(void); - void enableAllRegions(); - void disableRegion(int r, int tg); - void finalizeRegion(int r); - void finalizeRegionGraph(); - void regionComputed(int r); - bool isFireableRegion(int r); - int getAnyFireableRegion(); - int getAnyFireableRegion(RegionGraph& depGraph); - int getPulledFireableRegion(); - int getFireableRegion(bool isSingleThread=false); - void setFireableRegion(int r); - void graphTeardown(); - void workerTeardown(); - int size(){return task.size();} - - int getRegion(){ - return worker[perilla::wid()]->computedRegionQueue->getFrontRegion(true); - } - - void syncComputeWorkerThreads(){ - worker[perilla::wid()]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - void syncComputeWorkerThreads(int numthreads){ - worker[perilla::wid()]->barr->sync(numthreads); - } - - void syncWorkerThreads(){ - worker[perilla::wid()]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - void syncWorkerThreads(int numthreads){ - worker[perilla::wid()]->barr->sync(numthreads); - } - - void alloc_lMap(const MultiFab& mf) - { - const int n = mf.IndexArray().size(); - lMap.reserve(n); - for (int i = 0; i < n; ++i) - { - int K = mf.IndexArray()[i]; - const Box& tmp = mf.fabbox(K); - lMap.push_back(new FArrayBox(tmp, mf.nComp(), false, true)); - } - } - - void alloc_sMap(const MultiFab& mf) - { - const int n = mf.IndexArray().size(); - sMap.reserve(n); - for (int i = 0; i < n; ++i) - { - int K = mf.IndexArray()[i]; - const Box& tmp = mf.fabbox(K); - sMap.push_back(new FArrayBox(tmp, mf.nComp(), false, true)); - } - } - - void alloc_rMap(const MultiFab& mf) - { - const int n = mf.IndexArray().size(); - rMap.reserve(n); - for (int i = 0; i < n; ++i) - { - int K = mf.IndexArray()[i]; - const Box& tmp = mf.fabbox(K); - rMap.push_back(new FArrayBox(tmp, mf.nComp(), false, true)); - } - } - - void buildTileArray(const MultiFab& mf) - { - const int n = mf.IndexArray().size(); - fabTiles.resize(n); - - for (int i = 0; i < n; ++i) - { - fabTiles[i] = new pTileArray(); - } - for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) - { - fabTiles[mfi.LocalIndex()]->numTiles++; - fabTiles[mfi.LocalIndex()]->tileBx.push_back(new Box(mfi.tilebox())); - fabTiles[mfi.LocalIndex()]->validBx = new Box(mfi.validbox()); - } - } - - void buildTileArray_gtbx(const MultiFab& mf, int ng) - { - const int n = mf.IndexArray().size(); - fabTiles_gtbx.resize(n); - - for (int i = 0; i < n; ++i) - { - fabTiles_gtbx[i] = new pTileArray(); - } - for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) - { - fabTiles_gtbx[mfi.LocalIndex()]->numTiles++; - fabTiles_gtbx[mfi.LocalIndex()]->tileBx.push_back(new Box(mfi.growntilebox(ng))); - } - } - - void buildTileArray(const amrex::MultiFab& mf, const amrex::IntVect& tilesize) - { - int myProc = amrex::ParallelDescriptor::MyProc(); - const int n = mf.indexArray.size(); - fabTiles.resize(n); - - //typ = mf.boxArray().ixType(); - - for (int i = 0; i < n; ++i) - { - fabTiles[i] = new pTileArray(); - } - for (amrex::MFIter mfi(mf, tilesize); mfi.isValid(); ++mfi) - { - if( fabTiles[mfi.LocalIndex()]->numTiles == 0 ) - fabTiles[mfi.LocalIndex()]->validBx = new amrex::Box(mfi.validbox()); - fabTiles[mfi.LocalIndex()]->numTiles++; - fabTiles[mfi.LocalIndex()]->tileBx.push_back(new amrex::Box(mfi.tilebox())); - } - } - - - 
~RegionGraph(); - }; -}//end namespace - - -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/RegionGraph.cpp b/Src/AmrTask/rts_impls/mpi_omp/RegionGraph.cpp deleted file mode 100755 index 895037152f8..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/RegionGraph.cpp +++ /dev/null @@ -1,945 +0,0 @@ -#include -#include -#include -#include - -using namespace std; -using namespace amrex; -using namespace perilla; - -int RegionGraph::graphCnt = 0; - -RegionGraph::RegionGraph(int numtasks) -{ - sCopyMapHead = 0; - rCopyMapHead = 0; - srcLinkGraph = 0; - isDepGraph = false; - numFabs = numtasks; - numTasks = numtasks; - graphID = ++graphCnt; - worker.resize(perilla::NUM_THREAD_TEAMS); - task.resize(numTasks); - totalFinishes=0; - okToReset = new bool[perilla::NUM_THREAD_TEAMS]; - omp_init_lock(&finishLock); - Initialize(); -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -void RegionGraph::Initialize() -{ - int numfabs = numTasks; - int numthreads = omp_get_num_threads(); - - if(numthreads==1) - { -#pragma omp parallel shared(numfabs) - { - int tg = WorkerThread::perilla_wid(); - - if(WorkerThread::perilla_isMasterWorkerThread()) - { - worker[tg] = new Worker(); - worker[tg]->barr = new Barrier(perilla::NUM_THREADS_PER_TEAM-1); - worker[tg]->l_barr = new Barrier(perilla::NUM_THREADS_PER_TEAM-2); - if(numfabs <= perilla::TASK_QUEUE_DEFAULT_MAXSIZE) - { - worker[tg]->fireableRegionQueue = new RegionQueue(); - worker[tg]->unfireableRegionQueue = new RegionQueue(); - worker[tg]->computedRegionQueue = new RegionQueue(); - worker[tg]->completedRegionQueue = new RegionQueue(); - } - else - { - worker[tg]->fireableRegionQueue = new RegionQueue(numfabs); - worker[tg]->unfireableRegionQueue = new RegionQueue(numfabs); - worker[tg]->computedRegionQueue = new RegionQueue(numfabs); - worker[tg]->completedRegionQueue = new RegionQueue(numfabs); - } - worker[tg]->totalTasks = 0; - worker[tg]->computedTasks = 0; - for(int f=0; f < numfabs; f++) - if(WorkerThread::isMyRegion(tg,f)) - { - task[f] = new Task(); - worker[tg]->unfireableRegionQueue->addRegion(f); - worker[tg]->totalTasks++; - for(int i=0; i<16; i++) - task[f]->state[i] = 0; - task[f]->init = true; - } - worker[tg]->init = true; - okToReset[tg] = false; - } - }// omp parallel end - } - else // numthread are > 1, so already in parallel region - { - int tg = WorkerThread::perilla_wid(); - if(WorkerThread::perilla_isMasterWorkerThread() && worker[tg]->init == false ) - { - worker[tg]->barr = new Barrier(perilla::NUM_THREADS_PER_TEAM-1); - worker[tg]->l_barr = new Barrier(perilla::NUM_THREADS_PER_TEAM-2); - worker[tg]->fireableRegionQueue = new RegionQueue(); - worker[tg]->unfireableRegionQueue = new RegionQueue(); - worker[tg]->completedRegionQueue = new RegionQueue(); - worker[tg]->totalTasks = 0; - worker[tg]->computedTasks = 0; - for(int f=0; f < numfabs; f++) - if(WorkerThread::isMyRegion(tg,f)) - { - worker[tg]->unfireableRegionQueue->addRegion(f); - worker[tg]->totalTasks++; - for(int i=0; i<16; i++) - task[f]->state[i] = 0; - task[f]->init = true; - } - worker[tg]->init = true; - } - } -} - -void RegionGraph::Reset() -{ - int tg= perilla::wid(); - omp_set_lock(&finishLock); - if(okToReset[tg]) - totalFinishes--; - omp_unset_lock(&finishLock); - - if(okToReset[tg]) - { - worker[tg]->totalTasks = 0; - worker[tg]->computedTasks = 0; - while(worker[tg]->completedRegionQueue->queueSize(true) > 0) - { - int r = worker[tg]->completedRegionQueue->removeRegion(true); - if(WorkerThread::isMyRegion(tg, r)) - { 
- worker[tg]->unfireableRegionQueue->addRegion(r,true); - worker[tg]->totalTasks++; - for(int i=0; i<16; i++) - task[r]->state[i] = 0; - task[r]->init = true; - if(task[r]->depTaskIDs.size() > 0) - task[r]->depTasksCompleted = false; - } - else - break; - } - } -} - -bool RegionGraph::isGraphEmpty() -{ - int tg= perilla::wid(); - //worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - perilla::syncWorkerThreads(); - if(worker[tg]->completedRegionQueue->queueSize(true)== worker[tg]->totalTasks) - return true; - return false; -} - -bool RegionGraph::isGraphEmptyV2() -{ - int tg=perilla::wid(); - - if(worker[tg]->completedRegionQueue->queueSize(true) == worker[tg]->totalTasks || worker[tg]->computedTasks == worker[tg]->totalTasks) - return true; - return false; -} - -void RegionGraph::finalizeGraph() -{ - omp_set_lock(&finishLock); - totalFinishes++; - int tg=perilla::wid(); - okToReset[tg]=true; - omp_unset_lock(&finishLock); -} - -void RegionGraph::regionGraphReset(int numfabs) -{ - int nt; - int tg; - int r; - //#pragma omp parallel private(r,tg,nt,tid) shared(numfabs) - { - tg = perilla::wid(); - nt = perilla::wtid(); - if(perilla::isMasterThread()) - totalFinishes=0; - //#pragma omp barrier - if(perilla::isMasterWorkerThread()) - { - worker[tg]->totalTasks = 0; - worker[tg]->computedTasks = 0; - while(worker[tg]->completedRegionQueue->queueSize(true) > 0) - { - r = worker[tg]->completedRegionQueue->removeRegion(true); - if(WorkerThread::isMyRegion(tg, r)) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - worker[tg]->totalTasks++; - for(int i=0; i<16; i++) - task[r]->state[i] = 0; - task[r]->init = true; - } - else - break; - } - okToReset[tg] = false; - } - }// omp parallel end -} - - -void RegionGraph::regionGraphMinReset(void) -{ - int nt; - int tg; - int r; - { - tg = perilla::wid(); - nt = perilla::wtid(); - if(perilla::isMasterThread()) - totalFinishes=0; - if(perilla::isMasterWorkerThread()) - { - while(worker[tg]->completedRegionQueue->queueSize(true) > 0) - { - r = worker[tg]->completedRegionQueue->removeRegion(true); - if(WorkerThread::isMyRegion(tg, r)) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - } - else - break; - } - okToReset[tg] = false; - } - } -} - - -void RegionGraph::enableAllRegions() -{ - int numfabs = numTasks; - int r; - int tg = WorkerThread::perilla_wid(); - perilla::syncWorkerThreads(); - if(perilla::isMasterWorkerThread()) - for(int f=0; funfireableRegionQueue->removeRegion(true); - worker[tg]->fireableRegionQueue->addRegion(r,true); - } - //worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - perilla::syncWorkerThreads(); -} - -void RegionGraph::disableRegion(int r, int tg) -{ - //int tg = perilla::wid(); - if(perilla::isMasterWorkerThread()) - if(WorkerThread::isMyRegion(tg, r)) - { - int rID = worker[tg]->fireableRegionQueue->removeRegion(true); - worker[tg]->unfireableRegionQueue->addRegion(rID,true); - } -} - -void RegionGraph::regionComputed(int r) -{ - int tg= perilla::wid(); - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-2); - if(perilla::isMasterWorkerThread()) - if(WorkerThread::isMyRegion(tg, r)) - { - int rr = worker[tg]->fireableRegionQueue->removeRegion(); - if(r != rr) - { - std::cout << "ERROR: In computedeRegion" << std::endl; - exit(EXIT_FAILURE); - } - worker[tg]->computedRegionQueue->addRegion(rr); - worker[tg]->computedTasks++; - } - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-2); -} - -void RegionGraph::finalizeRegion(int r) -{ - int tg= 
perilla::wid(); - int ntid=perilla::wtid(); - perilla::syncWorkerThreads(); - if(perilla::isMasterWorkerThread()) - if(WorkerThread::isMyRegion(tg, r)) - { - int rr = worker[tg]->fireableRegionQueue->removeRegion(true); - if(r != rr) - { - std::cout << "ERROR: In completeRegion" << std::endl; - exit(EXIT_FAILURE); - } - worker[tg]->completedRegionQueue->addRegion(rr,true); - } - perilla::syncWorkerThreads(); -} - -void RegionGraph::finalizeRegionGraph() -{ - int tg= perilla::wid(); - omp_set_lock(&finishLock); - totalFinishes++; - okToReset[tg]=true; - omp_unset_lock(&finishLock); -} - -bool RegionGraph::isFireableRegion(int r) -{ - int myProc = ParallelDescriptor::MyProc(); - FabCopyAssoc *cpDst = task[r]->cpAsc_dstHead; - if(lMap.size() > 0) - if(lMap[r]->l_con.firingRuleCnt != lMap[r]->l_con.ndcpy) - { - return false; - } - while(cpDst != 0) - { - if(cpDst->l_con.firingRuleCnt != cpDst->l_con.ndcpy) - { - return false; - } - cpDst = cpDst->next; - } - - if(srcLinkGraph != 0) - { - if(!task[r]->depTasksCompleted) - { - for(int i=0; idepTaskIDs.size(); i++) - if(!srcLinkGraph->isFireableRegion(task[r]->depTaskIDs[i])) - return false; - task[r]->depTasksCompleted = true; - } - } - - if(ParallelDescriptor::NProcs() == 1) return true; - - if(lMap.size() > 0) - if(lMap[r]->r_con.firingRuleCnt != lMap[r]->r_con.nrcv) - { - return false; - } - - cpDst = task[r]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->r_con.firingRuleCnt != cpDst->r_con.nrcv) - { - return false; - } - cpDst = cpDst->next; - } - return true; -} - -int RegionGraph::getFireableRegion(bool isSingleThread) -{ - int r = -1; - bool fireable; - int tg= perilla::wid(); - - if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0 && worker[tg]->fireableRegionQueue->queueSize() == 0) - { - fireable = false; - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r); - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - } - } - else if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0) - { - int unfQsize = worker[tg]->unfireableRegionQueue->queueSize(true); - for(int i = 0; i < unfQsize; i++) - { - int tr = worker[tg]->unfireableRegionQueue->removeRegion(true); - if(isFireableRegion(tr)) - { - r = tr; - break; - } - else - worker[tg]->unfireableRegionQueue->addRegion(tr,true); - } - } - - return r; -} - -#if 0 -int RegionGraph::getFireableRegion(bool patchFilled, bool isSingleThread) -{ - int r = -1; - bool fireable; - int tg= perilla::wid(); - int nt= perilla::wtid(); - - //if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0 && worker[tg]->fireableRegionQueue->queueSize() == 0) - //{ - if(!isSingleThread)worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - if(perilla::isMasterWorkerThread()) - { - if(worker[tg]->fireableRegionQueue->queueSize()==0){ - fireable = false; - assert(worker[tg]->unfireableRegionQueue->queueSize()>0); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r, patchFilled); - //fireable = true; - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - else worker[tg]->fireableRegionQueue->addRegion(r,true); - } - } - } -#if 0 - else if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0) - { - int unfQsize = worker[tg]->unfireableRegionQueue->queueSize(true); - 
for(int i = 0; i < unfQsize; i++) - { - int tr = worker[tg]->unfireableRegionQueue->removeRegion(true); - if(isFireableRegion(tr)) - { - r = tr; - break; - } - else - worker[tg]->unfireableRegionQueue->addRegion(tr,true); - } - } -#endif - if(!isSingleThread)worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - std::cout<<"FOUND A REGION"<fireableRegionQueue->getFrontRegion(true); - return r; -} -#endif - - - -#if 0 -int RegionGraph::getFireableRegion(bool patchFilled) -{ - int r = -1; - bool fireable; - int tg= perilla::wid(); - int nt= perilla::wtid(); - - //if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0 && worker[tg]->fireableRegionQueue->queueSize() == 0) - //{ - worker[tg]->barr->sync(); // Barrier to synchronize team threads - if(nt == 0 && worker[tg]->fireableRegionQueue->queueSize()==0){ - fireable = false; - assert(worker[tg]->unfireableRegionQueue->queueSize()>0); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r, patchFilled); - //fireable = true; - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - else worker[tg]->fireableRegionQueue->addRegion(r,true); - } - } -#if 0 - else if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0) - { - int unfQsize = worker[tg]->unfireableRegionQueue->queueSize(true); - for(int i = 0; i < unfQsize; i++) - { - int tr = worker[tg]->unfireableRegionQueue->removeRegion(true); - if(isFireableRegion(tr)) - { - r = tr; - break; - } - else - worker[tg]->unfireableRegionQueue->addRegion(tr,true); - } - } -#endif - worker[tg]->barr->sync(); // Barrier to synchronize team threads - r = worker[tg]->fireableRegionQueue->getFrontRegion(true); - return r; -} -#endif - -void RegionGraph::setFireableRegion(int r) -{ - worker[perilla::wid()]->fireableRegionQueue->addRegion(r); -} - - -int RegionGraph::getAnyFireableRegion() -{ - int myProc = ParallelDescriptor::MyProc(); - int tg = perilla::wid(); - int nt = perilla::wtid(); - int r; - perilla::syncWorkerThreads(); - if(nt ==0) - if(worker[tg]->fireableRegionQueue->queueSize()==0) - { - bool fireable = false; - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r); - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - else - worker[tg]->fireableRegionQueue->addRegion(r,true); - } - } - perilla::syncWorkerThreads(); - return worker[tg]->fireableRegionQueue->getFrontRegion(true); -} - -int RegionGraph::getAnyFireableRegion(RegionGraph& depGraph) -{ - int nt; - int tg; - int r; - bool fireable; - - int myProc = amrex::ParallelDescriptor::MyProc(); - - tg = perilla::wid(); - nt = perilla::wtid(); - if(nt == perilla::NUM_COMM_THREADS && worker[tg]->fireableRegionQueue->queueSize()==0) - { - fireable = false; - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r); - fireable &= depGraph.isFireableRegion(r); - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - else - worker[tg]->fireableRegionQueue->addRegion(r,true); - } - } - worker[tg]->barr->sync(); - //worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - r = 
worker[tg]->fireableRegionQueue->getFrontRegion(true); - return r; -} - - -int RegionGraph::getPulledFireableRegion() -{ - int myProc = ParallelDescriptor::MyProc(); - int tg = WorkerThread::perilla_wid(); - int nt = WorkerThread::perilla_wtid(); - if(nt == 0 && worker[tg]->fireableRegionQueue->queueSize()==0) - { - while(worker[tg]->fireableRegionQueue->queueSize()==0); - } - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-2); - return worker[tg]->fireableRegionQueue->getFrontRegion(true); -} - -void RegionGraph::graphTeardown() -{ - MPI_Status status; - Package* package; - int numfabs = numTasks; - int tg = WorkerThread::perilla_wid(); - -#if 0 - for(int f=0; fcpAsc_dstHead; - while(cpDst != 0) - { - cpDst->l_con.firingRuleCnt = 0; - - for(int i=0; il_con.ndcpy; i++) - { - while(cpDst->l_con.dcpy[i].pQueue.queueSize() >= 1) - { - package = cpDst->l_con.dcpy[i].pQueue.dequeue(); - //package->completed = false; - //package->served = false; - //package->notified = false; - //package->request = MPI_REQUEST_NULL; - cpDst->l_con.dcpy[i].recycleQueue.enqueue(package); - } - } - - cpDst = cpDst->next; - } - } - } - - - for(int f=0; fcpAsc_srcHead; - while(cpSrc != 0) - { - //cpSrc->l_con.firingRuleCnt = 0; - - for(int i=0; il_con.nscpy; i++) - { - while(cpSrc->l_con.scpy[i].pQueue.queueSize() >= 1) - { - package = cpSrc->l_con.scpy[i].pQueue.dequeue(); - - FabCopyAssoc* cpDst = cpSrc->graphPartner->task[cpSrc->l_con.scpy[i].nd]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->graphPartner == this) //graphArray[g]) - break; - cpDst = cpDst->next; - } - //Package* sPackage = cpSrc->l_con.scpy[i].pQueue.dequeue(true); - omp_set_lock(&(cpDst->l_con.dLock)); - int dPartner = cpSrc->l_con.scpy[i].dPartner; - Package* dPackage = cpDst->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - /* - for(int j=0; jbufSize; j++) - { - dPackage->databuf[j] = sPackage->databuf[j]; - } - */ - std::memcpy(dPackage->databuf, package->databuf, dPackage->bufSize * sizeof(double)); - //std::swap(dPackage->databuf, sPackage->databuf); - - cpDst->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(cpDst->l_con.dcpy[dPartner].pQueue.queueSize(true) == 1) - cpDst->l_con.firingRuleCnt++; - omp_unset_lock(&(cpDst->l_con.dLock)); - //cpSrc->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - - //package->completed = false; - //package->served = false; - //package->notified = false; - //package->request = MPI_REQUEST_NULL; - cpSrc->l_con.scpy[i].recycleQueue.enqueue(package); - } - } - - cpSrc = cpSrc->next; - } - } - } - - - - for(int f=0; f 0) - { - lMap[f]->l_con.firingRuleCnt = 0; - } - } - } -#endif - - if(ParallelDescriptor::NProcs() == 1) return; - - - - for(int f=0; fcpAsc_dstHead; - while(cpDst != 0) - { - cpDst->r_con.firingRuleCnt = 0; - for(int i=0; ir_con.nrcv; i++) - { - while(cpDst->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = cpDst->r_con.rcv[i].pQueue.dequeue(); - package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - cpDst->r_con.rcv[i].recycleQueue.enqueue(package); - } - } - - cpDst = cpDst->next; - } - } - } - - - for(int f=0; fcpAsc_srcHead; - while(cpSrc != 0) - { - //cpSrc->r_con.firingRuleCnt = 0; - for(int i=0; ir_con.nsnd; i++) - { - while(cpSrc->r_con.snd[i].pQueue.queueSize() >= 1) - { - package = cpSrc->r_con.snd[i].pQueue.dequeue(); - package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - 
cpSrc->r_con.snd[i].recycleQueue.enqueue(package); - } - } - - cpSrc = cpSrc->next; - } - } - } - - -#if 0 - if(tg == 0) - { - CopyMap* cpDst = rCopyMapHead; - while(cpDst != 0) - { - for(int f=0; fmap.size(); f++) - { - cpDst->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; imap[f]->r_con.nrcv; i++) - { - while(cpDst->map[f]->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = cpDst->map[f]->r_con.rcv[i].pQueue.dequeue(); - if(package->request != MPI_REQUEST_NULL) - MPI_Cancel( &(package->request) ); - package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - cpDst->map[f]->r_con.rcv[i].recycleQueue.enqueue(package); - } - } - - } - - cpDst = cpDst->next; - } - - - CopyMap* cpSrc = sCopyMapHead; - while(cpSrc != 0) - { - for(int f=0; fmap.size(); f++) - { - for(int i=0; imap[f]->r_con.nsnd; i++) - { - while(cpSrc->map[f]->r_con.snd[i].pQueue.queueSize() >= 1) - { - - package = cpSrc->map[f]->r_con.snd[i].pQueue.dequeue(); - /* - int ns = cpSrc->map[f]->r_con.snd[i].ns; - int nd = cpSrc->map[f]->r_con.snd[i].nd; - int r_gid = cpSrc->map[f]->r_con.snd[i].r_gid; - int r_grids = cpSrc->map[f]->r_con.snd[i].r_grids; - //int tag = tagGen(ns, nd, r_gid-1, np*r_grids, nGraphs); - int tag = Perilla::myTagMap[r_gid][nd][ns][cpSrc->map[f]->r_con.snd[i].sz]; - - Package* sPackage = lMap[f]->r_con.snd[i].pQueue.getFront(true); - package->request = ParallelDescriptor::Asend(sPackage->databuf, - cpSrc->map[f]->r_con.snd[i].sz, - cpSrc->map[f]->r_con.snd[i].pr, tag).req(); // tag == SeqNum in c++ ver - - */ - MPI_Wait( &(package->request), &status ); - package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - cpSrc->map[f]->r_con.snd[i].recycleQueue.enqueue(package); - } - } - - } - - cpSrc = cpSrc->next; - } - } - - //if(WorkerThread::isTeamMasterThread(tid)) commented out b/c its already call by single thread in a team - //Perilla::globalBarrier->sync(perilla::NUM_THREAD_TEAMS); - - // Parallel Copy Reset on Local tg - for(int f=0; f 0) - { - lMap[f]->r_con.firingRuleCnt = 0; - - for(int i=0; ir_con.nsnd; i++) - while(lMap[f]->r_con.snd[i].pQueue.queueSize() >= 1) - { - package = lMap[f]->r_con.snd[i].pQueue.dequeue(); - package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - lMap[f]->r_con.snd[i].recycleQueue.enqueue(package); - } - - for(int i=0; ir_con.nrcv; i++) - while(lMap[f]->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = lMap[f]->r_con.rcv[i].pQueue.dequeue(); - package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - lMap[f]->r_con.rcv[i].recycleQueue.enqueue(package); - } - } - } - } - - // Fill boundary reset on local tg - if(tg == 0) - { - for(int f=0; f 0) - { - // if(WorkerThread::isMyRegion(tg,f)) - { - for(int i=0; i< rMap[f]->r_con.nrcv; i++) - while( rMap[f]->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = rMap[f]->r_con.rcv[i].pQueue.dequeue(); - if(package->request != MPI_REQUEST_NULL) - MPI_Cancel( &(package->request) ); - package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - rMap[f]->r_con.rcv[i].recycleQueue.enqueue(package); - } - for(int i=0; i< sMap[f]->r_con.nsnd; i++) - while( sMap[f]->r_con.snd[i].pQueue.queueSize() >= 1) - { - package = sMap[f]->r_con.snd[i].pQueue.dequeue(); - MPI_Wait( &(package->request), &status ); - 
package->completed = false; - package->served = false; - package->notified = false; - package->request = MPI_REQUEST_NULL; - sMap[f]->r_con.snd[i].recycleQueue.enqueue(package); - } - } - } - } - } -#endif - -} - -void RegionGraph::workerTeardown() -{ - int numfabs = numTasks; - Package* package; - - regionGraphMinReset(); -} - -RegionGraph::~RegionGraph() -{ - delete[] okToReset; - for(int tg=0; tg -#include - -//////////////////////// class RegionQueue Declaration Start ///////////////////////////////////// -class RegionQueue -{ -private: - int* buffer; - int n; - int front; - int rear; - int max_size; - omp_lock_t queueLock; -public: - RegionQueue(); - RegionQueue(int numTasks); - ~RegionQueue(); - void addRegion(int r); - void addRegion(int r, bool lockIgnore); - int removeRegion(); - int removeRegion(bool lockIgnore); - int getFrontRegion(); - int getFrontRegion(bool lockIgnore); - int queueSize(bool lockIgnore); - int queueSize(); - void reallocate(); -}; -//////////////////////// class RegionQueue Declaration End ///////////////////////////////////// - - -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/RegionQueue.cpp b/Src/AmrTask/rts_impls/mpi_omp/RegionQueue.cpp deleted file mode 100755 index 14a72c79138..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/RegionQueue.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include -#include -#include -#include - -//////////////////////// class RegionQueue Definition Start ///////////////////////////////////// - RegionQueue::RegionQueue(void) - { - max_size= perilla::TASK_QUEUE_DEFAULT_MAXSIZE; - buffer = new int[max_size]; - n = 0; - front = 0; - rear = 0; - omp_init_lock(&queueLock); - } - - RegionQueue::RegionQueue(int numTasks) - { - buffer = new int[numTasks]; - n = 0; - max_size = numTasks; - front = 0; - rear = 0; - omp_init_lock(&queueLock); - } - - RegionQueue::~RegionQueue() - { - delete[] buffer; - } - - void RegionQueue::addRegion(int r) - { - omp_set_lock(&queueLock); - buffer[rear] = r; - rear = (rear+1)%max_size; - n++; - omp_unset_lock(&queueLock); - } - - void RegionQueue::addRegion(int r, bool lockIgnore) - { - if(!lockIgnore)omp_set_lock(&queueLock); - buffer[rear] = r; - rear = (rear+1)%max_size; - n++; - if(!lockIgnore)omp_unset_lock(&queueLock); - } - - int RegionQueue::removeRegion() - { - int r; - omp_set_lock(&queueLock); - r = buffer[front]; - front = (front+1)%max_size; - n--; - omp_unset_lock(&queueLock); - return r; - } - - int RegionQueue::removeRegion(bool lockIgnore) - { - int r; - if(!lockIgnore)omp_set_lock(&queueLock); - r = buffer[front]; - front = (front+1)%max_size; - n--; - if(!lockIgnore)omp_unset_lock(&queueLock); - return r; - } - - int RegionQueue::getFrontRegion() - { - return buffer[front]; - } - - int RegionQueue::getFrontRegion(bool lockIgnore) - { - if(!lockIgnore)omp_set_lock(&queueLock); - return buffer[front]; - if(!lockIgnore)omp_unset_lock(&queueLock); - } - - int RegionQueue::queueSize() - { - int size; - omp_set_lock(&queueLock); - size = n; - omp_unset_lock(&queueLock); - return size; - } - - int RegionQueue::queueSize(bool lockIgnore) - { - int size; - if(!lockIgnore)omp_set_lock(&queueLock); - size = n; - if(!lockIgnore)omp_unset_lock(&queueLock); - return size; - } -//////////////////////// class RegionQueue Definition End ///////////////////////////////////// diff --git a/Src/AmrTask/rts_impls/mpi_omp/RemoteConnection.H b/Src/AmrTask/rts_impls/mpi_omp/RemoteConnection.H deleted file mode 100755 index 47c16dac9bf..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/RemoteConnection.H +++ 
/dev/null @@ -1,95 +0,0 @@ -#ifndef P_REMOTECONNECTION_H -#define P_REMOTECONNECTION_H - -#include -#include -#include - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include "PerillaMemCheck.H" -extern PerillaMemCheck memcheck; -#endif - -class RemoteCommDescriptor -{ -public: - int ns, lns; // ! Source box in layout - int nd, lnd; //! Destination box in layout - int r_gid; - int r_grids; - int sz, pr; - Box sbx; // ! Sub-box for this copy - Box dbx; // ! Sub-box for this copy - PackageQueue pQueue; // !store incoming or outgoing messages, both fab and the runtime can access this queue - PackageQueue recycleQueue; //!store used messages, only fab can access this queue, no lock is required - int cnt; - RemoteCommDescriptor() : - ns(-1), lns(-1), - nd(-1), lnd(-1), - sz(0), pr(0), - cnt(0), r_gid(0), - r_grids(0) - { -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "RemoteCommDescriptor"); -#endif - } - ~RemoteCommDescriptor(){ -#ifdef PERILLA_DEBUG - memcheck.remove(memcheck.genKey(this)); -#endif - } -}; - -class TransDescriptor -{ - int sz, pv, pr; -}; - -class RemoteConnection -{ -public: - int nsnd; - int nrcv; - bool remotePushReady; - bool remotePullDone; - int nrp, nsp; - omp_lock_t *sndLock, *rcvLock, *ghostLock; - int firingRuleCnt; - RemoteCommDescriptor *snd; - RemoteCommDescriptor *rcv; - TransDescriptor *str; - TransDescriptor *rtr; - RemoteConnection() : - nsnd(0), - nrcv(0), - remotePushReady(false), - remotePullDone(false), - nrp(0), nsp(0), - firingRuleCnt(0), - snd(NULL), rcv(NULL), - str(NULL), rtr(NULL) - { - sndLock = new omp_lock_t; - rcvLock = new omp_lock_t; - ghostLock = new omp_lock_t; - omp_init_lock(sndLock); - omp_init_lock(rcvLock); - omp_init_lock(ghostLock); - } - - ~RemoteConnection() - { - if(snd) - delete[] snd; - if(rcv) - delete[] rcv; - delete sndLock; - delete rcvLock; - delete ghostLock; - } -}; - - -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/WorkerThread.H b/Src/AmrTask/rts_impls/mpi_omp/WorkerThread.H deleted file mode 100755 index 449ab0f096e..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/WorkerThread.H +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef P_WORKERTHREAD_H -#define P_WORKERTHREAD_H - -#include -#include -#include - -namespace perilla{ - - class WorkerThread - { - static void* team_shared_memory[perilla::NUM_THREAD_TEAMS]; - int tid; - public: - static void init(); - static Barrier *globalBarrier; - static Barrier localBarriers[perilla::NUM_THREAD_TEAMS]; - static int perilla_tid(); - static int perilla_wtid(); - static int perilla_wid(); - static int perilla_nWorkerThreads(); - static int perilla_nWorkers(); - static int perilla_nTeamThreads(); - static bool perilla_isMasterWorkerThread(); - static bool perilla_isMasterThread(); - static bool perilla_isCommunicationThread(); - static bool isMyRegion(int workerID, int regionID); - static void setTeamSharedMemory(void* dummy, int tid, int tg); - static void* getTeamSharedMemory(int tg); - static void syncWorkers(); - static void syncThreads(); - static void syncComputeThreads(); - static void syncTeamThreads(); - static void syncAllThreads(); - static void syncAllComputeThreads(); - static void syncWorkerThreads(); - static void syncWorkerThreads(int numthreads); - static void syncComputeWorkerThreads(); - static void syncComputeWorkerThreads(int numthreads); - }; - - static int tid(){return WorkerThread::perilla_tid();} - static int wtid(){return WorkerThread::perilla_wtid();} - static int wid(){return WorkerThread::perilla_wid();} - static int 
nWorkerThreads(){return WorkerThread::perilla_nWorkerThreads();} - static int nWorkers(){return WorkerThread::perilla_nWorkers();} - static int nThreads(){return perilla::NUM_THREAD_TEAMS*perilla::NUM_THREADS_PER_TEAM;} - static bool isMasterWorkerThread(){return WorkerThread::perilla_isMasterWorkerThread();} - static bool isMasterThread(){return WorkerThread::perilla_isMasterThread();} - static bool isCommunicationThread(){return WorkerThread::perilla_isCommunicationThread();} - static bool isMyRegion(int workerID, int regionID){return WorkerThread::isMyRegion(workerID, regionID);} - static void setTeamSharedMemory(void* dummy, int tid, int tg){WorkerThread::setTeamSharedMemory(dummy, tid, tg);} - static void* getTeamSharedMemory(int tg){WorkerThread::getTeamSharedMemory(tg);} - static void syncWorkers(){WorkerThread::syncWorkers();} - static void syncThreads(){WorkerThread::syncThreads();} - static void syncComputeThreads(){WorkerThread::syncComputeThreads();} - static void syncComputeWorkerThreads(){WorkerThread::syncComputeWorkerThreads();} - static void syncComputeWorkerThreads(int numthreads){WorkerThread::syncComputeWorkerThreads(numthreads);} - static void syncWorkerThreads(){WorkerThread::syncWorkerThreads();} - static void syncWorkerThreads(int numthreads){WorkerThread::syncWorkerThreads(numthreads);} - static void syncAllWorkerThreads(){ - perilla::syncWorkerThreads(); - perilla::syncWorkers(); - } - static void syncAllComputeThreads(){ - perilla::syncAllComputeThreads(); - } - static void syncAllThreads(){ - WorkerThread::syncAllThreads(); - } -}//end namespace - -#endif diff --git a/Src/AmrTask/rts_impls/mpi_omp/WorkerThread.cpp b/Src/AmrTask/rts_impls/mpi_omp/WorkerThread.cpp deleted file mode 100755 index 5609740ef3a..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/WorkerThread.cpp +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include -#include - -namespace perilla -{ - void* WorkerThread::team_shared_memory[perilla::NUM_THREAD_TEAMS]; - Barrier* WorkerThread::globalBarrier; - Barrier WorkerThread::localBarriers[perilla::NUM_THREAD_TEAMS]; - - void WorkerThread::init(){ - WorkerThread::globalBarrier= new Barrier(perilla::NUM_THREAD_TEAMS); - } - - void WorkerThread::syncWorkers(){ - if(isMasterWorkerThread()) WorkerThread::globalBarrier->sync(perilla::NUM_THREAD_TEAMS); - } - - void WorkerThread::syncTeamThreads(){ - WorkerThread::localBarriers[perilla_wid()].sync(perilla::NUM_THREADS_PER_TEAM); - } - - void WorkerThread::syncWorkerThreads(){ - WorkerThread::localBarriers[perilla_wid()].sync(perilla::NUM_THREADS_PER_TEAM-1); - } - void WorkerThread::syncWorkerThreads(int numthreads){ - WorkerThread::localBarriers[perilla_wid()].sync(numthreads); - } - - void WorkerThread::syncAllComputeThreads(){ - syncWorkerThreads(); - syncWorkers(); - } - - void WorkerThread::syncAllThreads(){ - #pragma omp barrier - } - - void WorkerThread::syncThreads(){ - syncWorkerThreads(); - syncWorkers; - } - - void WorkerThread::syncComputeWorkerThreads(){ - WorkerThread::localBarriers[perilla_wid()].sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - void WorkerThread::syncComputeWorkerThreads(int numthreads){ - WorkerThread::localBarriers[perilla_wid()].sync(numthreads); - } - - int WorkerThread::perilla_tid(){ - return omp_get_thread_num(); - } - - int WorkerThread::perilla_nTeamThreads(){ - return perilla::NUM_THREADS_PER_TEAM; - } - - int WorkerThread::perilla_nWorkerThreads(){ - return perilla::NUM_THREADS_PER_TEAM-1; - } - int WorkerThread::perilla_nWorkers(){ - return 
perilla::NUM_THREAD_TEAMS; - } - - int WorkerThread::perilla_wtid() - { - int tid= omp_get_thread_num(); - return (tid % perilla::NUM_THREADS_PER_TEAM) -1; - } - - int WorkerThread::perilla_wid() - { - int tid= omp_get_thread_num(); - return tid / perilla::NUM_THREADS_PER_TEAM; - } - - bool WorkerThread::perilla_isMasterWorkerThread() - { - int tid= omp_get_thread_num(); - if((tid % perilla::NUM_THREADS_PER_TEAM)==1) - return true; - else - return false; - } - - bool WorkerThread::perilla_isMasterThread(){ - return perilla_tid()==1; - } - - bool WorkerThread::perilla_isCommunicationThread() - { - int tid= omp_get_thread_num(); - return (tid % perilla::NUM_THREADS_PER_TEAM)==0 ; - } - - bool WorkerThread::isMyRegion(int workerID, int regionID) - { - return ((regionID) % perilla::NUM_THREAD_TEAMS)==workerID; - } - - void WorkerThread::setTeamSharedMemory(void* dummy, int tid, int tg) - { - if((tid % perilla::NUM_THREADS_PER_TEAM)==1) - team_shared_memory[tg] = dummy; - } - - void* WorkerThread::getTeamSharedMemory(int tg) - { - return team_shared_memory[tg]; - } -}//end namepsace diff --git a/Src/AmrTask/rts_impls/mpi_omp/perilla.mak b/Src/AmrTask/rts_impls/mpi_omp/perilla.mak deleted file mode 100755 index 561fa15801f..00000000000 --- a/Src/AmrTask/rts_impls/mpi_omp/perilla.mak +++ /dev/null @@ -1,22 +0,0 @@ -CEXE_sources += Barrier.cpp -CEXE_sources += RGIter.cpp -CEXE_sources += PackageQueue.cpp -CEXE_sources += Perilla.cpp -CEXE_sources += RegionGraph.cpp -CEXE_sources += RegionQueue.cpp -CEXE_sources += WorkerThread.cpp -CEXE_sources += AsyncMultiFabUtil.cpp - - -CEXE_headers += Barrier.H -CEXE_headers += Config.H -CEXE_headers += LocalConnection.H -CEXE_headers += PackageQueue.H -CEXE_headers += RegionGraph.H -CEXE_headers += RegionQueue.H -CEXE_headers += RemoteConnection.H -CEXE_headers += WorkerThread.H -CEXE_headers += AsyncMultiFabUtil.H - - - diff --git a/Src/AmrTask/rts_impls/runtime_common/AsyncMultiFabUtil.H b/Src/AmrTask/rts_impls/runtime_common/AsyncMultiFabUtil.H deleted file mode 100755 index bf23f017626..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/AsyncMultiFabUtil.H +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _AsyncMultiFabUtil_H_ -#define _AsyncMultiFabUtil_H_ - -#include -#include -#include -#include - -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const int ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fine_geom, const Geometry& crse_geom, int scomp, int ncomp, const int ratio, int f); - -// Average fine cell-based MultiFab onto crse cell-centered MultiFab without volume weighting. -// This routine DOES NOT assume that the crse BoxArray is a coarsened version of the fine BoxArray. 
- -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_push(Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, int ratio, int f); - -void average_down_pull(MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, int ratio, int f); - - -void average_down_push (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, MultiFab* crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - amrex::Geometry&, amrex::Geometry&,int scomp, int ncomp, const IntVect& ratio, int f); - -void average_down_pull (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, - amrex::Geometry&, amrex::Geometry&, int scomp, int ncomp, const IntVect& ratio, int f); -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/AsyncMultiFabUtil.cpp b/Src/AmrTask/rts_impls/runtime_common/AsyncMultiFabUtil.cpp deleted file mode 100755 index 1ced35e1afe..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/AsyncMultiFabUtil.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include -//#include -#include -#include -#include -#include - -using namespace amrex; -using namespace perilla; - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, int rr, int f) -{ - average_down_push(amr,S_fine,S_crse,crse_S_fine,RG_fine,RG_crse,fgeom,cgeom,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} - -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, - const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, int rr, int f) -{ - average_down_pull(S_fine,S_crse,RG_fine,RG_crse,fgeom,cgeom,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - const Geometry& fgeom, const Geometry& cgeom, int scomp, int ncomp, const IntVect& ratio, int f) -{ - if (S_fine.is_nodal() || S_crse.is_nodal()) - { - amrex::Error("Can't use amrex::average_down for nodal MultiFab!"); - } - -#if (BL_SPACEDIM == 3) - average_down_push(amr, S_fine, S_crse, crse_S_fine, RG_fine, RG_crse, scomp, ncomp, ratio, f); - return; -#else - - assert(S_crse.nComp() == S_fine.nComp()); - - - MultiFab fvolume; - fgeom.GetVolume(fvolume, fine_BA, 0); - - int lfi = crse_S_fine.IndexArray()[f]; - const Box& tbx = crse_S_fine[ lfi ].box(); - - amrex_avgdown_with_vol(tbx,crse_S_fine[lfi].array(),S_fine[lfi].array(),fvolume[lfi].array(), - 0,scomp,ncomp,ratio); - - Perilla::multifabCopyPushAsync(RG_crse, RG_fine, &S_crse, &crse_S_fine, f, scomp, 0, ncomp, 0, 0, false); -#endif -} - -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, const Geometry& fgeom, const Geometry& cgeom, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - - if (S_fine.is_nodal() || S_crse.is_nodal()) - { - amrex::Error("Can't use amrex::average_down for nodal MultiFab!"); - } - -#if (BL_SPACEDIM == 3) - average_down_pull(S_fine, S_crse, RG_fine, RG_crse, scomp, 
ncomp, ratio, f); - return; -#else - assert(S_crse.nComp() == S_fine.nComp()); - Perilla::multifabCopyPull(RG_crse, RG_fine, &S_crse, &S_fine, f, scomp, 0, ncomp, 0, 0, false); -#endif -} - -// ************************************************************************************************************* - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - int scomp, int ncomp, int rr, int f) -{ - average_down_push(amr,S_fine,S_crse,crse_S_fine,RG_fine,RG_crse,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} - -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, int scomp, int ncomp, int rr, int f) -{ - average_down_pull(S_fine,S_crse,RG_fine,RG_crse,scomp,ncomp,rr*IntVect::TheUnitVector(),f); -} - -void average_down_push (Amr& amr, MultiFab& S_fine, MultiFab& S_crse, MultiFab& crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - assert(S_crse.nComp() == S_fine.nComp()); - - // NOTE: The tilebox is defined at the coarse level. - int lfi = crse_S_fine.IndexArray()[f]; - int tg = WorkerThread::perilla_wid(); - int nt = WorkerThread::perilla_wtid(); - - for(int t=0; tfabTiles[f]->numTiles; t++) - if(t % (perilla::NUM_THREADS_PER_TEAM-1) == nt) - { - const Box& tbx = *(RG_fine->fabTiles[f]->tileBx[t]); - amrex_avgdown(tbx,crse_S_fine[lfi].array(),S_fine[lfi].array(),0,scomp,ncomp,ratio); - } - RG_fine->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - Perilla::multifabCopyPushAsync(RG_crse, RG_fine, &S_crse, &crse_S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - -void average_down_pull (MultiFab& S_fine, MultiFab& S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - assert(S_crse.nComp() == S_fine.nComp()); - Perilla::multifabCopyPull(RG_crse, RG_fine, &S_crse, &S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - - -void average_down_push (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, MultiFab* crse_S_fine, RegionGraph* RG_fine, RegionGraph* RG_crse,amrex::Geometry& geom, amrex::Geometry& geom1, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - if(rgi.currentItr != rgi.totalItr) - return; - int tg = WorkerThread::perilla_wid(); - - f = rgi.currentRegion; - // NOTE: The tilebox is defined at the coarse level. - int lfi = crse_S_fine->IndexArray()[f]; - - // NOTE: We copy from component scomp of the fine fab into component 0 of the crse fab - // because the crse fab is a temporary which was made starting at comp 0, it is - // not part of the actual crse multifab which came in. 
- - //perilla::syncWorkerThreads(); - RG_fine->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - int nThreads= perilla::nWorkerThreads(); - for(int t=0; tfabTiles[f]->numTiles; t+= nThreads) - { - const Box& tbx = *(RG_fine->fabTiles[f]->tileBx[t]); - amrex_avgdown(tbx,(*crse_S_fine)[lfi].array(),(*S_fine)[lfi].array(),0,scomp,ncomp,ratio); - } - //perilla::syncWorkerThreads(); - RG_fine->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - Perilla::multifabCopyPush(RG_crse, RG_fine, S_crse, crse_S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - -void average_down_pull (RGIter& rgi, MultiFab* S_fine, MultiFab* S_crse, RegionGraph* RG_fine, RegionGraph* RG_crse, amrex::Geometry& geom, amrex::Geometry& geom1, - int scomp, int ncomp, const IntVect& ratio, int f) -{ - if(rgi.currentItr != 1) - return; - f = rgi.currentRegion; - - Perilla::multifabCopyPull(RG_crse, RG_fine, S_crse, S_fine, f, scomp, 0, ncomp, 0, 0, false); -} - - -#if 0 -#include "PerillaMemCheck.H" - -void PerillaMemCheck::add(string key, void* obj, string classname) -{ - lock.lock(); - if(objMap.find(key) == objMap.end()) - { - objMap[key]= obj; - printf("Adding an object\n"); - } - else{ - printf("Reinsert an object\n"); - exit(0); - } - lock.unlock(); -} - - -void PerillaMemCheck::remove(string key){ - lock.lock(); - if(objMap.find(key) != objMap.end()) - { - objMap.erase(key); - printf("Removing an object\n"); - } - else{ - printf("Object not found\n"); - exit(0); - } - - lock.unlock(); -} -void PerillaMemCheck::report(){ - if(objMap.size()) { - printf("Memory leak found\n"); - }else printf("all packages deallocated\n"); -} - - -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/Barrier.H b/Src/AmrTask/rts_impls/runtime_common/Barrier.H deleted file mode 100755 index 1ceb5caa579..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/Barrier.H +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef P_BARRIER_H -#define P_BARRIER_H -#include - -class Barrier -{ -private: - volatile int counter; - int maxThreads; - pthread_cond_t condition; - pthread_mutex_t condition_mutex; - volatile bool globalSense; -public: - Barrier(); - Barrier(int maxThreads); - void init(int maxThreads); - void sync(); - void sync(int numthreads); -}; -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/Barrier.cpp b/Src/AmrTask/rts_impls/runtime_common/Barrier.cpp deleted file mode 100755 index 9f905943acf..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/Barrier.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include "Barrier.H" -#include -#include -#include - -Barrier::Barrier() -{ - //With this intializer, numthreads has to be specified when syncing, i.e. 
sync(numthreads) - counter = INT_MAX; - maxThreads=INT_MAX; - condition= PTHREAD_COND_INITIALIZER; - condition_mutex= PTHREAD_MUTEX_INITIALIZER; - globalSense = false; -} - -Barrier::Barrier(int numthreads) -{ -//With this initializer, both sync() and sync(numthreads) can be used - counter = numthreads; - maxThreads= numthreads; - condition= PTHREAD_COND_INITIALIZER; - condition_mutex= PTHREAD_MUTEX_INITIALIZER; - globalSense = false; -} - -void Barrier::init(int numthreads) -{ -//Similar to Barrier(int numthreads) - counter = numthreads; - maxThreads= numthreads; - condition= PTHREAD_COND_INITIALIZER; - condition_mutex= PTHREAD_MUTEX_INITIALIZER; - globalSense = false; -} - -void Barrier::sync() //sync all threads associated with this barrier -{ - assert(maxThreads -#include -#include -#include - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include "PerillaMemCheck.H" -extern PerillaMemCheck memcheck; -#endif - -using namespace perilla; -using namespace amrex; - -class LocalCopyDescriptor -{ - public: - int ns; //Source box in layout - int nd; //Destination box in layout - int sz; - Box sbx; //Sub-box for this copy - Box dbx; //Sub-box for this copy - PackageQueue pQueue; //store incoming or outgoing messages, both fab and the runtime can access this queue - PackageQueue recycleQueue; //just for now, I'll replace this with a NUMA aware package allocator - int sPartner, dPartner; - int dcpyCnt,scpyCnt; - pthread_mutex_t ghostLock; - LocalCopyDescriptor() : ns(-1), nd(-1), scpyCnt(0), dcpyCnt(0), sz(0), sPartner(-1), dPartner(-1), ghostLock(PTHREAD_MUTEX_INITIALIZER) - { -#ifdef PERILLA_DEBUG -// memcheck.add(memcheck.genKey(this), (void*)this, "LocalCopyDescriptor"); -#endif - } - ~LocalCopyDescriptor(){ -#ifdef PERILLA_DEBUG -// memcheck.remove(memcheck.genKey(this)); -#endif - } -}; - -class LocalConnection -{ - public: - int nscpy; //Number of cpy chunks - int ndcpy; //Number of cpy chunks - pthread_mutex_t sLock, dLock, ghostLock; - int firingRuleCnt; - int scpyCnt, dcpyCnt; - Barrier *localBarrier; - LocalCopyDescriptor *scpy; - LocalCopyDescriptor *dcpy; - LocalConnection() : nscpy(0), ndcpy(0), firingRuleCnt(0), scpy(NULL), dcpy(NULL), scpyCnt(0), dcpyCnt(0), localBarrier(NULL),sLock(PTHREAD_MUTEX_INITIALIZER),dLock(PTHREAD_MUTEX_INITIALIZER),ghostLock(PTHREAD_MUTEX_INITIALIZER){ -// memcheck.add(memcheck.genKey(this), (void*)this, "LocalCopyDescriptor"); -} - - ~LocalConnection() - { - if(localBarrier) free(localBarrier); - if(scpy) delete [] scpy; - if(dcpy) delete [] dcpy; -// memcheck.remove(memcheck.genKey(this)); - } -}; -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/Make.package b/Src/AmrTask/rts_impls/runtime_common/Make.package deleted file mode 100644 index 137eb3063d5..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/Make.package +++ /dev/null @@ -1,8 +0,0 @@ -PERILLA_LIB=EXE - -C$(PERILLA_LIB)_sources += Barrier.cpp RGIter.cpp RegionQueue.cpp RegionGraph.cpp WorkerThread.cpp AsyncMultiFabUtil.cpp AsyncMultiFabUtil.cpp PerillaMemCheck.cpp Perilla_common.cpp - -C$(PERILLA_LIB)_headers += Barrier.H LocalConnection.H RGIter.H RegionQueue.H RemoteConnection.H WorkerThread.H RegionGraph.H AsyncMultiFabUtil.H PerillaMemCheck.H Perilla.H - -VPATH_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common -INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common diff --git a/Src/AmrTask/rts_impls/runtime_common/PerillaMemCheck.H b/Src/AmrTask/rts_impls/runtime_common/PerillaMemCheck.H deleted file mode 100644 index 
f624aaceb7c..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/PerillaMemCheck.H +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef P_MEMCHECK_H -#define P_MEMCHECK_H -#include -#include -#include -#include "mylock.h" -using namespace std; - -namespace perilla{ - - class PerillaMemCheck - { - public: - void add(string key, void* obj, string classname); - void remove(string key); - string genKey(void *obj){ - std::stringstream ost; - ost< objMap; - int addCnt, rmCnt; - }; -}//end namespace - -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/PerillaMemCheck.cpp b/Src/AmrTask/rts_impls/runtime_common/PerillaMemCheck.cpp deleted file mode 100644 index e77886eaa02..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/PerillaMemCheck.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "PerillaMemCheck.H" -using namespace perilla; - -void PerillaMemCheck::add(string key, void* obj, string classname) -{ - lock.lock(); - if(objMap.find(key) == objMap.end()) - { - objMap[key]= obj; - addCnt++; - } - else{ - printf("MemCheck Error: Reinsert an object\n"); - exit(0); - } - lock.unlock(); -} - - -void PerillaMemCheck::remove(string key){ - lock.lock(); - if(objMap.find(key) != objMap.end()) - { - objMap.erase(key); - rmCnt++; - } - else{ - printf("MemCheck Error: Object not found (%d Allocated vs %d Deleted)\n", addCnt, rmCnt); - exit(0); - } - - lock.unlock(); -} -void PerillaMemCheck::report(){ - if(objMap.size()) { - printf("Memory leak found: %d objects (%d Allocated vs %d Deleted)\n", objMap.size(), addCnt, rmCnt); - }else printf("All allocated objects have been deallocated (%d Allocated vs %d Deleted)\n", addCnt, rmCnt); -} - - diff --git a/Src/AmrTask/rts_impls/runtime_common/Perilla_common.cpp b/Src/AmrTask/rts_impls/runtime_common/Perilla_common.cpp deleted file mode 100755 index bf2de14eec3..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/Perilla_common.cpp +++ /dev/null @@ -1,3429 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; -using namespace amrex; -using namespace perilla; - - -volatile int Perilla::numTeamsFinished = 0; -volatile int Perilla::updateMetadata_request = 0; -volatile int Perilla::updateMetadata_noticed = 0; -volatile int Perilla::updateMetadata_done = 0; -int Perilla::max_step=1; -std::map> Perilla::pTagCnt; -int Perilla::uTags=0; -bool Perilla::genTags=true; -std::map > > > > Perilla::tagMap; -std::map > > > Perilla::myTagMap; - -pthread_mutex_t table_lock= PTHREAD_MUTEX_INITIALIZER; -std::map tidTable; -void Perilla::registerId(int tid){ - pthread_mutex_lock(&table_lock); - tidTable[pthread_self()]= tid; - pthread_mutex_unlock(&table_lock); -} - -int Perilla::tid(){//this function can be called after all threads already register their ids - #ifdef USE_PERILLA_ON_DEMAND - return omp_get_thread_num(); - #else - return tidTable[pthread_self()]; - #endif -} - -void Perilla::clearTagMap(){ - Perilla::tagMap.clear(); -} - -void Perilla::clearMyTagMap(){ - Perilla::myTagMap.clear(); -} - - -void Perilla::flattenGraphHierarchy(std::vector > graphArrayHierarchy, std::vector &graphArray){ - graphArray.clear(); - int gCnt=0; - for(int l=0; l tags_t; - typedef std::map> stags_t; - typedef std::map>> dstags_t; - typedef std::map>>> gdstags_t; - typedef std::map>>>> pgdstags_t; - - int** tags = new int*[nPs]; - int** rtags = new int*[nPs]; - int* rTagCnt = new int[nPs*2]; - int* sTagCnt = new int[nPs*2]; - - MPI_Request *srrequest; - srrequest = new MPI_Request[nPs]; - 
MPI_Request *ssrequest; - ssrequest = new MPI_Request[nPs]; - MPI_Request *trrequest; - trrequest = new MPI_Request[nPs]; - MPI_Request *tsrequest; - tsrequest = new MPI_Request[nPs]; - - std::vector proc_communicated; - - proc_communicated.resize(nPs); - for(int p=0; psecond.begin(); it2 != it1->second.end(); it2++) - { - tac++; - tac++; - ng++; - for(dstags_t::iterator it3 = it2->second.begin(); it3 != it2->second.end(); it3++) - for(stags_t::iterator it4 = it3->second.begin(); it4 != it3->second.end(); it4++) - for(tags_t::iterator it5 = it4->second.begin(); it5 != it4->second.end(); it5++) - { - tac+=4; - } - } - sTagCnt[it1->first*2] = tac; - sTagCnt[it1->first*2+1] = ng; - tags[it1->first] = new int[sTagCnt[it1->first*2]]; - MPI_Isend(&sTagCnt[it1->first*2], 2, MPI_INT, it1->first, 1000, MPI_COMM_WORLD, &ssrequest[it1->first]); - proc_communicated[it1->first]=true; - } - - for(int p=0; psecond.begin(); it2 != it1->second.end(); it2++) - { - tags[it1->first][tac++] = it2->first; - tags[it1->first][tac++] = pTagCnt[it1->first][it2->first]; - int gtagc = 0; - for(dstags_t::iterator it3 = it2->second.begin(); it3 != it2->second.end(); it3++) - for(stags_t::iterator it4 = it3->second.begin(); it4 != it3->second.end(); it4++) - for(tags_t::iterator it5 = it4->second.begin(); it5 != it4->second.end(); it5++) - { - tags[it1->first][tac++] = it3->first; - tags[it1->first][tac++] = it4->first; - tags[it1->first][tac++] = it5->first; - tags[it1->first][tac++] = it5->second; - gtagc++; - } - BL_ASSERT(pTagCnt[it1->first][it2->first] == gtagc); - } - MPI_Isend(tags[it1->first], tac, MPI_INT, it1->first, 1001, MPI_COMM_WORLD, &tsrequest[it1->first]); - } - - MPI_Status status; - for(int p=0; p 0) - { - rtags[p] = new int[rTagCnt[p*2]]; - MPI_Irecv(rtags[p], rTagCnt[p*2], MPI_INT, p , 1001, MPI_COMM_WORLD, &trrequest[p]); - } - } - } - - - // //MPI_Irecv(size) Wait - - - //MPI_recive tags arra - for(int p=0; p 0) - { - MPI_Wait( &trrequest[p], &status ); - int tCnt=0; - for(int g=0; g 0) - { - delete[] rtags[p]; - } - } - - - for(int p=0; psize(); - const int n_snds_mf = TheFB.m_SndTags->size(); - const int n_rcvs_mf = TheFB.m_RcvTags->size(); - - Vector send_cctc; - Vector send_pr; - send_cctc.reserve(n_snds_mf); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheFB.m_SndTags->begin(), - m_End = TheFB.m_SndTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // Not destined to me. 
- { - send_pr.push_back(m_it->first); - send_cctc.push_back(&(m_it->second)); - } - } - - Vector recv_cctc; - Vector recv_pr; - recv_cctc.reserve(n_rcvs_mf); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheFB.m_RcvTags->begin(), - m_End = TheFB.m_RcvTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // I am not the source for this receipt - { - recv_pr.push_back(m_it->first); - recv_cctc.push_back(&(m_it->second)); - } - } - -//#pragma omp parallel shared(rg, mf, numfabs, np, TheFB, recv_cctc, send_cctc) - { - //int tg = omp_get_thread_num(); - int fg; -// if(WorkerThread::perilla_isCommunicationThread()) -//#pragma omp single - { - //bool cc = !mf->is_nodal(); // cc = multifab_cell_centered_q(mf) - //mf->sMap.reserve(numfabs); - //mf->rMap.reserve(numfabs); - //std::cout<< "Allocating sMap and rMap" <alloc_lMap(mf); - rg->alloc_sMap(mf); - rg->alloc_rMap(mf); - } -//#pragma omp barrier - //if(tid==0) - { - //bool cc = !mf->is_nodal(); // cc = multifab_cell_centered_q(mf) - //mf->sMap.reserve(numfabs); - //mf->rMap.reserve(numfabs); -//#pragma omp for - for(int f=0; flMap[f]->l_con.nscpy = 0; - - //for(int i=0; il_con.ncpy; i++) - for(int i=0; il_con.cpy[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == tag.srcIndex) - rg->lMap[f]->l_con.nscpy++; - //if(f == local_index(mf,bxasc->l_con.cpy[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == tag.dstIndex) - rg->lMap[f]->l_con.ndcpy++; - } - /* - if(rg->lMap[f]->l_con.nscpy+rg->lMap[f]->l_con.ndcpy != n_loc_mf) - std::cout<< "Diff in Sum " << rg->lMap[f]->l_con.nscpy << " " <lMap[f]->l_con.ndcpy << " " << n_loc_mf <lMap[f]->l_con.nscpy+rg->lMap[f]->l_con.ndcpy == n_loc_mf); - */ - } - } - } -//#pragma omp barrier - //now we know how many copying segments each fab owns as source and destination allocate memory for metadata -//#pragma omp for - for(int f=0; flMap[f]->l_con.sLock)); - //omp_init_lock(&(rg->lMap[f]->l_con.dLock)); - //omp_init_lock(&(rg->lMap[f]->l_con.ghostLock)); - - //std::cout<< "MF l_con nscpy " <lMap[f]->l_con.nscpy << " ndcpy " << rg->lMap[f]->l_con.ndcpy <lMap[f]->l_con.scpy = new LocalCopyDescriptor[rg->lMap[f]->l_con.nscpy]; - rg->lMap[f]->l_con.dcpy = new LocalCopyDescriptor[rg->lMap[f]->l_con.ndcpy]; - rg->lMap[f]->l_con.scpyCnt = 0; - rg->lMap[f]->l_con.dcpyCnt = 0; - } - } -//#pragma omp barrier - if(np > 1) - { -//#pragma omp for - for(int f=0; flMap[f]->r_con.nrcv = 0; - rg->lMap[f]->r_con.nsnd = 0; - rg->lMap[f]->r_con.firingRuleCnt = 0; - - //for(int i=0; ir_con.nsnd; i++) - for(int i=0; ir_con.snd[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == it->srcIndex) - { - rg->lMap[f]->r_con.nsnd++; - } - } - } - //for(int i=0; ir_con.nrcv; i++) - for(int i=0; ir_con.rcv[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == it->dstIndex) - { - rg->lMap[f]->r_con.nrcv++; - } - } - } - //rg->sMap[f]->r_con.sndLock = new omp_lock_t; - //rg->rMap[f]->r_con.rcvLock = new omp_lock_t; - //omp_init_lock(rg->sMap[f]->r_con.sndLock); - //omp_init_lock(rg->rMap[f]->r_con.rcvLock); - rg->lMap[f]->r_con.snd = new RemoteCommDescriptor[rg->lMap[f]->r_con.nsnd]; - rg->lMap[f]->r_con.rcv = new RemoteCommDescriptor[rg->lMap[f]->r_con.nrcv]; - } - } - // if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - { -//#pragma omp for - for(int f=0; frMap[f]->r_con.nrcv = 0; - rg->sMap[f]->r_con.nsnd = 0; - - //for(int i=0; ir_con.nsnd; i++) - for(int i=0; ir_con.snd[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == it->srcIndex) - { - rg->sMap[f]->r_con.nsnd++; - } - } - } - //for(int i=0; ir_con.nrcv; 
i++) - for(int i=0; ir_con.rcv[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == it->dstIndex) - { - rg->rMap[f]->r_con.nrcv++; - } - } - } - //rg->sMap[f]->r_con.sndLock = new omp_lock_t; - //rg->rMap[f]->r_con.rcvLock = new omp_lock_t; - //omp_init_lock(rg->sMap[f]->r_con.sndLock); - //omp_init_lock(rg->rMap[f]->r_con.rcvLock); - rg->sMap[f]->r_con.snd = new RemoteCommDescriptor[rg->sMap[f]->r_con.nsnd]; - rg->rMap[f]->r_con.rcv = new RemoteCommDescriptor[rg->rMap[f]->r_con.nrcv]; - } - } - } - } // omp parallel - //std::cout<< "counting done " <lMap[f]->l_con.localBarrier = new Barrier(perilla::NUM_THREADS_PER_TEAM-1); - // !create local communication meta data for sources and destinations - scnt = -1; - dcnt = -1; - //for(int i=0; il_con.ncpy; i++) - for(int i=0; il_con.cpy[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == tag.srcIndex) - { - scnt++; - //omp_init_lock(&(rg->lMap[f]->l_con.scpy[scnt].ghostLock)); - rg->lMap[f]->l_con.scpy[scnt].ns = mf.localindex(tag.srcIndex); //local_index(mf,bxasc->l_con.cpy[i].ns); //LocalIndex - rg->lMap[f]->l_con.scpy[scnt].nd = mf.localindex(tag.dstIndex); //local_index(mf,bxasc->l_con.cpy[i].nd); //LocalIndex - rg->lMap[f]->l_con.scpy[scnt].sbx = tag.sbox; //bxasc->l_con.cpy[i].sbx; - rg->lMap[f]->l_con.scpy[scnt].dbx = tag.dbox; //bxasc->l_con.cpy[i].dbx; - // !create queues for ghost cells - //call queue_init(mf%fbs(f)%l_con%scpy(scnt)%pQueue) - //call queue_init(mf%fbs(f)%l_con%scpy(scnt)%recycleQueue) - - int psize = tag.sbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? - /* - p => dataptr(mf%fbs(f), mf%fbs(f)%l_con%scpy(scnt)%sbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - for(int p=0; pdatabuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - rg->lMap[f]->l_con.scpy[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->l_con.scpy[scnt].recycleQueue.enqueue(rg->lMap[f]->l_con.scpy[scnt].pQueue.dequeue()); - } - //if(f == local_index(mf,bxasc->l_con.cpy[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == tag.dstIndex) - { - dcnt++; - rg->lMap[f]->l_con.dcpy[dcnt].ns = mf.localindex(tag.srcIndex); //local_index(mf,bxasc->l_con.cpy[i].ns); //LocalIndex - rg->lMap[f]->l_con.dcpy[dcnt].nd = mf.localindex(tag.dstIndex); //local_index(mf,bxasc->l_con.cpy[i].nd); //LocalIndex - rg->lMap[f]->l_con.dcpy[dcnt].sbx = tag.sbox; //bxasc->l_con.cpy[i].sbx; - rg->lMap[f]->l_con.dcpy[dcnt].dbx = tag.dbox; //bxasc->l_con.cpy[i].dbx; - //call queue_init(mf%fbs(f)%l_con%dcpy(dcnt)%pQueue) - //call queue_init(mf%fbs(f)%l_con%dcpy(dcnt)%recycleQueue) - int psize = tag.dbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? 
- /* - p => dataptr(mf%fbs(f), mf%fbs(f)%l_con%dcpy(dcnt)%dbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - - for(int p=0; pdatabuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - rg->lMap[f]->l_con.dcpy[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->l_con.dcpy[dcnt].recycleQueue.enqueue(rg->lMap[f]->l_con.dcpy[dcnt].pQueue.dequeue()); - } - } // for(ilMap[f]->l_con.nscpy; i++) - for(int j=0; jlMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.ndcpy; j++) - if(rg->lMap[f]->l_con.scpy[i].dbx == rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[j].dbx) - rg->lMap[f]->l_con.scpy[i].dPartner = j; - - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - for(int j=0; jlMap[rg->lMap[f]->l_con.dcpy[i].ns]->l_con.nscpy; j++) - if(rg->lMap[f]->l_con.dcpy[i].dbx == rg->lMap[rg->lMap[f]->l_con.dcpy[i].ns]->l_con.scpy[j].dbx) - rg->lMap[f]->l_con.dcpy[i].sPartner = j; - } - } - if(np == 1) return; - - //std::cout<< "local init done" <lMap[f]->r_con.sndLock = new omp_lock_t; - //rg->lMap[f]->r_con.rcvLock = new omp_lock_t; - //omp_init_lock(rg->lMap[f]->r_con.sndLock); - //omp_init_lock(rg->lMap[f]->r_con.rcvLock); - //rg->lMap[f]->r_con.snd = new RemoteCommDescriptor[rg->lMap[f]->r_con.nsnd]; - //rg->lMap[f]->r_con.rcv = new RemoteCommDescriptor[rg->lMap[f]->r_con.nrcv]; - nrcv= -1; - //for(int i=0; ir_con.nrcv; i++) - for(int i=0; ir_con.rcv[i].nd)) //LocalIndex - if(mf.IndexArray()[f] == it->dstIndex) - { - nrcv++; - rg->lMap[f]->r_con.rcv[nrcv].ns = it->srcIndex; //bxasc->r_con.rcv[i].ns; - //rg->lMap[f]->r_con.rcv[nrcv].lnd = ; //local_index(mf,bxasc->r_con.rcv[i].nd); // not used anywhere so deferred ---------???????? - //rg->lMap[f]->r_con.rcv[nrcv].lns = -1; //undefined - rg->lMap[f]->r_con.rcv[nrcv].nd = it->dstIndex; //bxasc->r_con.rcv[i].nd; - rg->lMap[f]->r_con.rcv[nrcv].lnd = mf.localindex(it->dstIndex); - rg->lMap[f]->r_con.rcv[nrcv].lns = mf.localindex(it->srcIndex); - rg->lMap[f]->r_con.rcv[nrcv].sbx = it->sbox; //bxasc->r_con.rcv[i].sbx; - rg->lMap[f]->r_con.rcv[nrcv].dbx = it->dbox; //bxasc->r_con.rcv[i].dbx; - rg->lMap[f]->r_con.rcv[nrcv].pr = pr; //bxasc->r_con.rcv[i].pr; - rg->lMap[f]->r_con.rcv[nrcv].cnt = 0; - //!create queues for ghost cells - //call queue_init(mf%fbs(f)%r_con%rcv(nrcv)%pQueue) - //call queue_init(mf%fbs(f)%r_con%rcv(nrcv)%recycleQueue) - int psize = it->sbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? - /* - p => dataptr(mf%fbs(f), mf%fbs(f)%r_con%rcv(nrcv)%dbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - rg->lMap[f]->r_con.rcv[nrcv].sz = psize; - for(int p=0; pdatabuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - rg->lMap[f]->r_con.rcv[nrcv].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->r_con.rcv[nrcv].recycleQueue.enqueue(rg->lMap[f]->r_con.rcv[nrcv].pQueue.dequeue()); - } - } - } // for(ir_con.nsnd; i++) - for(int i=0; ir_con.snd[i].ns)) //LocalIndex - if(mf.IndexArray()[f] == it->srcIndex ) - { - nsnd++; - rg->lMap[f]->r_con.snd[nsnd].ns = it->srcIndex; //bxasc->r_con.snd[i].ns; - rg->lMap[f]->r_con.snd[nsnd].nd = it->dstIndex; //bxasc->r_con.snd[i].nd; - //rg->lMap[f]->r_con.snd[nsnd].lns = ; //local_index(mf,bxasc->r_con.snd[i].ns); //not used anywhere so deferred ------????????? 
- //rg->lMap[f]->r_con.snd[nsnd].lnd = -1; //undefined - rg->lMap[f]->r_con.snd[nsnd].lns = mf.localindex(it->srcIndex); - rg->lMap[f]->r_con.snd[nsnd].lnd = mf.localindex(it->dstIndex); - rg->lMap[f]->r_con.snd[nsnd].sbx = it->sbox; //bxasc->r_con.snd[i].sbx; - rg->lMap[f]->r_con.snd[nsnd].dbx = it->dbox; //bxasc->r_con.snd[i].dbx; - rg->lMap[f]->r_con.snd[nsnd].pr = pr; //bxasc->r_con.snd[i].pr; - rg->lMap[f]->r_con.snd[nsnd].cnt = 0; - //!create queues for ghost cells - //call queue_init(mf%fbs(f)%r_con%snd(nsnd)%pQueue) - //call queue_init(mf%fbs(f)%r_con%snd(nsnd)%recycleQueue) - int psize = it->sbox.numPts() * mf.nComp(); //---------------------------------------------------------------???????????????? - /* - p => dataptr(mf%fbs(f), mf%fbs(f)%r_con%snd(nsnd)%sbx, 1, mf%nc) - s1= size(p,1) - s2= size(p,2) - s3= size(p,3) - s4= size(p,4) - s1*s2*s3*s4 - */ - rg->lMap[f]->r_con.snd[nsnd].sz = psize; - for(int p=0; pdatabuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - rg->lMap[f]->r_con.snd[nsnd].pQueue.enqueue(tmpPkg); - } - for(int p=0; plMap[f]->r_con.snd[nsnd].recycleQueue.enqueue(rg->lMap[f]->r_con.snd[nsnd].pQueue.dequeue()); - - //std::cout<< "RQ f "<< f << " i "<< nsnd <lMap[f]->r_con.nsnd; i++) - { - rg->sMap[f]->r_con.snd[i].ns = rg->lMap[f]->r_con.snd[i].ns; - rg->sMap[f]->r_con.snd[i].nd = rg->lMap[f]->r_con.snd[i].nd; - rg->sMap[f]->r_con.snd[i].lns = rg->lMap[f]->r_con.snd[i].lns; - rg->sMap[f]->r_con.snd[i].lnd = rg->lMap[f]->r_con.snd[i].lnd; - rg->sMap[f]->r_con.snd[i].r_gid = rg->graphID-1; - rg->sMap[f]->r_con.snd[i].r_grids = rg->numFabs; - rg->sMap[f]->r_con.snd[i].sbx = rg->lMap[f]->r_con.snd[i].sbx; - rg->sMap[f]->r_con.snd[i].dbx = rg->lMap[f]->r_con.snd[i].dbx; - rg->sMap[f]->r_con.snd[i].pr = rg->lMap[f]->r_con.snd[i].pr; - rg->sMap[f]->r_con.snd[i].sz = rg->lMap[f]->r_con.snd[i].sz; - rg->sMap[f]->r_con.snd[i].cnt = 0; - rg->lMap[f]->r_con.snd[i].cnt = 0; - - for(int p=0; plMap[f]->r_con.snd[i].sz); -#ifdef PERILLA_USE_UPCXX - void* local_ptr= tmpPkg->databuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jlMap[f]->r_con.snd[i].sz; j++) - ((double*)local_ptr)[j]= 0; -#else - for(int j=0; jlMap[f]->r_con.snd[i].sz; j++) - ((double*)tmpPkg->databuf)[j] = 0; -#endif - rg->sMap[f]->r_con.snd[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; psMap[f]->r_con.snd[i].recycleQueue.enqueue(rg->sMap[f]->r_con.snd[i].pQueue.dequeue()); - } - for(int i=0; ilMap[f]->r_con.nrcv; i++) - { - rg->rMap[f]->r_con.rcv[i].ns = rg->lMap[f]->r_con.rcv[i].ns; - rg->rMap[f]->r_con.rcv[i].nd = rg->lMap[f]->r_con.rcv[i].nd; - rg->rMap[f]->r_con.rcv[i].lns = rg->lMap[f]->r_con.rcv[i].lns; - rg->rMap[f]->r_con.rcv[i].lnd = rg->lMap[f]->r_con.rcv[i].lnd; - rg->rMap[f]->r_con.rcv[i].r_gid = rg->graphID-1; - rg->rMap[f]->r_con.rcv[i].r_grids = rg->numFabs; - rg->rMap[f]->r_con.rcv[i].sbx = rg->lMap[f]->r_con.rcv[i].sbx; - rg->rMap[f]->r_con.rcv[i].dbx = rg->lMap[f]->r_con.rcv[i].dbx; - rg->rMap[f]->r_con.rcv[i].pr = rg->lMap[f]->r_con.rcv[i].pr; - rg->rMap[f]->r_con.rcv[i].sz = rg->lMap[f]->r_con.rcv[i].sz; - rg->rMap[f]->r_con.rcv[i].cnt = 0; - rg->lMap[f]->r_con.rcv[i].cnt = 0; - - if(Perilla::genTags) - { - try - { - int rcv_pr = rg->rMap[f]->r_con.rcv[i].pr; - int dstIndex = rg->rMap[f]->r_con.rcv[i].nd; - int srcIndex = rg->rMap[f]->r_con.rcv[i].ns; - int psize = rg->rMap[f]->r_con.rcv[i].sz; - std::map::iterator itr = tagMap[rcv_pr][rg->graphID-1][dstIndex][srcIndex].find(psize); - if( itr != 
tagMap[rcv_pr][rg->graphID-1][dstIndex][srcIndex].end()) - { - //rg->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = itr->second; - } - else - { - tagMap[rcv_pr][rg->graphID-1][dstIndex][srcIndex][psize] = Perilla::uTags++; - //rg->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = Perilla::uTags++; - std::map::iterator itr2 = pTagCnt[rcv_pr].find(rg->graphID-1); - if(itr2 != pTagCnt[rcv_pr].end()) - pTagCnt[rcv_pr][rg->graphID-1] = pTagCnt[rcv_pr][rg->graphID-1] + 1; - else - pTagCnt[rcv_pr][rg->graphID-1] = 1; - } - } - catch(std::exception& e) - { - std::cout <<"Inside tagGeneration gID "<< rg->graphID <<" "<< e.what() << '\n'; - } - } - //tagMap[rcv_pr][rg->graphID][it->dstIndex][it->srcIndex] = pTagCnt[rcv_pr]; - - for(int p=0; plMap[f]->r_con.rcv[i].sz); -#ifdef PERILLA_USE_UPCXX - void* local_ptr= tmpPkg->databuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jlMap[f]->r_con.rcv[i].sz; j++) - ((double*)local_ptr)[j]= 0; -#else - for(int j=0; jlMap[f]->r_con.rcv[i].sz; j++) - ((double*)tmpPkg->databuf)[j] = 0; -#endif - rg->rMap[f]->r_con.rcv[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; prMap[f]->r_con.rcv[i].recycleQueue.enqueue(rg->rMap[f]->r_con.rcv[i].pQueue.dequeue()); - } - } - }// if(tid==0) - - }// omp parallel -}// multifabBuildFabCon - - -#if 0 -void Perilla::serviceLocalRequests(RegionGraph* rg, int tg) -{ - int numfabs = rg->lMap.size(); - - for(int f=0; flMap[f]->l_con.sLock)); - //if(lockSucceeded != 0) // 0-Fail, otherwise-Succeed - { - for(int i=0; ilMap[f]->l_con.nscpy; i++){ - if(rg->lMap[f]->l_con.scpy[i].pQueue.queueSize()>0) - { - pthread_mutex_lock(&(rg->lMap[f]->l_con.sLock)); - assert(doublechecked==false); - Package *sPackage = rg->lMap[f]->l_con.scpy[i].pQueue.dequeue(); - if(perilla::LAZY_PUSH) - { - // Implemetation deffered. Currently not required - } - //if(graph->graphID == 1 && rg->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout<< "Processing gID 1 nd 1 from f " << f << " i " << i << std::endl; - pthread_mutex_lock(&(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - int dPartner = rg->lMap[f]->l_con.scpy[i].dPartner; - - //if(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.queueSize() == 0 ) - if(dPartner == -1) - std::cout<< " Caution rQ size dPrtn "<< rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.ndcpy << " " << dPartner <<" graph ID " <graphID<lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.queueSize() <lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - - //for(int j=0; jbufSize; j++) - //dPackage->databuf[j] = sPackage->databuf[j]; //copy data------------------------------??????????????? - - std::memcpy(dPackage->databuf, sPackage->databuf, dPackage->bufSize * sizeof(double)); - - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.queueSize(true)==1) - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - rg->lMap[f]->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - pthread_mutex_unlock(&(rg->lMap[f]->l_con.sLock)); - } - } - //pthread_mutex_unlock(&(rg->lMap[f]->l_con.sLock)); - }// if(!lock succeedded) - if(perilla::LAZY_PUSH) - { - // Implemetation deffered. 
Currently not required - } - }// if(tg==fg) - }// for(frMap.size(); - - // !we first post send and receive - for(int f=0; frMap[f]->r_con.rcvLock)); - //if(lockSucceeded != 0) - { - //if(pthread_mutex_trylock(&(rg->lMap[f]->r_con.rcvLock)) != 0) - { - for(int i=0; ilMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed - nextsReq = true; - else - { - Package *rearPackage = rg->rMap[f]->r_con.rcv[i].pQueue.getRear(true);//!CHECK THIS POINT LATER - if(rearPackage->completed && rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) //!latest receive request has been completed - nextsReq = true; - else //!expected message is still on the way - nextsReq = false; - } - if(nextsReq) //!take a message from recycle pool and post a receive - { - pthread_mutex_lock(&(rg->rMap[f]->r_con.rcvLock)); - pthread_mutex_lock(&(rg->lMap[f]->r_con.rcvLock)); - int ns = rg->rMap[f]->r_con.rcv[i].ns; - int nd = rg->rMap[f]->r_con.rcv[i].nd; - int lnd = rg->rMap[f]->r_con.rcv[i].lnd; - int r_grids = rg->rMap[f]->r_con.rcv[i].r_grids; - //!create a package to keep track of receive requests - Package *rMetaPackage = rg->rMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - //!extract a package from the recycle pool at the destination NUMA node to buffer incoming data - Package *rPackage = rg->lMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - //tag = tagGen(mf%rMap(f)%r_con%rcv(i)%ns, mf%rMap(f)%r_con%rcv(i)%nd, gid, parallel_nprocs()*nfabs(mf), ngr)---------?????? - //int tag = tagGen(rg->rMap[f]->r_con.rcv[i].ns, rg->rMap[f]->r_con.rcv[i].nd, graphID-1, np*numfabs, nGraphs); - int tag = tagMap[rg->rMap[f]->r_con.rcv[i].pr][graphID][nd][ns][rg->rMap[f]->r_con.rcv[i].sz]; - - rMetaPackage->request = MPI_REQUEST_NULL; - rg->lMap[f]->r_con.rcv[i].pQueue.enqueue(rPackage,true); //!this is not done yet - rg->rMap[f]->r_con.rcv[i].pQueue.enqueue(rMetaPackage,true); //!this is not done yet - //rMetaPackage->request = parallel_irecv_dv(rpackage%ptr%dataBuf,mf%rMap(f)%r_con%rcv(i)%sz, mf%rMap(f)%r_con%rcv(i)%pr, tag) --------- ???? - rMetaPackage->request = ParallelDescriptor::Arecv(rPackage->databuf, - rg->rMap[f]->r_con.rcv[i].sz, - rg->rMap[f]->r_con.rcv[i].pr, tag).req(); // tag == SeqNum in c++ ver - pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - pthread_mutex_unlock(&(rg->rMap[f]->r_con.rcvLock)); - } - } - //pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - }// if(omp_test_lock) - //pthread_mutex_unlock(&(rg->rMap[f]->r_con.rcvLock)); - }// if(lockSucceeded) - }// for(fsMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) == 0) //then !no message has been issued or all send requests have been fulfilled - nextrReq = false; - else - nextrReq = true; - - if(nextrReq) - { - Package *sMetaPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(!sMetaPackage->served) - { - Package *sPackage = rg->lMap[f]->r_con.snd[i].pQueue.getFront(true); - sMetaPackage->completed = false; - sMetaPackage->served = true; - sMetaPackage->request = MPI_REQUEST_NULL; - int ns = rg->sMap[f]->r_con.snd[i].ns; - int nd = rg->sMap[f]->r_con.snd[i].nd; - int r_gid = rg->sMap[f]->r_con.snd[i].r_gid; - int r_grids = rg->sMap[f]->r_con.snd[i].r_grids; - //tag = tagGen(mf%sMap(f)%r_con%snd(i)%ns, mf%sMap(f)%r_con%snd(i)%nd, gid, parallel_nprocs()*nfabs(mf), ngr) -??????? 
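Both the receive posted above and the matching send must derive the same MPI tag, so the code keeps a nested tagMap keyed by (remote rank, graph, destination FAB, source FAB, message size) and mints a fresh tag from a shared counter on first use. A compact sketch of that registry; the type alias and the counter argument are assumptions, not the original declarations:

#include <map>

// Hedged sketch of the nested tag registry used above:
// tagMap[proc][graph][dstFab][srcFab][size] -> unique MPI tag.
using TagMap = std::map<int, std::map<int, std::map<int, std::map<int, std::map<int,int>>>>>;

int lookup_or_assign_tag(TagMap& tagMap, int& nextTag,
                         int pr, int gid, int dst, int src, int psize)
{
    auto& bySize = tagMap[pr][gid][dst][src];
    auto it = bySize.find(psize);
    if (it != bySize.end()) return it->second; // both endpoints rediscover the same tag
    bySize[psize] = nextTag;                   // first encounter: mint a new tag
    return nextTag++;
}

Because both ranks walk the same copy tags in the same order, each side reproduces the identical tag assignment without any extra communication.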
- //int tag = tagGen(rg->sMap[f]->r_con.snd[i].ns, rg->sMap[f]->r_con.snd[i].nd, graphID-1, np*numfabs, nGraphs); - int tag = Perilla::myTagMap[r_gid][nd][ns][rg->sMap[f]->r_con.snd[i].sz]; - //int tag = myTagMap[graphID-1][rg->sMap[f]->r_con.snd[i].nd][rg->sMap[f]->r_con.snd[i].ns]; - //sMetaPackage%ptr%request = parallel_isend_dv(spackage%ptr%dataBuf,mf%sMap(f)%r_con%snd(i)%sz, mf%sMap(f)%r_con%snd(i)%pr, tag) --????? - sMetaPackage->request = ParallelDescriptor::Asend(sPackage->databuf, - rg->sMap[f]->r_con.snd[i].sz, - rg->sMap[f]->r_con.snd[i].pr, tag).req(); // tag == SeqNum in c++ ver - } - } - } // for(irMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) > 0) //!all messages before rear have completed - { - //if(pthread_mutex_trylock(&(rg->lMap[f]->r_con.rcvLock)) != 0) // 0-Fail, otherwise-Succeed - { - Package *rearPackage = rg->rMap[f]->r_con.rcv[i].pQueue.getRear(true); - if(!rearPackage->completed) - { - bool flag = false; - int ret_flag; - MPI_Status status; - - std::cout<< "myP "<< myProc << " f "<< f << " i "<< i<< " Req "<request << std::endl; - - ParallelDescriptor::Test(rearPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true;//parallel_test_one(rearPackage%ptr%request) -------??????? - if(flag) - { - pthread_mutex_lock(&(rg->lMap[f]->r_con.rcvLock)); - rearPackage->completeRequest(); - rg->lMap[f]->r_con.rcv[i].pQueue.getRear()->completeRequest(); - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) - rg->lMap[f]->r_con.firingRuleCnt++; - pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - } - } - //pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - } // if(omp_test_lock) - } // if(queueSize > 0) - } // for(ilMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) > 0) - { - Package *frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(frontPackage->served && !frontPackage->completed) //!latest receive request has NOT been completed - { - bool flag = false; - int ret_flag; - MPI_Status status; - ParallelDescriptor::Test(frontPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true;//parallel_test_one(frontPackage%ptr%request) -------??????? 
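Completion is detected by polling rather than blocking: the service loop calls ParallelDescriptor::Test on the request attached to the rear receive package (or the front send package) and reacts only when the flag comes back nonzero. A bare-MPI sketch of one such polling step, assuming a previously posted nonblocking request:

#include <mpi.h>

// Hedged sketch: poll one outstanding request without blocking.
// Returns true exactly once, when the message has landed.
bool poll_request(MPI_Request& req, int& firingRuleCnt)
{
    int done = 0;
    MPI_Status status;
    MPI_Test(&req, &done, &status); // never blocks; done != 0 means complete
    if (done) ++firingRuleCnt;      // one more input ready for the task to fire
    return done != 0;
}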
- if(flag) - { - pthread_mutex_lock(&(rg->sMap[f]->r_con.sndLock)); - frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - rg->sMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(rg->sMap[f]->r_con.sndLock)); - pthread_mutex_lock(&(rg->lMap[f]->r_con.sndLock)); - frontPackage = rg->lMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - rg->lMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(rg->lMap[f]->r_con.sndLock)); - } - } - } // if(queueSize > 0) - } // for(inComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - //if(graph->graphID == 1 && f == 1) - //std::cout << "fillBPush for gID 1 f 1 ntid "<< ntid <lMap[f]->l_con.sLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(perilla::PACKING_FINEGRAIN) - {} - else - { - for(int i=0; ilMap[f]->l_con.nscpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - - //if(graph->graphID == 1 && graph->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout << "fillBPush for gID 1 nd 1 pQenQ f " << f << " i " << i <lMap[f]->l_con.scpy[i].recycleQueue.getFront(true); - mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->l_con.scpy[i].sbx,0,nComp,sPackage->databuf); - - for(int d=0; dbufSize; d++) - if(sPackage->databuf[d] == 0) - { - //std::cout<< "in fbPush Sending 0 from f "<< f <databuf[d] != 0); - } - //if(graph->lMap[f]->l_con.scpy[i].sbx.smallEnd() == graph->lMap[f]->l_con.scpy[i].sbx.bigEnd()) - //if(graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(0)==7 && graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(1)==7 && graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(2)==4) - // std::cout<< "Corner Push for f "<< f << " data0 " <databuf[0]<< " size " <bufSize << " se "<< graph->lMap[f]->l_con.scpy[i].sbx.smallEnd() <worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - //if(graph->graphID == 1 && f == 1) - //std::cout << "fillBPush for gID 1 f 1 pQ enQ" <lMap[f]->l_con.nscpy; i++) - { - //if(graph->graphID == 1 && graph->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout << "fillBPush for gID 1 nd 1 pQ enQ from f "<< f <lMap[f]->l_con.scpy[i].pQueue.enqueue( graph->lMap[f]->l_con.scpy[i].recycleQueue.dequeue(true),true ); - } - pthread_mutex_unlock(&(graph->lMap[f]->l_con.sLock)); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - } // if(LAZY_PUSH) - else - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - pthread_mutex_lock(&(graph->lMap[f]->r_con.sndLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->r_con.nsnd; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1))==ntid) - { - //std::cout << "RQS " << graph->lMap[f]->r_con.snd[i].recycleQueue.queueSize() << std::endl; - - Package *sndPackage = graph->lMap[f]->r_con.snd[i].recycleQueue.dequeue(true); - mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->r_con.snd[i].sbx,0,nComp,sndPackage->databuf); - graph->lMap[f]->r_con.snd[i].pQueue.enqueue( sndPackage,true ); - //!the local message handler will detect the change and notify the remote message handler =>read access - //!the remote message handler 
first modifies the front item of this queue, then it push this item back to the message pool - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - if(ntid==0) - { - pthread_mutex_unlock(&(graph->lMap[f]->r_con.sndLock)); - pthread_mutex_lock(&(graph->sMap[f]->r_con.sndLock)); - for(int i=0; ilMap[f]->r_con.nsnd; i++) - graph->sMap[f]->r_con.snd[i].pQueue.enqueue( graph->sMap[f]->r_con.snd[i].recycleQueue.dequeue(true),true ); - pthread_mutex_unlock(&(graph->sMap[f]->r_con.sndLock)); - } - -} // fillBoundaryPush -#endif - - -#if 0 -void Perilla::fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f, bool singleT) -{ -exit(0); - int nComp = mf->nComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - if(ntid==0) - pthread_mutex_lock(&(graph->lMap[f]->l_con.dLock)); - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(perilla::LAZY_PUSH) - { } - else - { - if(perilla::UNPACKING_FINEGRAIN) - {} - else - { - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *dPackage = graph->lMap[f]->l_con.dcpy[i].pQueue.getFront(true); - - /*for(int d=0; dbufSize; d++) - if(dPackage->databuf[d] == 0) - { - //std::cout<< "in fbPull Reciving 0 for f "<< f <databuf[d] != 0); - }*/ - /* - if(f==0) - //if(graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd() == graph->lMap[f]->l_con.dcpy[i].dbx.bigEnd()) - //if(graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(0)==-1 && graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(1)==-1 && graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(2)==4) - std::cout<< "Corner Pull for f "<< f << " data0 " <databuf[0]<< " size " <bufSize <<" se " <lMap[f]->l_con.dcpy[i].dbx.smallEnd()<m_fabs_v[f]->copyFromMem(graph->lMap[f]->l_con.dcpy[i].dbx,0,nComp,dPackage->databuf); - } - } // if(UNPACKING_FINEGRAIN) - else - } // if(LAZY_PUSH) - else - - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - graph->lMap[f]->l_con.dcpy[i].recycleQueue.enqueue( graph->lMap[f]->l_con.dcpy[i].pQueue.dequeue(true),true ); - - graph->lMap[f]->l_con.firingRuleCnt = graph->lMap[f]->l_con.firingRuleCnt - graph->lMap[f]->l_con.ndcpy; - - graph->lMap[f]->l_con.scpyCnt = 0; - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if(graph->lMap[f]->l_con.dcpy[i].pQueue.queueSize(true) >= 1) - graph->lMap[f]->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(graph->lMap[f]->l_con.dLock)); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - { - pthread_mutex_lock(&(graph->rMap[f]->r_con.rcvLock)); - pthread_mutex_lock(&(graph->lMap[f]->r_con.rcvLock)); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->r_con.nrcv; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *rcvMetaPackage = graph->rMap[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - graph->rMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - Package *rcvPackage = graph->lMap[f]->r_con.rcv[i].pQueue.dequeue(true); - 
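copyToMem and copyFromMem flatten a Box-shaped subregion of a FAB into the package's contiguous databuf, which is why psize is always numPts() times nComp(). A hedged sketch of the packing direction for one component, assuming column-major (Fortran-order) FAB storage; the extent arguments are hypothetical simplifications of AMReX's Box:

// Hedged sketch of what copyToMem does: flatten an (nx x ny x nz)
// subregion starting at (ox,oy,oz) of a larger column-major FAB
// array (fnx x fny planes) into a contiguous message buffer.
void pack_box(const double* fab, int fnx, int fny,
              int ox, int oy, int oz, int nx, int ny, int nz,
              double* buf)
{
    long k = 0;
    for (int z = 0; z < nz; ++z)
        for (int y = 0; y < ny; ++y)
            for (int x = 0; x < nx; ++x)
                buf[k++] = fab[(ox+x) + (long)fnx*((oy+y) + (long)fny*(oz+z))];
}

copyFromMem is the mirror image, scattering the flat buffer back into the ghost region of the destination FAB.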
mf->m_fabs_v[f]->copyFromMem(graph->lMap[f]->r_con.rcv[i].dbx,0,nComp,rcvPackage->databuf); - rcvPackage->completed = false; - graph->lMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - graph->lMap[f]->r_con.firingRuleCnt = graph->lMap[f]->r_con.firingRuleCnt - graph->lMap[f]->r_con.nrcv; - for(int i=0; ilMap[f]->r_con.nrcv; i++) - if(graph->lMap[f]->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(graph->lMap[f]->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - graph->lMap[f]->r_con.firingRuleCnt++; - pthread_mutex_unlock(&(graph->lMap[f]->r_con.rcvLock)); - pthread_mutex_unlock(&(graph->rMap[f]->r_con.rcvLock)); - } - -} // fillBoundaryPull -#endif - - - - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -void Perilla::multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& mfDst, const MultiFab& mfSrc, int nc, int ng, int ngSrc, const Periodicity& period) -{ - // MultiFab* mfSrc = gSrc->assocMF; - // MultiFab* mfDst = gDst->assocMF; - int myProc = ParallelDescriptor::MyProc(); - int np = ParallelDescriptor::NProcs(); - - try{ - - if(true)//if(!(*mfSrc == *mfDst)) - { - if(ng > mfDst.nGrow()) cout <<"MULTIFAB_COPY_C: ng > mfDst.nGrow not supported in parallel copy"<< endl; - if(ngSrc > mfSrc.nGrow()) cout <<"MULTIFAB_COPY_C: ngSrc > mfSrc.nGrow"<< endl; - if(ngSrc > 0) - { - - // To be implemented - //do i = 1, nboxes(msrc%la) - // call push_back(bl, grow(box_nodalize(get_box(msrc%la,i),msrc%nodal),lngsrc)) - //end do - //call build(batmp, bl, sort = .false.) - //call destroy(bl) - //call build(lasrctmp, batmp, boxarray_bbox(batmp), explicit_mapping = get_proc(msrc%la)) - //call destroy(batmp) - //call build(msrctmp, lasrctmp, nc = lnc, ng = 0) - //pmfsrc => msrctmp - } - if(np > 1) - { - if(gSrc->sCopyMapHead == 0) - gSrc->sCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gSrc->sCopyMapHead; - gSrc->sCopyMapHead = tmpCopyMap; - } - if(gDst->rCopyMapHead == 0) - gDst->rCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gDst->rCopyMapHead; - gDst->rCopyMapHead = tmpCopyMap; - } - //gSrc->sCopyMapHead->map.reserve(mfSrc.size()); - //gDst->rCopyMapHead->map.reserve(mfDst.size()); - gSrc->sCopyMapHead->alloc_CopyMap(mfSrc); - gDst->rCopyMapHead->alloc_CopyMap(mfDst); - } - - if(gSrc->numTasks != mfSrc.IndexArray().size()) - std::cout<< "before " <numTasks << " now " <graphID << std::endl; - - gSrc->numFabs = mfSrc.size(); - gDst->numFabs = mfDst.size(); - - gSrc->numTasks = mfSrc.IndexArray().size(); - gDst->numTasks = mfDst.IndexArray().size(); - - int nfabsSrc = mfSrc.IndexArray().size(); - int nfabsDst = mfDst.IndexArray().size(); - - const FabArrayBase::CPC& TheCPC = mfDst.getCPC(IntVect(ng), mfSrc, IntVect(ngSrc), period); - - const int nloc_cpAsc = TheCPC.m_LocTags->size(); - const int nsnds_cpAsc = TheCPC.m_SndTags->size(); - const int nrcvs_cpAsc = TheCPC.m_RcvTags->size(); - - Vector send_cctc; - Vector send_pr; - send_cctc.reserve(nsnds_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC.m_SndTags->begin(), - m_End = TheCPC.m_SndTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // Not destined to me. 
- { - send_pr.push_back(m_it->first); - send_cctc.push_back(&(m_it->second)); - } - } - - // std::cout<< "Loop 1" < recv_cctc; - Vector recv_pr; - recv_cctc.reserve(nrcvs_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC.m_RcvTags->begin(), - m_End = TheCPC.m_RcvTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // I am not the source for this receipt - { - recv_pr.push_back(m_it->first); - recv_cctc.push_back(&(m_it->second)); - } - } - - //std::cout<< "Before parallel at gID " << gDst->graphID << " numTask " << gDst->numTasks << " numFabs " << gDst->numFabs <graphID > 25) - //std::cout<< "Inside parallel Generating Send at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead == 0) - { - gSrc->task[f]->cpAsc_srcHead = new FabCopyAssoc(); - cpSrc = gSrc->task[f]->cpAsc_srcHead; - } - else - { - cpSrc = new FabCopyAssoc(); - cpSrc->next = gSrc->task[f]->cpAsc_srcHead; - gSrc->task[f]->cpAsc_srcHead = cpSrc; - } - - cpSrc->graphPartner = gDst; - cpSrc->l_con.nscpy = 0; - for(int i=0; il_con.nscpy++; - } - cpSrc->l_con.scpy = new LocalCopyDescriptor[cpSrc->l_con.nscpy]; - int scnt = 0; - //if(gDst->graphID == 4 && tag.dstIndex == 60 ) - //std::cout<< "Inside parallel Generating Local Copy send at tid " << tid << " f " << f << " gID " << gDst->graphID <graphID == 4 && (tag.dstIndex == 60 || tag.dstIndex == 59) ) - //std::cout <<"myP " <l_con.scpy[scnt].ns = mfSrc.localindex(tag.srcIndex); - cpSrc->l_con.scpy[scnt].nd = mfDst.localindex(tag.dstIndex); - cpSrc->l_con.scpy[scnt].sbx = tag.sbox; - cpSrc->l_con.scpy[scnt].dbx = tag.dbox; - int psize = tag.sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - //std::cout<< " gSrc ID "<< gSrc->graphID << " f "<databuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - cpSrc->l_con.scpy[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.scpy[scnt].recycleQueue.enqueue(cpSrc->l_con.scpy[scnt].pQueue.dequeue()); - scnt++; - } - } - - if(np > 1) - { - cpSrc->r_con.nsnd = 0; - cpSrc->r_con.remotePushReady = false; - cpSrc->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - cpSrc->r_con.nsnd++; - } - } // for(ir_con.snd = new RemoteCommDescriptor[cpSrc->r_con.nsnd]; - scnt = 0; - for(int i=0; isrcIndex) - { - - //if(gDst->graphID == 17 && (it->srcIndex == 1198 || it->srcIndex == 1198 || it->srcIndex == 978 || it->srcIndex == 978)) - //std::cout <<"myP " <dstIndex << " s "<< it->srcIndex << " f " << f << " i "<< scnt << " tg " <r_con.snd[scnt].ns = it->srcIndex; - cpSrc->r_con.snd[scnt].nd = it->dstIndex; - cpSrc->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - cpSrc->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - cpSrc->r_con.snd[scnt].sbx = it->sbox; - cpSrc->r_con.snd[scnt].dbx = it->dbox; - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? 
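Descriptor arrays such as r_con.snd are sized with a two-pass idiom: one walk over the copy tags counts the matches, the exact-size array is allocated, and a second walk fills it while advancing a cursor (scnt/dcnt in the original). A generic sketch of the pattern with a stand-in Desc type:

#include <algorithm>
#include <vector>

// Hedged sketch of the count-then-fill idiom used for snd/rcv descriptors:
// pass 1 counts matches, pass 2 fills a tightly sized array.
struct Desc { int src, dst; };

std::vector<Desc> build_descriptors(const std::vector<Desc>& tags, int myFab)
{
    int n = (int)std::count_if(tags.begin(), tags.end(),
                               [&](const Desc& t) { return t.src == myFab; });
    std::vector<Desc> out;
    out.reserve(n);                           // exact allocation, as with new Desc[n] above
    for (const Desc& t : tags)
        if (t.src == myFab) out.push_back(t); // scnt++ in the original
    return out;
}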
- - for(int p=0; pdatabuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - cpSrc->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.snd[scnt].recycleQueue.enqueue(cpSrc->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // if(fg==tg) -//#pragma omp barrier - // std::cout<< "Barrier 1" < 1) - { - //if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) -// if(tid==0) - { - - // std::cout<< "Inside parallel Generating Remote Send tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <sCopyMapHead->map[f]->r_con.nsnd = 0; - gSrc->sCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - gSrc->sCopyMapHead->map[f]->r_con.nsnd++; - } - } // for(isCopyMapHead->map[f]->r_con.snd = new RemoteCommDescriptor[gSrc->sCopyMapHead->map[f]->r_con.nsnd]; - int scnt = 0; - for(int i=0; isrcIndex) - { - - //if(gDst->graphID == 31 && (it->dstIndex == 519)) - //std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " i "<< scnt << " tg " <sCopyMapHead->map[f]->r_con.snd[scnt].ns = it->srcIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].nd = it->dstIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_gid = gDst->graphID-1; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sbx = it->sbox; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].dbx = it->dbox; - - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? 
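The psize just computed sizes every buffer in this file: a message covers one box with nComp components, one double per point. A trivial sketch of that arithmetic, with hypothetical extent parameters standing in for Box::numPts():

// Hedged sketch of the repeated psize computation: the number of doubles
// in a message covering an (nx x ny x nz) box with nComp components.
long message_doubles(int nx, int ny, int nz, int nComp)
{
    return (long)nx * ny * nz * nComp;  // box.numPts() * nComp
}

long message_bytes(int nx, int ny, int nz, int nComp)
{
    return message_doubles(nx, ny, nz, nComp) * (long)sizeof(double);
}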
- - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sz = psize; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pr = send_pr[i]; - - for(int p=0; pdatabuf[j] = 0; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; psCopyMapHead->map[f]->r_con.snd[scnt].recycleQueue.enqueue(gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead == 0) - { - gDst->task[f]->cpAsc_dstHead = new FabCopyAssoc(); - cpDst = gDst->task[f]->cpAsc_dstHead; - } - else - { - cpDst = new FabCopyAssoc(); - cpDst->next = gDst->task[f]->cpAsc_dstHead; - gDst->task[f]->cpAsc_dstHead = cpDst; - } - cpDst->graphPartner = gSrc; - cpDst->l_con.ndcpy = 0; - cpDst->l_con.firingRuleCnt = 0; - cpDst->l_con.dcpyCnt = 0; - for(int i=0; il_con.ndcpy++; - } - cpDst->l_con.dcpy = new LocalCopyDescriptor[cpDst->l_con.ndcpy]; - int dcnt = 0; - - //if(gDst->graphID > 25) - //std::cout<< "Inside parallel Generating Local copy recive at tid " << tid << " f " << f << " gID " << gDst->graphID <graphID ==27 && f == 633) - //std::cout<< "tid " << tid << " f " << f << " gID " << gDst->graphID << " numReciv " << nloc_cpAsc << " ndcpy " << cpDst->l_con.ndcpy <graphID == 4 && (tag.dstIndex == 60 || tag.dstIndex == 59)) - //std::cout<< "dcpy tid " << tid << " f " << f << " i " << i << " dcnt " << dcnt << " ns "<l_con.dcpy[dcnt].ns = mfSrc.localindex(tag.srcIndex); - cpDst->l_con.dcpy[dcnt].nd = mfDst.localindex(tag.dstIndex); - cpDst->l_con.dcpy[dcnt].sbx = tag.sbox; - cpDst->l_con.dcpy[dcnt].dbx = tag.dbox; - - int psize = tag.dbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? 
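firingRuleCnt, initialized to zero here, is the dataflow trigger for a task: it is incremented as local copies or remote receives become available and decremented as they are consumed. A hedged sketch of the readiness test this implies, assuming a task fires once each of its ndcpy + nrcv inputs has a queued package (the exact threshold is not spelled out in this excerpt):

// Hedged sketch of the firing rule implied by firingRuleCnt above: a task
// becomes runnable once every local copy and remote receive it consumes
// has at least one completed package queued.
struct Connections {
    int firingRuleCnt = 0; // bumped as inputs arrive, reduced on consumption
    int ndcpy = 0;         // local destination copies feeding this task
    int nrcv  = 0;         // remote receives feeding this task
    bool ready() const { return firingRuleCnt >= ndcpy + nrcv; }
};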
- cpDst->l_con.dcpy[dcnt].sz = psize; - - if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - cpDst->l_con.dcpy[dcnt].pQueue.enqueue(tmpPkg); - } - - for(int p=0; pl_con.dcpy[dcnt].recycleQueue.enqueue(cpDst->l_con.dcpy[dcnt].pQueue.dequeue()); - - } - dcnt++; - } - } - - RegionGraph* depGraph = gDst->srcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; il_con.ndcpy ; i++) - { - for(int p=0; pl_con.dcpy[i].sz; - Package *tmpPkg = new Package(psize); -#ifdef PERILLA_USE_UPCXX - void* local_ptr= tmpPkg->databuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - cpdDst->l_con.dcpy[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.dcpy[i].recycleQueue.enqueue(cpdDst->l_con.dcpy[i].pQueue.dequeue()); - } - } - - if(np > 1) - { - cpDst->r_con.nrcv = 0; - cpDst->r_con.remotePullDone = false; - cpDst->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - cpDst->r_con.nrcv++; - } - } // for(ir_con.rcv = new RemoteCommDescriptor[cpDst->r_con.nrcv]; - dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - cpDst->r_con.rcv[dcnt].nd = it->dstIndex; - cpDst->r_con.rcv[dcnt].ns = it->srcIndex; - cpDst->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - cpDst->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - cpDst->r_con.rcv[dcnt].sbx = it->sbox; - cpDst->r_con.rcv[dcnt].dbx = it->dbox; - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? - cpDst->r_con.rcv[dcnt].sz = psize; - - if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - cpDst->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[dcnt].recycleQueue.enqueue(cpDst->r_con.rcv[dcnt].pQueue.dequeue()); - } - - dcnt++; - } - } - }// for(isrcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; ir_con.nrcv ; i++) - { - for(int p=0; pr_con.rcv[i].sz; - Package *tmpPkg = new Package(psize); -#ifdef PERILLA_USE_UPCXX - void* local_ptr= tmpPkg->databuf.local();//(static_cast >(tmpPkg->databuf)).local(); - for(int j=0; jdatabuf)[j] = 0; -#endif - cpdDst->r_con.rcv[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[i].recycleQueue.enqueue(cpdDst->r_con.rcv[i].pQueue.dequeue()); - } - } - } // if(np > 1) - }// if(fg==tg) - -//#pragma omp barrier - if(np > 1) - { - //if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) -// if(tid==0) - { - - // std::cout<< "Inside parallel Generating Remote Recive tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <rCopyMapHead->map[f]->r_con.nrcv = 0; - gDst->rCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - gDst->rCopyMapHead->map[f]->r_con.nrcv++; - } - } - gDst->rCopyMapHead->map[f]->r_con.rcv = new RemoteCommDescriptor[gDst->rCopyMapHead->map[f]->r_con.nrcv]; - int dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - - // if(myProc==54 && gDst->graphID == 25 && f == 10) - // std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " sgID "<< 
gSrc->graphID <<" tg "<rCopyMapHead->map[f]->r_con.rcv[dcnt].nd = it->dstIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].ns = it->srcIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sbx = it->sbox; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].dbx = it->dbox; - - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? - - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sz = psize; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pr = recv_pr[i]; - - BL_ASSERT(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd == f); - - if(Perilla::genTags) - { - try{ - std::map::iterator itr = tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].find(psize); - if( itr != tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].end()) - { - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = itr->second; - } - else - { - tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex][psize] = Perilla::uTags++; - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = Perilla::uTags++; - std::map::iterator itr2 = pTagCnt[recv_pr[i]].find(gDst->graphID-1); - if(itr2 != pTagCnt[recv_pr[i]].end()) - pTagCnt[recv_pr[i]][gDst->graphID-1] = pTagCnt[recv_pr[i]][gDst->graphID-1] + 1; - else - pTagCnt[recv_pr[i]][gDst->graphID-1] = 1; - } - } - catch(std::exception& e) - { - std::cout <<"Inside tagGeneration gID "<< gDst->graphID <<" "<< e.what() << '\n'; - } - } - //tagMap[recv_pr[i]][gDst->graphID][it->dstIndex][it->srcIndex] = pTagCnt[recv_pr[i]]; - - - for(int p=0; pdatabuf[j] = 0; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; prCopyMapHead->map[f]->r_con.rcv[dcnt].recycleQueue.enqueue(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.dequeue()); - dcnt++; - } - } - } // for(i 1) - //} //if(fg==tg) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Send partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead->l_con.nscpy; i++) - { - int lnd = gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].nd; - for(int j=0; jtask[ lnd ]->cpAsc_dstHead->l_con.ndcpy; j++) - if(gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dbx == gDst->task[ lnd ]->cpAsc_dstHead->l_con.dcpy[j].dbx) - gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dPartner = j; - } - } - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead->l_con.ndcpy; i++) - { - int lns = gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].ns; - for(int j=0; jtask[ lns ]->cpAsc_srcHead->l_con.nscpy; j++) - if(gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].dbx == gSrc->task[ lns ]->cpAsc_srcHead->l_con.scpy[j].dbx) - gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].sPartner = j; - } - } - } // for(fgraphID <<" "<< e.what() << '\n'; -} - - -//std::cout<< "All done safely at gID " << gDst->graphID <assocMF; - //MultiFab* mfSrc = srcGraph->assocMF; - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - //if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - if(true)//if(!(*mfDst == 
*mfSrc)
- {
-     if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl;
-     //if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl;
-     FabCopyAssoc* cpDst = destGraph->task[f]->cpAsc_dstHead;
-     while(cpDst != 0)
-     {
-         if(cpDst->graphPartner == srcGraph)
-             break;
-         cpDst = cpDst->next;
-     }
-     if(cpDst == 0) cout <<"Metadata for across grid copy not found"<< endl;
-     //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-     if(singleT)
-     {
-         pthread_mutex_lock(&(cpDst->l_con.dLock));
-         for(int i=0; i<cpDst->l_con.ndcpy; i++)
-         {
-             Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ
-             mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf);
-         }
-         for(int i=0; i<cpDst->l_con.ndcpy; i++)
-             cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue()); // corrected from pQ to recycleQ and from recycleQ to pQ
-         cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy;
-         pthread_mutex_unlock(&(cpDst->l_con.dLock));
-     }
-     else
-     {
-         if(ntid==0)
-             pthread_mutex_lock(&(cpDst->l_con.dLock));
-         destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-         for(int i=0; i<cpDst->l_con.ndcpy; i++)
-             if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid)
-             {
-                 Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ
-                 mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf);
-             }
-         destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-
-         if(ntid == 0)
-         {
-             for(int i=0; i<cpDst->l_con.ndcpy; i++)
-                 cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue()); // corrected from pQ to recycleQ and from recycleQ to pQ
-             cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy;
-             pthread_mutex_unlock(&(cpDst->l_con.dLock));
-         }
-         destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-     }
-
-     int np = ParallelDescriptor::NProcs();
-     if(np == 1)
-         return;
-
-     if(singleT)
-     {
-         //pthread_mutex_lock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock));
-         pthread_mutex_lock(&(cpDst->r_con.rcvLock));
-         for(int i=0; i<cpDst->r_con.nrcv; i++)
-         {
-             ///*
-             Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ
-             mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf);
-             rcvPackage->completed = false;
-             rcvPackage->served = false;
-             rcvPackage->request = MPI_REQUEST_NULL;
-             cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage, true); // corrected from pQ to recycleQ
-
-             /*
-             Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true);
-             rcvMetaPackage->completed = false;
-             rcvMetaPackage->served = false;
-             rcvMetaPackage->request = MPI_REQUEST_NULL;
-             destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage, true);
-             */
-             //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ
-             //mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf);
-         }
-         cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv;
-
-         cpDst->r_con.remotePullDone = true;
-         ///*
-         for(int i=0; i<cpDst->r_con.nrcv; i++)
-             if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1)
-                 if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest())
-                     cpDst->r_con.firingRuleCnt++;
-         //*/
-         pthread_mutex_unlock(&(cpDst->r_con.rcvLock));
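The singleT path above drains every remote receive under one lock on the calling thread. When a full worker team executes this pull instead (the else branch that follows), iterations are dealt round-robin by worker-thread id and fenced with team barriers so that thread 0 alone takes locks and republishes queues. A sketch of that work-sharing shape, with the worker count as an assumed stand-in constant:

// Hedged sketch of the team work-sharing pattern used in the non-singleT
// branches: iteration i belongs to worker thread ntid exactly when
// i % NUM_WORKERS == ntid; barriers fence the lock and publish phases.
const int NUM_WORKERS = 7; // stand-in for perilla::NUM_THREADS_PER_TEAM-1

void team_copy(int ntid, int ncpy /*, barrier and queues elided */)
{
    // barrier(); thread 0 takes the connection lock before this point
    for (int i = 0; i < ncpy; ++i)
        if (i % NUM_WORKERS == ntid)
        {
            /* pack or unpack copy i on this worker */
        }
    // barrier(); then thread 0 republishes packages and releases the lock
}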
//pthread_mutex_unlock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - } - else - { - if(ntid==0) - { - //pthread_mutex_lock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - for(int i=0; ir_con.nrcv; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - ///* - - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - rcvPackage->completed = false; - rcvPackage->served = false; - rcvPackage->request = MPI_REQUEST_NULL; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage, true); // corrected from pQ to recycleQ - - /*Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage, true); - */ - - //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ - // mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(ntid==0) - { - cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv; - - cpDst->r_con.remotePullDone = true; - ///* - for(int i=0; ir_con.nrcv; i++) - if(cpDst->r_con.rcv[i].pQueue.queueSize() >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront()->checkRequest()) - cpDst->r_con.firingRuleCnt++; - //*/ - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - //pthread_mutex_unlock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } // if(!(*mfDst == *mfSrc)) - -} // multifabCopyPull -#endif - - - - -#if 0 -void Perilla::multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT) -{ - //double start_time_wtime = omp_get_wtime(); - - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - - //mTeams = false; - - //if(np==1) - //multifabCopyPull_1Team(destGraph,srcGraph,mfDst,mfSrc,f,dstcomp,srccomp,nc,ng,ngsrc,singleT); - /*else if(mTeams) - { - if(WorkerThread::isLocPPTID(tid)) - multifabCopyLocPull(destGraph,srcGraph,mfDst,mfSrc,f,tid,dstcomp,srccomp,nc,ng,ngsrc); - else - multifabCopyRmtPull(destGraph,srcGraph,mfDst,mfSrc,f,tid,dstcomp,srccomp,nc,ng,ngsrc); - } - else - multifabCopyPull_1Team(destGraph,srcGraph,mfDst,mfSrc,f,tid,dstcomp,srccomp,nc,ng,ngsrc,singleT); -*/ - - if(!singleT) - srcGraph->worker[perilla::wid()]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - //double end_time_wtime = omp_get_wtime(); - //if(ntid==0) - //Perilla::getPPPTimeSplit[3] += end_time_wtime - start_time_wtime; -} -#endif - - -#if 0 -Array send_cctc; -Array send_pr; -Array recv_cctc; -Array recv_pr; - - -void Perilla::multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& mfDst, const MultiFab& mfSrc, int nc, int ng, int ngSrc, const Periodicity& period) -{ -#if 1 - int myProc = ParallelDescriptor::MyProc(); - int np = ParallelDescriptor::NProcs(); - try{ - if(true)//if(!(*mfSrc 
== *mfDst)) - { -#ifdef USE_PERILLA_PTHREADS -// if(perilla::isMasterThread()) -#endif - { - if(ng > mfDst.nGrow()) cout <<"MULTIFAB_COPY_C: ng > mfDst.nGrow not supported in parallel copy"<< endl; - if(ngSrc > mfSrc.nGrow()) cout <<"MULTIFAB_COPY_C: ngSrc > mfSrc.nGrow"<< endl; - if(ngSrc > 0) - { - // To be implemented - //do i = 1, nboxes(msrc%la) - // call push_back(bl, grow(box_nodalize(get_box(msrc%la,i),msrc%nodal),lngsrc)) - //end do - //call build(batmp, bl, sort = .false.) - //call destroy(bl) - //call build(lasrctmp, batmp, boxarray_bbox(batmp), explicit_mapping = get_proc(msrc%la)) - //call destroy(batmp) - //call build(msrctmp, lasrctmp, nc = lnc, ng = 0) - //pmfsrc => msrctmp - assert(false); - } - if(np > 1) - { - if(gSrc->sCopyMapHead == 0) - gSrc->sCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gSrc->sCopyMapHead; - gSrc->sCopyMapHead = tmpCopyMap; - } - if(gDst->rCopyMapHead == 0) - gDst->rCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gDst->rCopyMapHead; - gDst->rCopyMapHead = tmpCopyMap; - } - //gSrc->sCopyMapHead->map.reserve(mfSrc.size()); - //gDst->rCopyMapHead->map.reserve(mfDst.size()); - gSrc->sCopyMapHead->alloc_CopyMap(mfSrc); - gDst->rCopyMapHead->alloc_CopyMap(mfDst); - } - - //if(gSrc->numTasks != mfSrc.IndexArray().size()) - // std::cout<< "before " <numTasks << " now " <graphID << std::endl; - - gSrc->numFabs = mfSrc.size(); - gDst->numFabs = mfDst.size(); - gSrc->numTasks = mfSrc.IndexArray().size(); - gDst->numTasks = mfDst.IndexArray().size(); - } -#ifdef USE_PERILLA_PTHREADS -// perilla::syncAllThreads(); -#endif - const FabArrayBase::CPC *TheCPC= &mfDst.getCPC(ng, mfSrc, ngSrc, period);; - - int nfabsSrc = mfSrc.IndexArray().size(); - int nfabsDst = mfDst.IndexArray().size(); - - const int nloc_cpAsc = TheCPC->m_LocTags->size(); - const int nsnds_cpAsc = TheCPC->m_SndTags->size(); - const int nrcvs_cpAsc = TheCPC->m_RcvTags->size(); -#ifdef USE_PERILLA_PTHREADS -// perilla::syncAllThreads(); -#endif - - if(np > 1){ -#ifdef USE_PERILLA_PTHREADS -// if(perilla::isMasterThread()) -#endif - { - send_cctc.reserve(nsnds_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC->m_SndTags->begin(), - m_End = TheCPC->m_SndTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // Not destined to me. 
- { - send_pr.push_back(m_it->first); - send_cctc.push_back(&(m_it->second)); - } - } - - recv_cctc.reserve(nrcvs_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC->m_RcvTags->begin(), - m_End = TheCPC->m_RcvTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // I am not the source for this receipt - { - recv_pr.push_back(m_it->first); - recv_cctc.push_back(&(m_it->second)); - } - } - } - } -#ifdef USE_PERILLA_PTHREADS -// perilla::syncAllThreads(); -#endif - -//#ifndef USE_PERILLA_PTHREADS - #pragma omp parallel shared(gSrc, gDst, mfSrc, mfDst, nfabsSrc, nfabsDst) -//#endif - { -#ifdef _OPENMP - int tid = omp_get_thread_num();//perilla::tid();//omp_get_thread_num(); -#else - int tid=0; -#endif - int tg = tid/perilla::NUM_THREADS_PER_TEAM;//perilla::wid();//WorkerThread::perilla_wid(); - int nt= tid%perilla::NUM_THREADS_PER_TEAM; - int fg; - //std::cout<<"thread "<< tid<<"group "<graphID << " numTask " << gDst->numTasks << " numFabs " << gDst->numFabs <graphID > 25) - //std::cout<< "Inside parallel Generating Send at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead == 0) - { - gSrc->task[f]->cpAsc_srcHead = new FabCopyAssoc(); - cpSrc = gSrc->task[f]->cpAsc_srcHead; - } - else - { - cpSrc = new FabCopyAssoc(); - cpSrc->next = gSrc->task[f]->cpAsc_srcHead; - gSrc->task[f]->cpAsc_srcHead = cpSrc; - } - - cpSrc->graphPartner = gDst; - cpSrc->l_con.nscpy = 0; - for(int i=0; im_LocTags)[i]; - //if(f == tag.srcIndex) - if(mfSrc.IndexArray()[f] == tag.srcIndex) - cpSrc->l_con.nscpy++; - } - cpSrc->l_con.scpy = new LocalCopyDescriptor[cpSrc->l_con.nscpy]; - - //if(gDst->graphID == 4 && tag.dstIndex == 60 ) - //std::cout<< "Inside parallel Generating Local Copy send at tid " << tid << " f " << f << " gID " << gDst->graphID <<" num local connections"<< nloc_cpAsc << std::endl; - - for(int i=0; im_LocTags)[i]; - //if(f == tag.srcIndex) - if(mfSrc.IndexArray()[f] == tag->srcIndex) - { - cpSrc->l_con.scpy[scnt].ns = mfSrc.localindex(tag->srcIndex); - cpSrc->l_con.scpy[scnt].nd = mfDst.localindex(tag->dstIndex); - cpSrc->l_con.scpy[scnt].sbx = tag->sbox; - cpSrc->l_con.scpy[scnt].dbx = tag->dbox; - int psize = tag->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - //std::cout<< " gSrc ID "<< gSrc->graphID << " f "<databuf[j] = 0; - cpSrc->l_con.scpy[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.scpy[scnt].recycleQueue.enqueue(cpSrc->l_con.scpy[scnt].pQueue.dequeue()); - scnt++; - } - } - - if(np > 1) - { - cpSrc->r_con.nsnd = 0; - cpSrc->r_con.remotePushReady = false; - cpSrc->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - cpSrc->r_con.nsnd++; - } - } // for(ir_con.snd = new RemoteCommDescriptor[cpSrc->r_con.nsnd]; - scnt = 0; - for(int i=0; isrcIndex) - { - cpSrc->r_con.snd[scnt].ns = it->srcIndex; - cpSrc->r_con.snd[scnt].nd = it->dstIndex; - cpSrc->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - cpSrc->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - cpSrc->r_con.snd[scnt].sbx = it->sbox; - cpSrc->r_con.snd[scnt].dbx = it->dbox; - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? 
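In this variant the flat OpenMP thread id is decomposed into a team id and a rank within the team, and each FAB is handled by a single team (the fg==tg test). A small sketch of that decomposition; the f % nTeams ownership rule is an assumption consistent with the loop structure, not a quote of the original:

// Hedged sketch of the id decomposition used above: a flat thread id
// splits into (team, rank-in-team), and fab f is assumed to be owned
// by team f % nTeams.
struct ThreadCoords { int team, rank; };

ThreadCoords coords(int tid, int threadsPerTeam)
{
    return { tid / threadsPerTeam, tid % threadsPerTeam };
}

bool owns_fab(int team, int nTeams, int f) { return f % nTeams == team; }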
- - for(int p=0; pdatabuf[j] = 0; - cpSrc->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.snd[scnt].recycleQueue.enqueue(cpSrc->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // if(fg==tg) - - //perilla::syncAllThreads(); - #pragma omp barrier - if(np > 1) - { - //if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - if(tid==0) - { - - // std::cout<< "Inside parallel Generating Remote Send tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <sCopyMapHead->map[f]->r_con.nsnd = 0; - gSrc->sCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - gSrc->sCopyMapHead->map[f]->r_con.nsnd++; - } - } // for(isCopyMapHead->map[f]->r_con.snd = new RemoteCommDescriptor[gSrc->sCopyMapHead->map[f]->r_con.nsnd]; - int scnt = 0; - for(int i=0; isrcIndex) - { - - //if(gDst->graphID == 31 && (it->dstIndex == 519)) - //std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " i "<< scnt << " tg " <sCopyMapHead->map[f]->r_con.snd[scnt].ns = it->srcIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].nd = it->dstIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_gid = gDst->graphID-1; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sbx = it->sbox; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].dbx = it->dbox; - - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sz = psize; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pr = send_pr[i]; - - for(int p=0; pdatabuf[j] = 0; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; psCopyMapHead->map[f]->r_con.snd[scnt].recycleQueue.enqueue(gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead == 0) - { - gDst->task[f]->cpAsc_dstHead = new FabCopyAssoc(); - cpDst = gDst->task[f]->cpAsc_dstHead; - } - else - { - cpDst = new FabCopyAssoc(); - cpDst->next = gDst->task[f]->cpAsc_dstHead; - gDst->task[f]->cpAsc_dstHead = cpDst; - } - cpDst->graphPartner = gSrc; - cpDst->l_con.ndcpy = 0; - cpDst->l_con.firingRuleCnt = 0; - cpDst->l_con.dcpyCnt = 0; - for(int i=0; im_LocTags)[i]; - //if(f == tag.dstIndex) - if(mfDst.IndexArray()[f] == tag->dstIndex) - cpDst->l_con.ndcpy++; - } - cpDst->l_con.dcpy = new LocalCopyDescriptor[cpDst->l_con.ndcpy]; - int dcnt = 0; - - //if(gDst->graphID > 25) - //std::cout<< "Inside parallel Generating Local copy recive at tid " << tid << " f " << f << " gID " << gDst->graphID <graphID ==27 && f == 633) - //std::cout<< "tid " << tid << " f " << f << " gID " << gDst->graphID << " numReciv " << nloc_cpAsc << " ndcpy " << cpDst->l_con.ndcpy <m_LocTags)[i]; - //if(f == tag->dstIndex) - if(mfDst.IndexArray()[f] == tag->dstIndex) - { - - //if(gDst->graphID == 4 && (tag->dstIndex == 60 || tag->dstIndex == 59)) - //std::cout<< "dcpy tid " << tid << " f " << f << " i " << i << " dcnt " << dcnt << " ns "<srcIndex << " nd "<dstIndex << " lo " << tag->dbox.smallEnd() << " hi " << tag->dbox.bigEnd() <l_con.dcpy[dcnt].ns = 
mfSrc.localindex(tag->srcIndex); - cpDst->l_con.dcpy[dcnt].nd = mfDst.localindex(tag->dstIndex); - cpDst->l_con.dcpy[dcnt].sbx = tag->sbox; - cpDst->l_con.dcpy[dcnt].dbx = tag->dbox; - - // if(gDst->graphID > 25 && f == 633) - //std::cout<< " Generating Package tid " << tid << " i " << i <dbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - cpDst->l_con.dcpy[dcnt].sz = psize; - - if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf[j] = 0; - cpDst->l_con.dcpy[dcnt].pQueue.enqueue(tmpPkg); - } - - // if(gDst->graphID > 25 && f == 633) - //std::cout<< " Generating now in reQ Package tid " << tid << " i " << i <l_con.dcpy[dcnt].recycleQueue.enqueue(cpDst->l_con.dcpy[dcnt].pQueue.dequeue()); - - //if(gDst->graphID > 25 && f == 633) - // std::cout<< " Generated Package tid " << tid << " i " << i <graphID > 25 && f > 630) - //std::cout<< "Safe now tid " << tid << " f " << f << " gID " << gDst->graphID << " numReciv " << nloc_cpAsc <srcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; il_con.ndcpy ; i++) - { - for(int p=0; pl_con.dcpy[i].sz; - Package *tmpPkg = new Package(psize); - for(int j=0; jdatabuf[j] = 0; - cpdDst->l_con.dcpy[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.dcpy[i].recycleQueue.enqueue(cpdDst->l_con.dcpy[i].pQueue.dequeue()); - } - } - - if(np > 1) - { - cpDst->r_con.nrcv = 0; - cpDst->r_con.remotePullDone = false; - cpDst->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - cpDst->r_con.nrcv++; - } - } // for(ir_con.rcv = new RemoteCommDescriptor[cpDst->r_con.nrcv]; - dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - cpDst->r_con.rcv[dcnt].nd = it->dstIndex; - cpDst->r_con.rcv[dcnt].ns = it->srcIndex; - cpDst->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - cpDst->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - cpDst->r_con.rcv[dcnt].sbx = it->sbox; - cpDst->r_con.rcv[dcnt].dbx = it->dbox; - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? 
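When gDst feeds a dependent graph (srcLinkGraph and depTaskIDs), the package queues of the dependent tasks' copy associations are pre-filled here as well, so the downstream graph also avoids allocation at runtime. A hedged sketch of that pre-fill pass, with simplified stand-in types; the real code routes Package objects through pQueue and recycleQueue as in the surrounding loops:

#include <queue>
#include <vector>

// Hedged sketch of the dependent-graph pre-fill above: for every dependent
// connection, push freshly zeroed buffers through pending and immediately
// park them in recycle; the buffers live for the graph's lifetime.
struct Conn { std::queue<std::vector<double>*> pending, recycle; int sz = 0; };

void prefill(std::vector<Conn*>& depConns, int npkgs)
{
    for (Conn* c : depConns) {
        for (int p = 0; p < npkgs; ++p)
            c->pending.push(new std::vector<double>(c->sz, 0.0));
        for (int p = 0; p < npkgs; ++p) {         // mirrors pQueue.dequeue()
            c->recycle.push(c->pending.front());  //   -> recycleQueue.enqueue()
            c->pending.pop();
        }
    }
}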
- cpDst->r_con.rcv[dcnt].sz = psize; - - if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf[j] = 0; - cpDst->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[dcnt].recycleQueue.enqueue(cpDst->r_con.rcv[dcnt].pQueue.dequeue()); - } - - dcnt++; - } - } - }// for(isrcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; ir_con.nrcv ; i++) - { - for(int p=0; pr_con.rcv[i].sz; - Package *tmpPkg = new Package(psize); - for(int j=0; jdatabuf[j] = 0; - cpdDst->r_con.rcv[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[i].recycleQueue.enqueue(cpdDst->r_con.rcv[i].pQueue.dequeue()); - } - } - - - } // if(np > 1) - }// if(fg==tg) - - //perilla::syncAllThreads(); - #pragma omp barrier - - if(np > 1) - { - //if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - if(tid==0) - { - // std::cout<< "Inside parallel Generating Remote Recive tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <rCopyMapHead->map[f]->r_con.nrcv = 0; - gDst->rCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - gDst->rCopyMapHead->map[f]->r_con.nrcv++; - } - } - gDst->rCopyMapHead->map[f]->r_con.rcv = new RemoteCommDescriptor[gDst->rCopyMapHead->map[f]->r_con.nrcv]; - int dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - - // if(myProc==54 && gDst->graphID == 25 && f == 10) - // std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " sgID "<< gSrc->graphID <<" tg "<rCopyMapHead->map[f]->r_con.rcv[dcnt].nd = it->dstIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].ns = it->srcIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sbx = it->sbox; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].dbx = it->dbox; - - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? 
- - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sz = psize; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pr = recv_pr[i]; - - BL_ASSERT(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd == f); - - if(Perilla::genTags) - { - try{ - std::map::iterator itr = tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].find(psize); - if( itr != tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].end()) - { - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = itr->second; - } - else - { - tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex][psize] = Perilla::uTags++; - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = Perilla::uTags++; - std::map::iterator itr2 = pTagCnt[recv_pr[i]].find(gDst->graphID-1); - if(itr2 != pTagCnt[recv_pr[i]].end()) - pTagCnt[recv_pr[i]][gDst->graphID-1] = pTagCnt[recv_pr[i]][gDst->graphID-1] + 1; - else - pTagCnt[recv_pr[i]][gDst->graphID-1] = 1; - } - } - catch(std::exception& e) - { - std::cout <<"Inside tagGeneration gID "<< gDst->graphID <<" "<< e.what() << '\n'; - } - } - //tagMap[recv_pr[i]][gDst->graphID][it->dstIndex][it->srcIndex] = pTagCnt[recv_pr[i]]; - - - for(int p=0; pdatabuf[j] = 0; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; prCopyMapHead->map[f]->r_con.rcv[dcnt].recycleQueue.enqueue(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.dequeue()); - dcnt++; - } - } - } // for(i 1) - //} //if(fg==tg) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Send partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead->l_con.nscpy; i++) - { - int lnd = gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].nd; - for(int j=0; jtask[ lnd ]->cpAsc_dstHead->l_con.ndcpy; j++) - if(gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dbx == gDst->task[ lnd ]->cpAsc_dstHead->l_con.dcpy[j].dbx) - gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dPartner = j; - } - } - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead->l_con.ndcpy; i++) - { - int lns = gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].ns; - for(int j=0; jtask[ lns ]->cpAsc_srcHead->l_con.nscpy; j++) - if(gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].dbx == gSrc->task[ lns ]->cpAsc_srcHead->l_con.scpy[j].dbx) - gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].sPartner = j; - } - } - } // for(fgraphID <<" "<< e.what() << '\n'; -} - - -//std::cout<< "All done safely at gID " << gDst->graphID <assocMF; - // MultiFab* mfSrc = srcGraph->assocMF; - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpSrc = srcGraph->task[f]->cpAsc_srcHead; - - //if(srcGraph->graphID==18 && f ==316 && ntid == 0) - //std::cout << "srgG chk see " << srcGraph << " " <graphPartner == destGraph) - break; - cpSrc = cpSrc->next; - } - if(cpSrc == 0) cout <<"Metadata for across grid copy not found"<< endl; - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - 
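// A minimal sketch (not part of the original Perilla sources) of the tag
// generation above: each (receiving process, graph, dstIndex, srcIndex,
// message size) tuple gets a unique MPI tag, minted from a global counter on
// first lookup. Sender and receiver walk the same map, so the Asend/Arecv
// tags agree. The nested std::map chain is flattened into a tuple key here.
#include <iostream>
#include <map>
#include <tuple>

using TagKey = std::tuple<int,int,int,int,int>; // pr, graph, dst, src, size

static std::map<TagKey,int> tagMap;
static int uTags = 1000;  // seed is illustrative; Perilla::uTags in the real code

int getOrMintTag(int pr, int gid, int dst, int src, int sz) {
    TagKey key{pr, gid, dst, src, sz};
    auto itr = tagMap.find(key);
    if (itr != tagMap.end()) return itr->second;  // tag already assigned
    return tagMap[key] = uTags++;                 // mint a fresh tag
}

int main() {
    int a = getOrMintTag(3, 24, 10, 7, 512);
    int b = getOrMintTag(3, 24, 10, 7, 512);      // same tuple -> same tag
    std::cout << a << " " << b << "\n";
    return 0;
}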
mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - else - { - if(ntid == 0) - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - for(int i=0; il_con.nscpy; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf); - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(ntid==0) - { - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - int np = ParallelDescriptor::NProcs(); - if(np == 1) - return; - - //if(myProc==26 && srcGraph->graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd <graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd <graphID==18 && f ==316) - //BL_ASSERT(cpSrc->r_con.nsnd == 177); - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - { - - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - } - - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - - cpSrc->r_con.remotePushReady = true; - ///* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - } - else - { - if(ntid == 0) - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - for(int i=0; ir_con.nsnd; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - - // if(myProc==4 && srcGraph->graphID==2 && (f ==0 || f ==2)) - //std::cout << " Pushing 2 316 164"<r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf); - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - - } - - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(ntid==0) - { - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - - cpSrc->r_con.remotePushReady = true; - ///* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - //*/ - } - srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } // if(!(*mfDst == *mfSrc)) -} // multifabCopyPushAsync -#endif - - -#if 0 -void Perilla::multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT) -{ - int myProc = 
ParallelDescriptor::MyProc(); - - int ntid = WorkerThread::perilla_wtid(); - int tg = WorkerThread::perilla_wid(); - //MultiFab* mfDst = destGraph->assocMF; - //MultiFab* mfSrc = srcGraph->assocMF; - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - //if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - //if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpDst = destGraph->task[f]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->graphPartner == srcGraph) - break; - cpDst = cpDst->next; - } - if(cpDst == 0) cout <<"Metadata for across grid copy not found"<< endl; - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - if(singleT) - { - pthread_mutex_lock(&(cpDst->l_con.dLock)); - for(int i=0; il_con.ndcpy; i++) - { - Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf); - } - for(int i=0; il_con.ndcpy; i++) - cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue(true),true); // corrected from pQ to recycleQ and from recycleQ to pQ - cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - } - else - { - if(ntid==0) - pthread_mutex_lock(&(cpDst->l_con.dLock)); - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - for(int i=0; il_con.ndcpy; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf); - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - if(ntid == 0) - { - for(int i=0; il_con.ndcpy; i++) - cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue(true),true); // corrected from pQ to recycleQ and from recycleQ to pQ - cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - int np = ParallelDescriptor::NProcs(); - if(np == 1) - return; - - if(singleT) - { - pthread_mutex_lock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - for(int i=0; ir_con.nrcv; i++) - { - ///* - Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - rcvPackage->completed = false; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - //*/ - - //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ - 
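// A minimal sketch (not part of the original Perilla sources) of the static
// work partitioning used in both the push and pull paths above: copy i is
// handled by team thread ntid exactly when i % nWorkers == ntid, so the
// per-fab copy list is split across the team with no dynamic work queue,
// while thread 0 does the lock/queue bookkeeping around two barriers.
#include <iostream>

int main() {
    const int nCopies  = 10;  // cpDst->l_con.ndcpy in the real code
    const int nWorkers = 3;   // perilla::NUM_THREADS_PER_TEAM-1 in the real code
    for (int ntid = 0; ntid < nWorkers; ++ntid) {
        std::cout << "thread " << ntid << " copies:";
        for (int i = 0; i < nCopies; ++i)
            if (i % nWorkers == ntid)   // ownership test used above
                std::cout << " " << i;
        std::cout << "\n";
    }
    return 0;
}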
//mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - } - cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv; - - cpDst->r_con.remotePullDone = true; - ///* - for(int i=0; ir_con.nrcv; i++) - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - //*/ - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - pthread_mutex_unlock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - - } - else - { - if(ntid==0) - { - pthread_mutex_lock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - for(int i=0; ir_con.nrcv; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - ///* - Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - rcvPackage->completed = false; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - //*/ - - //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ - //mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - if(ntid==0) - { - cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv; - - cpDst->r_con.remotePullDone = true; - ///* - for(int i=0; ir_con.nrcv; i++) - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - //*/ - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - pthread_mutex_unlock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - } - destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } // if(!(*mfDst == *mfSrc)) - -} // multifabCopyPull -#endif - - - -#if 0 -void Perilla::serviceRemoteGridCopyRequests(std::vector graphArray, int g, int nGraphs, int tg) -{ - bool nextsReq, nextrReq; - int np = ParallelDescriptor::NProcs(); - int myProc = ParallelDescriptor::MyProc(); - int numfabs = graphArray[g]->numTasks; - int graphID = graphArray[g]->graphID; - - for(int f=0; ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - //if(pthread_mutex_trylock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)) != 0) - { - //pthread_mutex_lock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)); - //if(pthread_mutex_trylock(&(cpDst->r_con.rcvLock)) != 0) - { - for(int i=0; ir_con.nrcv; i++) - { - //if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed - if(cpDst->r_con.rcv[i].pQueue.queueSize(true)==0) - { - nextrReq = true; - } - else - { - //Package *rearPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.getRear(true);//!CHECK THIS POINT LATER - Package *rearPackage = cpDst->r_con.rcv[i].pQueue.getRear(true);//!CHECK THIS POINT LATER - // Also check the recycle queue because when rear is completed it may 
cause unlimited recv posts - //if(rearPackage->completed && graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.queueSize(true) > 1) //!latest receive request has been completed - if(rearPackage->completed && cpDst->r_con.rcv[i].pQueue.queueSize(true) == 1) //!latest receive request has been completed - { - nextrReq = true; - } - else //!expected message is still on the way - nextrReq = false; - } - if(nextrReq) //!take a message from recycle pool and post a receive - { - //pthread_mutex_lock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)); - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - //!create a package to keep track of receive requests - //Package *rMetaPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.dequeue(true); - //!extract a package from the recycle pool at the destination NUMA node to buffer incoming data - int ns = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].ns; - int nd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].nd; - int lnd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].lnd; - int r_grids = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].r_grids; - Package *rPackage = cpDst->r_con.rcv[i].recycleQueue.dequeue(true); - //int tag = tagGen(ns, nd, graphID-1, np*r_grids, nGraphs); - //int tag = Perilla::myTagMap[graphID-1][nd][ns]; - //int tag = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].lnd; - int tag = tagMap[graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr][g][nd][ns][graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz]; - - rPackage->request = MPI_REQUEST_NULL; - rPackage->completed=false; - cpDst->r_con.rcv[i].pQueue.enqueue(rPackage, true); //!this is not done yet - //graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.enqueue(rMetaPackage, true); //!this is not done yet - rPackage->request = ParallelDescriptor::Arecv(rPackage->databuf, - graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz, - graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr, tag).req(); // tag == SeqNum in c++ ver - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - //pthread_mutex_unlock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)); - } - } // for (ir_con.nrcv) - } // if(ga locked) - //pthread_mutex_unlock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)); - } // if(mf locked) - cpDst = cpDst->next; - } // while(cpDst != 0) - } // for(ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - for(int i=0; ir_con.nsnd; i++) - { - //if(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed - if(cpSrc->r_con.snd[i].pQueue.queueSize(true) == 0) - nextsReq = false; - else - nextsReq = true; - - if(nextsReq) //!take a message from recycle pool and post a receive - { - Package *sPackage = cpSrc->r_con.snd[i].pQueue.getFront(true); - if(!sPackage->served) - { - //Package *sMetaPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.getFront(true); - sPackage->completed = false; - sPackage->served = true; - sPackage->request = MPI_REQUEST_NULL; - int ns = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].ns; - int nd = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].nd; - int r_gid = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_gid; - int r_grids = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_grids; - int tag = Perilla::myTagMap[r_gid][nd][ns][graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz]; - sPackage->request = ParallelDescriptor::Asend(sPackage->databuf, - 
graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz, - graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pr, tag).req(); // tag == SeqNum in c++ ver - } - } - } // for (ir_con.nsnd) - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // for(ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - //if(pthread_mutex_trylock(&(cpDst->r_con.rcvLock)) != 0) - { - for(int i=0; ir_con.nrcv; i++) - { - //if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) > 0) //!all messages before rear have completed - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) > 0) //!all messages before rear have completed - { - //Package *rearPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.getRear(true); - Package *rearPackage = cpDst->r_con.rcv[i].pQueue.getRear(true); - if(rearPackage) - if(!rearPackage->completed) - { - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - bool flag = false; - int ret_flag=0; - MPI_Status status; - ParallelDescriptor::Test(rearPackage->request, ret_flag, status); - - flag = (ret_flag == 0) ? false : true;//parallel_test_one(rearPackage%ptr%request) -------??????? - if(flag) - { - rearPackage->completeRequest(); - cpDst->r_con.rcv[i].pQueue.getRear(true)->completeRequest(); - - //if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) == 1) - { - cpDst->r_con.firingRuleCnt++; - } - } - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } - } // if(pQueue.queueSize(true) > 0) - } // for (ir_con.nrcv) - } // if(ga locked) - cpDst = cpDst->next; - } // while(cpDst != 0) - } // for(ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - for(int i=0; ir_con.nsnd; i++) - { - //if(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.queueSize(true) > 0) - if(cpSrc->r_con.snd[i].pQueue.queueSize(true) >0) - { - //Package *frontPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.getFront(true); - Package *frontPackage = cpSrc->r_con.snd[i].pQueue.getFront(true); - if(frontPackage->served && !frontPackage->completed) //!latest receive request has NOT been completed - { - bool flag = false; - int ret_flag; - MPI_Status status; - ParallelDescriptor::Test(frontPackage->request, ret_flag, status); - flag = (ret_flag == 0) ? false : true;//parallel_test_one(frontPackage%ptr%request) -------??????? 
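// A minimal sketch (not part of the original Perilla sources) of the polling
// done by the service loops above: each outstanding request is checked with a
// nonblocking test (ParallelDescriptor::Test wraps MPI_Test), and a completed
// send's package is reset and returned to the recycle pool. A self-exchange
// on MPI_COMM_SELF stands in for the real cross-process traffic.
#include <mpi.h>
#include <vector>

// Poll one in-flight send; returns true once it has completed.
bool pollSend(MPI_Request& req) {
    int done = 0;
    MPI_Status status;
    MPI_Test(&req, &done, &status);   // nonblocking completion check
    return done != 0;
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    std::vector<double> sbuf(64, 1.0), rbuf(64, 0.0);
    MPI_Request sreq, rreq;
    MPI_Irecv(rbuf.data(), 64, MPI_DOUBLE, 0, /*tag*/7, MPI_COMM_SELF, &rreq);
    MPI_Isend(sbuf.data(), 64, MPI_DOUBLE, 0, /*tag*/7, MPI_COMM_SELF, &sreq);
    while (!pollSend(sreq)) { /* the runtime services other graphs here */ }
    MPI_Wait(&rreq, MPI_STATUS_IGNORE);
    // On completion the real code would recycle the package for reuse.
    MPI_Finalize();
    return 0;
}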
- if(flag) - { - - //pthread_mutex_lock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - frontPackage = cpSrc->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - cpSrc->r_con.snd[i].recycleQueue.enqueue(frontPackage, true); - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - -/* - frontPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = MPI_REQUEST_NULL; - graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage, true); -*/ - //pthread_mutex_unlock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); - } - } - } // if(queueSize > 0) - } // for (ir_con.nsnd) - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // for(f graphArray, int g, int nGraphs, int tg) -{ - int np = ParallelDescriptor::NProcs(); - int myProc = ParallelDescriptor::MyProc(); - int numfabs = graphArray[g]->numTasks; - //MultiFab* mf = graphArray[g]->assocMF; - int graphID = graphArray[g]->graphID; - - for(int f=0; ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - if(cpSrc->r_con.remotePushReady) - { - pthread_mutex_lock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - { - graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - } - pthread_mutex_unlock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); - cpSrc->r_con.remotePushReady = false; - }// if remotepushready - cpSrc = cpSrc->next; - } - }// ismyRegion - }//for ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - if(pthread_mutex_trylock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)) != 0) - { - if(pthread_mutex_trylock(&(cpDst->r_con.rcvLock)) != 0) - { - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpDst->r_con.remotePullDone <r_con.remotePullDone) - { - for(int i=0; ir_con.nrcv; i++) - { - - Package *rcvMetaPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - rcvPackage->completed = false; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - - //cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - 1; - - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - - - } // for (ir_con.nrcv) - - cpDst->r_con.remotePullDone = false; - - //if(f==1 && g==26 && myProc == 54) - // std::cout<<"Completed Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpDst->r_con.remotePullDone <r_con.rcvLock)); - } // if(ga locked) - pthread_mutex_unlock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)); - } // if(mf locked) - cpDst = cpDst->next; - } // while(cpDst != 0) - /* - if(false) - for(int id=0; idtask[f]->depTaskIDs.size(); id++) - { - int df = graphArray[g]->task[f]->depTaskIDs[id]; - if(WorkerThread::isMyRegion(0,df)) - { - int lgID = 
graphArray[g]->srcLinkGraph->graphID-1; - - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Dep Push f " << df << " gID " << lgID+1 << " myP " << myProc <task[df]->cpAsc_dstHead; - while(cpdDst != 0) - { - if(omp_test_lock(graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcvLock) != 0) - { - if(omp_test_lock(cpdDst->r_con.rcvLock) != 0) - { - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpdDst->r_con.remotePullDone <r_con.remotePullDone) - { - for(int i=0; ir_con.nrcv; i++) - { - - Package *rcvMetaPackage = graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpdDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - rcvPackage->completed = false; - cpdDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - - //cpdDst->r_con.firingRuleCnt = cpdDst->r_con.firingRuleCnt - 1; - - if(cpdDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpdDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpdDst->r_con.firingRuleCnt++; - - - } // for (ir_con.nrcv) - - cpdDst->r_con.remotePullDone = false; - - //if(df==10 && lgID==24 && myProc == 54) - // std::cout<<"Completed Push f " << df << " gID " << lgID+1 << " myP " << myProc << " PDone "<< cpdDst->r_con.remotePullDone <r_con.rcvLock); - } // if(ga locked) - omp_unset_lock(graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcvLock); - } // if(mf locked) - cpdDst = cpdDst->next; - } // while(cpdDst != 0) - - - } // if tg==0 region - - - } // for all dependents - */ - } - } // for(f -//#include -//#include -#include - -namespace amrex{ - - class AsyncFillPatchIterator; - - class RGIter - { - public: - int tid; - int ntid; - int tg; - int currentRegion; - int currentTile; - int totalItr; - int currentItr; - bool tiling; - bool implicit; - bool ppteams; - bool haveDepGraph; - RegionGraph* itrGraph; - RegionGraph* depGraph; - int boxGrow, index, scomp, ncomp, iteration; - double time; - double getFireableTime; - amrex::MultiFab *_dest; - - IndexType typ; - - Vector m_level_afpi; - Vector m_upper_level_afpi; - std::ofstream fout; - - public: - RGIter(RegionGraph* rg -#ifdef USE_PERILLA_ON_DEMAND - ,std::vector graphArray -#endif - , bool enableAllTasks=false); - RGIter(RegionGraph* rg -#ifdef USE_PERILLA_ON_DEMAND - ,std::vector graphArray -#endif - , RegionGraph* drg, bool isDep=true); - RGIter(amrex::AsyncFillPatchIterator* afpi, bool enableAllTasks=false); - RGIter(Vector afpi, Vector upper_afpi, - amrex::MultiFab& dest, int bG, double tm, int ind, int sc, int nc, int itr); - ~RGIter(); - - void init(); - void sync_workers(); - //! Increment iterator to the next tile we own. - void operator++ (); - //! Is the iterator valid, are more regions to iterate over? 
- bool isValid(); - int LocalIndex() const { return currentRegion; } - void exec(); - - amrex::Box tileBox(); - amrex::Box validBox() const; - amrex::Box tilebox(); - amrex::Box growntilebox(); - amrex::Box growntilebox(int ng); - amrex::Box nodaltilebox(int dir); - }; -} -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/RGIter.cpp b/Src/AmrTask/rts_impls/runtime_common/RGIter.cpp deleted file mode 100755 index 0a456199da8..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/RGIter.cpp +++ /dev/null @@ -1,639 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include -using namespace perilla; -#include - -#ifdef USE_PERILLA_ON_DEMAND - pthread_mutex_t teamFinLock=PTHREAD_MUTEX_INITIALIZER; -#endif - -namespace amrex{ - - RGIter::RGIter(RegionGraph* rg -#ifdef USE_PERILLA_ON_DEMAND - ,std::vector graphArray -#endif - , bool enableAllTasks - ): - itrGraph(rg), - implicit(false), - ppteams(true), - //typ(rg->typ), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) - { - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); -#ifdef USE_PERILLA_ON_DEMAND - if(tid==0)Perilla::syncProcesses(); - Perilla::numTeamsFinished=0; - #pragma omp barrier - if(perilla::isCommunicationThread()) - { - while(true){ - Perilla::serviceMultipleGraphCommDynamic(graphArray,true,perilla::tid()); - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - /*perilla::syncWorkers(ntid); - if(tg==0){ - graphArray.clear(); - Perilla::numTeamsFinished=0; - } - perilla::syncWorkers(ntid);*/ - break; - } - } - }else{ -#endif - itrGraph->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - itrGraph->Reset(); - itrGraph->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(enableAllTasks) - itrGraph->enableAllRegions(); - itrGraph->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - init(); -#ifdef USE_PERILLA_ON_DEMAND - } -#endif - } - - RGIter::RGIter(RegionGraph* rg -#ifdef USE_PERILLA_ON_DEMAND - ,std::vector graphArray -#endif - , RegionGraph* drg, bool isDep - ): - itrGraph(rg), - implicit(false), - ppteams(true), - //typ(rg->typ), - haveDepGraph(isDep), - depGraph(drg), - getFireableTime(0.) - { - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); - -#ifdef USE_PERILLA_ON_DEMAND - if(tid==0)Perilla::syncProcesses(); - Perilla::numTeamsFinished=0; - #pragma omp barrier - if(perilla::isCommunicationThread()) - { - //Perilla::flattenGraphHierarchy(m_level_afpi[iteration-1]->m_amrlevel.parent->graphArray, graphArray); - while(true){ - Perilla::serviceMultipleGraphCommDynamic(graphArray,true,perilla::tid()); - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - perilla::syncWorkers(ntid); - if(tg==0){ - graphArray.clear(); - //Perilla::numTeamsFinished=0; - } - perilla::syncWorkers(ntid); - break; - } - } - }else{ -#endif - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) itrGraph->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - init(); -#ifdef USE_PERILLA_ON_DEMAND - } -#endif - } - - RGIter::RGIter(amrex::AsyncFillPatchIterator* afpi, bool enableAllTasks): - itrGraph(afpi->destGraph), - implicit(false), - ppteams(true), - //typ(afpi->destGraph->typ), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) 
- { - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); -#ifdef USE_PERILLA_ON_DEMAND - if(tid==0)Perilla::syncProcesses(); - Perilla::numTeamsFinished=0; - #pragma omp barrier - - if(perilla::isCommunicationThread()) - { - std::vector flattenedGraphArray; - Perilla::flattenGraphHierarchy(m_level_afpi[iteration-1]->m_amrlevel.parent->graphArray, flattenedGraphArray); - while(true){ - Perilla::serviceMultipleGraphCommDynamic(flattenedGraphArray,true,perilla::tid()); - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - /*perilla::syncWorkers(ntid); - flattenedGraphArray.clear(); - if(tg==0) Perilla::numTeamsFinished=0; - perilla::syncWorkers(ntid);*/ - break; - } - } - }else{ -#endif - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - afpi->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(enableAllTasks) - itrGraph->enableAllRegions(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - init(); -#ifdef USE_PERILLA_ON_DEMAND - } -#endif - } - -#ifndef USE_PERILLA_ON_DEMAND - RGIter::RGIter(Vector afpi, Vector upper_afpi, - amrex::MultiFab& dest, int bG, double tm, int ind, int sc, int nc, int itr): - itrGraph(afpi[itr-1]->destGraph), - m_level_afpi(afpi), - m_upper_level_afpi(upper_afpi), - boxGrow(bG), - time(tm), - index(ind), - scomp(sc), - ncomp(nc), - iteration(itr), - implicit(true), - ppteams(true), - //typ(afpi[itr-1]->destGraph->typ), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) - { - int myProc = amrex::ParallelDescriptor::MyProc(); - bool push = true; - - tid = perilla::tid(); - tg = perilla::wid(); - ntid = perilla::wtid(); - AsyncFillPatchIterator::initialSend(afpi, upper_afpi, bG, tm, ind, 0, nc, itr); - - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - m_level_afpi[iteration-1]->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - if(ntid == perilla::NUM_THREADS_PER_TEAM-2) - { - int f; - int level = m_level_afpi[iteration-1]->m_amrlevel.level; - double dt = m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level); - this->currentItr = 1; - this->totalItr = 1; - - //////////////////////////////////////Push Pull Thread Start///////////////////////// - while(m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->queueSize(true) != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks || - m_level_afpi[iteration-1]->destGraph->worker[tg]->computedTasks != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks) - { - f = m_level_afpi[iteration-1]->destGraph->getFireableRegion(tg); - if(f != -1) - { - m_level_afpi[iteration-1]->Receive(this,dest,boxGrow,time,index,scomp,ncomp,f,true); - m_level_afpi[iteration-1]->destGraph->setFireableRegion(f); - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->unfireableRegionQueue->queueSize(true) !=0 && - m_level_afpi[iteration-1]->destGraph->worker[tg]->fireableRegionQueue->queueSize(true) < 2) - continue; - } - - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->queueSize() != 0) - { - f = m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->removeRegion(); - - if(push & level == m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel() && iteration < 
m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level)) - m_level_afpi[iteration]->SendIntraLevel(*(this),boxGrow,time+dt,index,scomp,ncomp,iteration,f,true); - - if(push & level < m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel()) - { - for(int i=0; i < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level+1); i++) - { - m_upper_level_afpi[i]->SendInterLevel(this,boxGrow,time+(i*m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level+1)),index,scomp,ncomp,i+1,f,true); - } - } - m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->addRegion(f,true); - } - } - //fout.close(); - ////////////////////////////////////////////////////////Push Pull Thread End//////////////////// - } - else - { - //fout << "Calling init "<< std::endl; - //fout.close(); - init(); - } - } - -#else - - RGIter::RGIter(Vector afpi, Vector upper_afpi, - amrex::MultiFab& dest, int bG, double tm, int ind, int sc, int nc, int itr) -: - itrGraph(afpi[itr-1]->destGraph), - m_level_afpi(afpi), - m_upper_level_afpi(upper_afpi), - _dest(&dest), - boxGrow(bG), - time(tm), - index(ind), - scomp(sc), - ncomp(nc), - iteration(itr), - implicit(true), - ppteams(true), - haveDepGraph(false), - depGraph(NULL), - getFireableTime(0.) - { - int myProc = amrex::ParallelDescriptor::MyProc(); - bool push = true; - tid = perilla::tid(); - tg = perilla::wid(); - ntid= perilla::wtid(); - if(tid==0)Perilla::syncProcesses(); - Perilla::numTeamsFinished=0; - #pragma omp barrier - - if(perilla::isCommunicationThread()) - { - std::vector flattenedGraphArray; - Perilla::flattenGraphHierarchy(m_level_afpi[iteration-1]->m_amrlevel.parent->graphArray, flattenedGraphArray); - while(true){ - Perilla::serviceMultipleGraphCommDynamic(flattenedGraphArray,true,perilla::tid()); - if( Perilla::numTeamsFinished == perilla::NUM_THREAD_TEAMS) - { - flattenedGraphArray.clear(); - break; - } - } - }else -{ - - AsyncFillPatchIterator::initialSend(afpi, upper_afpi, bG, tm, ind, 0, nc, itr); - //AsyncFillPatchIterator::initialSend(m_level_afpi, m_upper_level_afpi, boxGrow, time, index, scomp, ncomp, iteration); - - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(perilla::isMasterWorkerThread()) - m_level_afpi[iteration-1]->Reset(); - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - if(ntid == perilla::NUM_THREADS_PER_TEAM-2) - { - int f; - int level = m_level_afpi[iteration-1]->m_amrlevel.level; - double dt = m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level); - this->currentItr = 1; - this->totalItr = 1; - while(m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->queueSize(true) != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks || - m_level_afpi[iteration-1]->destGraph->worker[tg]->computedTasks != m_level_afpi[iteration-1]->destGraph->worker[tg]->totalTasks) - { - f = m_level_afpi[iteration-1]->destGraph->getFireableRegion(tg); - if(f != -1) - { - m_level_afpi[iteration-1]->Receive(this,*_dest,boxGrow,time,index,scomp,ncomp,f,true); - m_level_afpi[iteration-1]->destGraph->setFireableRegion(f); - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->unfireableRegionQueue->queueSize(true) !=0 && - m_level_afpi[iteration-1]->destGraph->worker[tg]->fireableRegionQueue->queueSize(true) < 2) - continue; - } - - if(m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->queueSize() != 0) - { - f = m_level_afpi[iteration-1]->destGraph->worker[tg]->computedRegionQueue->removeRegion(); 
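// A minimal sketch (not part of the original Perilla sources) of the
// dedicated scheduler thread above (ntid == NUM_THREADS_PER_TEAM-2): it makes
// regions fireable, drains regions the compute threads finished, pushes their
// data to the next iteration or finer level, and retires them until every
// task is both computed and completed. Single-threaded, with stub queues.
#include <queue>

struct WorkerQueues {
    std::queue<int> unfireable, fireable, computed, completed;
    int totalTasks = 0;
};

int main() {
    WorkerQueues w;
    for (int f = 0; f < 4; ++f) { w.unfireable.push(f); ++w.totalTasks; }

    while ((int)w.completed.size() != w.totalTasks) {
        if (!w.unfireable.empty()) {              // getFireableRegion()
            int f = w.unfireable.front(); w.unfireable.pop();
            /* post receives for f here */        // Receive(...)
            w.fireable.push(f);                   // setFireableRegion(f)
        }
        if (!w.fireable.empty()) {                // compute threads would run here
            int f = w.fireable.front(); w.fireable.pop();
            w.computed.push(f);
        }
        if (!w.computed.empty()) {                // drain computedRegionQueue
            int f = w.computed.front(); w.computed.pop();
            /* forward f's data here */           // SendIntraLevel/SendInterLevel(...)
            w.completed.push(f);                  // completedRegionQueue
        }
    }
    return 0;
}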
- - if(push & level == m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel() && iteration < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level)) - m_level_afpi[iteration]->SendIntraLevel(*(this),boxGrow,time+dt,index,scomp,ncomp,iteration,f,true); - - if(push & level < m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel()) - { - for(int i=0; i < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level+1); i++) - { - m_upper_level_afpi[i]->SendInterLevel(this,boxGrow,time+(i*m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level+1)),index,scomp,ncomp,i+1,f,true); - } - } - m_level_afpi[iteration-1]->destGraph->worker[tg]->completedRegionQueue->addRegion(f,true); - } - } - } - else - { - //fout << "Calling init "<< std::endl; - //fout.close(); - init(); - } -} - } - -#endif - - using namespace perilla; - - RGIter::~RGIter() - { - //fout.close(); - } - - void RGIter::init() - { - if(itrGraph->fabTiles.size() == 0) - tiling = false; - else - tiling = true; - - int myProc = amrex::ParallelDescriptor::MyProc(); - if(implicit) - { - if(!itrGraph->isGraphEmptyV2()) - { - currentRegion = itrGraph->getPulledFireableRegion(); - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) ); - else - totalItr = 1; - - currentItr = 1; - - currentTile = 0; - if(tiling) - for(currentTile = 0; currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) == ntid) - break; - } - } - else - { - if(!itrGraph->isGraphEmpty()) - { - if(haveDepGraph) - currentRegion = itrGraph->getAnyFireableRegion(*depGraph); - else - currentRegion = itrGraph->getAnyFireableRegion(); - - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) ); - else - totalItr = 1; - - currentItr = 1; - - currentTile = 0; - if(tiling) - for(currentTile = 0; currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == ntid) - break; - } - } - } - - //! Increment iterator to the next tile we own. - void RGIter::operator++ () - { - currentItr++; - if(tiling) - for( (currentTile == itrGraph->fabTiles[currentRegion]->numTiles ? 
currentTile : ++currentTile); currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - { - if(implicit) - { - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) == ntid) break; - } - else - { - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == ntid) break; - } - } - int myProc = amrex::ParallelDescriptor::MyProc(); - if(currentItr > totalItr) - { - if(implicit) itrGraph->regionComputed(currentRegion); - else itrGraph->finalizeRegion(currentRegion); - if(implicit) - { - if(!itrGraph->isGraphEmptyV2()) - { - currentRegion = itrGraph->getPulledFireableRegion(); - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) ); - else - totalItr = 1; - - currentItr = 1; - - currentTile = 0; - if(tiling) - for(currentTile = 0; currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - } - else - { - if(!itrGraph->isGraphEmpty()) - { - if(haveDepGraph) - currentRegion = itrGraph->getAnyFireableRegion(*depGraph); - else - currentRegion = itrGraph->getAnyFireableRegion(); - if(tiling) - totalItr = std::ceil( (1.0*itrGraph->fabTiles[currentRegion]->numTiles) / (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) ); - else - totalItr = 1; - - currentItr = 1; - currentTile = 0; - if(tiling) - for(currentTile = 0; currentTile < itrGraph->fabTiles[currentRegion]->numTiles; currentTile++) - if(currentTile % (perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS) == ntid/*-perilla::NUM_COMM_THREADS*/) - break; - } - } - } - } - - bool RGIter::isValid () - { - if(perilla::isCommunicationThread()) return false; - bool valid; - bool do_remaining = true; - - int myProc = amrex::ParallelDescriptor::MyProc(); - - if(implicit) - { - if(ntid != perilla::NUM_THREADS_PER_TEAM-1) - { - valid = !itrGraph->isGraphEmptyV2(); - if(valid) - { - do_remaining = false; - } - } - - if(do_remaining) - { - bool push = false; - int f; - int level = m_level_afpi[iteration-1]->m_amrlevel.level; - double dt = m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level); - this->currentItr = 1; - this->totalItr = 1; -#if 0 - while(!itrGraph->isGraphEmpty()) - { - f = itrGraph->worker[tg]->computedRegionQueue->getFrontRegion(true); - - if(push & level == m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel() && iteration < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level)) - m_level_afpi[iteration]->SendIntraLevel(this,boxGrow,time+dt,index,scomp,ncomp,iteration,f,false); - //else if(level == parent->finestLevel() && iteration == ncycle) - //SborderAFPI[0]->PushOnly(NUM_GROW, time+dt, State_Type, 0, NUM_STATE, f, tid, 0x02, 1); - - if(push & level < m_level_afpi[iteration-1]->m_amrlevel.parent->finestLevel()) - { - for(int i=0; i < m_level_afpi[iteration-1]->m_amrlevel.parent->nCycle(level+1); i++) - { - m_upper_level_afpi[i]->SendInterLevel(this,boxGrow,time+(i*m_level_afpi[iteration-1]->m_amrlevel.parent->dtLevel(level+1)),index,scomp,ncomp,i+1,f,false); - //upperLevel.SborderAFPI[i]->PushOnly(NUM_GROW, time+(i*parent->dtLevel(level+1)), State_Type, 0, NUM_STATE, f, tid, tuc, tempf, false); - } - } - - itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1); - if(perilla::isMasterWorkerThread()) - { - f = itrGraph->worker[tg]->computedRegionQueue->removeRegion(); - 
itrGraph->worker[tg]->completedRegionQueue->addRegion(f,true);
-                }
-            }
-#endif
-
-            //m_level_afpi[iteration-1]->destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-            if(perilla::isMasterWorkerThread())
-            {
-                m_level_afpi[iteration-1]->completeRegionGraphs();
-#ifdef USE_PERILLA_ON_DEMAND
-                pthread_mutex_lock(&teamFinLock);
-                Perilla::numTeamsFinished++;
-                pthread_mutex_unlock(&teamFinLock);
-#endif
-            }
-            valid = false;
-        }
-    }
-    else
-    {
-        if(itrGraph->isGraphEmpty())
-        {
-            if(perilla::isMasterWorkerThread())
-            {
-                itrGraph->finalizeRegionGraph();
-#ifdef USE_PERILLA_ON_DEMAND
-                pthread_mutex_lock(&teamFinLock);
-                Perilla::numTeamsFinished++;
-                pthread_mutex_unlock(&teamFinLock);
-#endif
-            }
-        }
-        valid = !(itrGraph->isGraphEmpty());
-    }
-    return valid;
-  }
-
-  amrex::Box RGIter::tileBox()
-  {
-    int myProc = amrex::ParallelDescriptor::MyProc();
-
-    if(currentTile == itrGraph->fabTiles[currentRegion]->numTiles)
-    {
-        return amrex::Box();
-    }
-    else
-    {
-        return *(itrGraph->fabTiles[currentRegion]->tileBx[currentTile]);
-    }
-  }
-
-  amrex::Box RGIter::validBox() const
-  {
-    return *(itrGraph->fabTiles[currentRegion]->validBx);
-  }
-
-  amrex::Box RGIter::tilebox()
-  {
-    return this->tileBox();
-  }
-
-  amrex::Box RGIter::growntilebox()
-  {
-    return this->tileBox();
-  }
-
-  amrex::Box RGIter::growntilebox(int ng)
-  {
-    Box bx = this->tileBox();
-    if(currentTile == itrGraph->fabTiles[currentRegion]->numTiles)
-        return bx;
-
-    if (ng < -100) ng = 0;
-    const Box& vbx = validBox();
-    for (int d=0; d<BL_SPACEDIM; ++d) {
-        if (bx.smallEnd(d) == vbx.smallEnd(d)) {
-            bx.growLo(d, ng);
-        }
-        if (bx.bigEnd(d) == vbx.bigEnd(d)) {
-            bx.growHi(d, ng);
-        }
-    }
-    return bx;
-  }
-
-  amrex::Box RGIter::nodaltilebox(int dir)
-  {
-    Box bx = this->tileBox();
-    bx.convert(typ);
-    const Box& vbx = this->validBox();
-    const IntVect& Big = vbx.bigEnd();
-    int d0, d1;
-    if (dir < 0) {
-        d0 = 0;
-        d1 = BL_SPACEDIM-1;
-    } else {
-        d0 = d1 = dir;
-    }
-    for (int d=d0; d<=d1; ++d) {
-        if (typ.cellCentered(d)) { // validbox should also be cell-centered in d-direction.
-            bx.surroundingNodes(d);
-            if (bx.bigEnd(d) <= Big[d]) {
-                bx.growHi(d,-1);
-            }
-        }
-    }
-    return bx;
-  }
-
-  void RGIter::sync_workers()
-  {
-    if(implicit)
-        itrGraph->worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1);
-    else
-        itrGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS);
-  }
-}
diff --git a/Src/AmrTask/rts_impls/runtime_common/RegionGraph.H b/Src/AmrTask/rts_impls/runtime_common/RegionGraph.H
deleted file mode 100755
index d6c17f8fe65..00000000000
--- a/Src/AmrTask/rts_impls/runtime_common/RegionGraph.H
+++ /dev/null
@@ -1,323 +0,0 @@
-#ifndef P_REGIONGRAPH_H
-#define P_REGIONGRAPH_H
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-using namespace perilla;
-#ifdef PERILLA_DEBUG
-#include "PerillaMemCheck.H"
-extern PerillaMemCheck memcheck;
-#endif
-
-namespace amrex{
-
-    class RegionGraph;
-
-    class FabCopyAssoc
-    {
-      public:
-        LocalConnection l_con;
-        RemoteConnection r_con;
-        FabCopyAssoc *next;
-        FabCopyAssoc *prev;
-        RegionGraph *graphPartner;
-        // CopyAssoc *orig_copyAssoc;
-        FabCopyAssoc()
-        {
-            next=0;
-            prev=0;
-            graphPartner=0;
-        }
-        ~FabCopyAssoc()
-        {
-            if(next) delete next;
-        }
-    };
-
-    class pTileArray
-    {
-      public:
-        int numTiles;
-        std::vector<Box*> tileBx;
-        Box* validBx;
-        pTileArray(): numTiles(0), tileBx(0){}
-        ~pTileArray()
-        {
-            //for(int i=0; i<numTiles; i++) delete tileBx[i];
-        }
-    };
-
-    class Task
-    {
-      public:
-        std::vector<int> state;
-        bool init;
-        FabCopyAssoc *cpAsc_srcHead;
-        FabCopyAssoc *cpAsc_dstHead;
-
-        std::vector<int> depTaskIDs;
-        int numDepTasksCompleted;
-        bool depTasksCompleted;
-
-        Task()
-        {
-            state.reserve(16);
-            depTaskIDs.reserve(1024);
-            depTasksCompleted = true;
-            numDepTasksCompleted = 0;
-            cpAsc_srcHead=0;
-            cpAsc_dstHead=0;
-        }
-
-        ~Task()
-        {
-            state.clear();
-            depTaskIDs.clear();
-            if(cpAsc_srcHead != 0)
-                delete cpAsc_srcHead;
-            if(cpAsc_dstHead != 0)
-                delete cpAsc_dstHead;
-        }
-    };
-
-    class Worker
-    {
-      public:
-        int totalTasks;
-        int computedTasks;
-        bool init;
-        Barrier* barr;
-        Barrier* l_barr;
-        RegionQueue* fireableRegionQueue;
-        RegionQueue* unfireableRegionQueue;
-        RegionQueue* computedRegionQueue;
-        RegionQueue* completedRegionQueue;
-        Worker():init(false), barr(0), l_barr(0), totalTasks(0){}
-
-        ~Worker(){
-            delete barr;
-            delete l_barr;
-            delete fireableRegionQueue;
-            delete unfireableRegionQueue;
-            delete computedRegionQueue;
-            delete completedRegionQueue;
-        }
-    };
-
-    //template
-    class CopyMap
-    {
-      public:
-        std::vector<FArrayBox*> map;
-        CopyMap *next;
-        CopyMap()
-            :
-            next(0)
-        {
-#ifdef PERILLA_DEBUG
-            memcheck.add(memcheck.genKey(this), (void*)this, "CopyMap");
-#endif
-        }
-
-        void alloc_CopyMap(const MultiFab& mf)
-        {
-            //Array IndArr = mf.IndexArray();
-            const int n = mf.IndexArray().size();
-            //const int n = mf.size();
-            map.reserve(n);
-            //sMap.resize(n);
-            for (int i = 0; i < n; ++i)
-            {
-                int K = mf.IndexArray()[i];
-                const Box& tmp = mf.fabbox(K);
-                map.push_back(new FArrayBox(tmp, mf.nComp(), false, true));
-            }
-        }
-        ~CopyMap()
-        {
-            for (int i = 0; i < map.size(); ++i)
-            {
-                delete map[i];
-            }
-            map.clear();
-            if(next !=0 )
-                delete next;
-#ifdef PERILLA_DEBUG
-            memcheck.remove(memcheck.genKey(this));
-#endif
-        }
-    };
-
-    class RegionGraph
-    {
-      public:
-        static int graphCnt;
-        int graphID;
-        int numTasks;
-        int numFabs;
-        int totalFinishes;
-        bool isDepGraph;
-        bool* okToReset;
-        pthread_mutex_t finishLock;
-
-        std::vector<pTileArray*> fabTiles;
-        std::vector<pTileArray*> fabTiles_gtbx;
-
-        std::vector<FArrayBox*> lMap;
-        std::vector<FArrayBox*> sMap;
-        std::vector<FArrayBox*> rMap;
-        CopyMap *sCopyMapHead;
-        CopyMap *rCopyMapHead;
-
-        std::vector<Task*> task;
-        std::vector<Worker*> worker;
-
-        RegionGraph* srcLinkGraph;
-
-      public:
-        RegionGraph(int numtasks);
-        void Initialize();
-        void Reset();
-        bool isGraphEmpty();
-        bool isGraphEmptyV2();
-        void finalizeGraph();
-        void regionGraphReset(int numfabs);
-        void regionGraphMinReset(void);
-        void enableAllRegions();
-        void disableRegion(int r);
-        void finalizeRegion(int r);
-        void finalizeRegionGraph();
-        void regionComputed(int r);
-        bool isFireableRegion(int r);
-        int getAnyFireableRegion();
-        int getAnyFireableRegion(RegionGraph& depGraph);
-        int getPulledFireableRegion();
-        int getFireableRegion(bool isSingleThread=false);
-        void setFireableRegion(int r);
-        void graphTeardown();
-        void workerTeardown();
-        int size(){return task.size();}
-
-        int getRegion(){
-            return worker[perilla::wid()]->computedRegionQueue->getFrontRegion(true);
-        }
-
-        void syncComputeWorkerThreads(){
-            worker[perilla::wid()]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-        }
-        void syncComputeWorkerThreads(int numthreads){
-            worker[perilla::wid()]->barr->sync(numthreads);
-        }
-
-        void syncWorkerThreads(){
-            worker[perilla::wid()]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1);
-        }
-
-        void syncWorkerThreads(int numthreads){
-            worker[perilla::wid()]->barr->sync(numthreads);
-        }
-
-        void alloc_lMap(const MultiFab& mf)
-        {
-            const int n = mf.IndexArray().size();
-            lMap.reserve(n);
-            for (int i = 0; i < n; ++i)
-            {
-                int K = mf.IndexArray()[i];
-                const Box& tmp = mf.fabbox(K);
-                lMap.push_back(new FArrayBox(tmp, mf.nComp(), false, true));
-            }
-        }
-
-        void alloc_sMap(const MultiFab& mf)
-        {
-            const int n = mf.IndexArray().size();
-            sMap.reserve(n);
-            for (int i = 0; i < n; ++i)
-            {
-                int K = mf.IndexArray()[i];
-                const Box& tmp = mf.fabbox(K);
-                sMap.push_back(new FArrayBox(tmp, mf.nComp(), false, true));
-            }
-        }
-
-        void alloc_rMap(const MultiFab& mf)
-        {
-            const int n = mf.IndexArray().size();
-            rMap.reserve(n);
-            for (int i = 0; i < n; ++i)
-            {
-                int K = mf.IndexArray()[i];
-                const Box& tmp = mf.fabbox(K);
-                rMap.push_back(new FArrayBox(tmp, mf.nComp(), false, true));
-            }
-        }
-
-        void buildTileArray(const MultiFab& mf)
-        {
-            const int n = mf.IndexArray().size();
-            fabTiles.resize(n);
-
-            for (int i = 0; i < n; ++i)
-            {
-                fabTiles[i] = new pTileArray();
-            }
-            for (MFIter mfi(mf, true); mfi.isValid(); ++mfi)
-            {
-                fabTiles[mfi.LocalIndex()]->numTiles++;
-                fabTiles[mfi.LocalIndex()]->tileBx.push_back(new Box(mfi.tilebox()));
-                fabTiles[mfi.LocalIndex()]->validBx = new Box(mfi.validbox());
-            }
-        }
-
-        void buildTileArray_gtbx(const MultiFab& mf, int ng)
-        {
-            const int n = mf.IndexArray().size();
-            fabTiles_gtbx.resize(n);
-
-            for (int i = 0; i < n; ++i)
-            {
-                fabTiles_gtbx[i] = new pTileArray();
-            }
-            for (MFIter mfi(mf, true); mfi.isValid(); ++mfi)
-            {
-                fabTiles_gtbx[mfi.LocalIndex()]->numTiles++;
-                fabTiles_gtbx[mfi.LocalIndex()]->tileBx.push_back(new Box(mfi.growntilebox(ng)));
-            }
-        }
-
-        void buildTileArray(const amrex::MultiFab& mf, const amrex::IntVect& tilesize)
-        {
-            int myProc = amrex::ParallelDescriptor::MyProc();
-            const int n = mf.indexArray.size();
-            fabTiles.resize(n);
-
-            //typ = mf.boxArray().ixType();
-
-            for (int i = 0; i < n; ++i)
-            {
-                fabTiles[i] = new pTileArray();
-            }
-            for (amrex::MFIter mfi(mf, tilesize); mfi.isValid(); ++mfi)
-            {
-                if( fabTiles[mfi.LocalIndex()]->numTiles == 0 )
-                    fabTiles[mfi.LocalIndex()]->validBx = new amrex::Box(mfi.validbox());
-                fabTiles[mfi.LocalIndex()]->numTiles++;
-                fabTiles[mfi.LocalIndex()]->tileBx.push_back(new amrex::Box(mfi.tilebox()));
-            }
-        }
-
-        ~RegionGraph();
-    };
-}//end
namespace -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/RegionGraph.cpp b/Src/AmrTask/rts_impls/runtime_common/RegionGraph.cpp deleted file mode 100755 index 0e72a1c631e..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/RegionGraph.cpp +++ /dev/null @@ -1,792 +0,0 @@ -#include -#include -#include - -using namespace std; -using namespace amrex; -using namespace perilla; - -int RegionGraph::graphCnt = 0; - -RegionGraph::RegionGraph(int numtasks) -{ - sCopyMapHead = 0; - rCopyMapHead = 0; - srcLinkGraph = 0; - isDepGraph = false; - numFabs = numtasks; - numTasks = numtasks; - graphID = ++graphCnt; - worker.resize(perilla::NUM_THREAD_TEAMS); - task.resize(numTasks); - totalFinishes=0; - okToReset = new bool[perilla::NUM_THREAD_TEAMS]; - finishLock= PTHREAD_MUTEX_INITIALIZER; - Initialize(); -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -void RegionGraph::Initialize() -{ - int numfabs = numTasks; - - int tg = WorkerThread::perilla_wid(); - for(int tg=0; tgbarr = new Barrier(perilla::NUM_THREADS_PER_TEAM - perilla::NUM_COMM_THREADS); - worker[tg]->l_barr = new Barrier(perilla::NUM_THREADS_PER_TEAM - perilla::NUM_COMM_THREADS -1); - if(numfabs <= perilla::TASK_QUEUE_DEFAULT_SIZE) - { - worker[tg]->fireableRegionQueue = new RegionQueue(); - worker[tg]->unfireableRegionQueue = new RegionQueue(); - worker[tg]->computedRegionQueue = new RegionQueue(); - worker[tg]->completedRegionQueue = new RegionQueue(); - } - else - { - worker[tg]->fireableRegionQueue = new RegionQueue(numfabs); - worker[tg]->unfireableRegionQueue = new RegionQueue(numfabs); - worker[tg]->computedRegionQueue = new RegionQueue(numfabs); - worker[tg]->completedRegionQueue = new RegionQueue(numfabs); - } - worker[tg]->totalTasks = 0; - worker[tg]->computedTasks = 0; - for(int f=0; f < numfabs; f++) - { - if(WorkerThread::isMyRegion(tg, f)) - { - task[f] = new Task(); - worker[tg]->unfireableRegionQueue->addRegion(f); - worker[tg]->totalTasks++; - for(int i=0; i<16; i++) - task[f]->state[i] = 0; - task[f]->init = true; - } - } - worker[tg]->init = true; - okToReset[tg] = false; - } -} - -void RegionGraph::Reset() -{ - int tg= perilla::wid(); - pthread_mutex_lock(&finishLock); - if(okToReset[tg]) - totalFinishes--; - pthread_mutex_unlock(&finishLock); - - - if(okToReset[tg]) - { - worker[tg]->totalTasks = 0; - worker[tg]->computedTasks = 0; - while(worker[tg]->completedRegionQueue->queueSize(true) > 0) - { - int r = worker[tg]->completedRegionQueue->removeRegion(true); - if(WorkerThread::isMyRegion(tg, r)) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - worker[tg]->totalTasks++; - for(int i=0; i<16; i++) - task[r]->state[i] = 0; - task[r]->init = true; - if(task[r]->depTaskIDs.size() > 0) - task[r]->depTasksCompleted = false; - } - else - break; - } - } -} - -bool RegionGraph::isGraphEmpty() -{ - int tg= perilla::wid(); - worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - if(worker[tg]->completedRegionQueue->queueSize(true)== worker[tg]->totalTasks) - return true; - return false; -} - -bool RegionGraph::isGraphEmptyV2() -{ - int tg=perilla::wid(); - - if(worker[tg]->completedRegionQueue->queueSize(true) == worker[tg]->totalTasks || worker[tg]->computedTasks == worker[tg]->totalTasks) - return true; - return false; -} - -void RegionGraph::finalizeGraph() -{ - pthread_mutex_lock(&finishLock); - totalFinishes++; - int tg=perilla::wid(); - okToReset[tg]=true; - pthread_mutex_unlock(&finishLock); -} - -void 
RegionGraph::regionGraphReset(int numfabs) -{ - int nt; - int tg; - int r; - - { - tg = perilla::wid(); - nt = perilla::wtid(); - if(perilla::isMasterThread()) - totalFinishes=0; - if(perilla::isMasterWorkerThread()) - { - worker[tg]->totalTasks = 0; - worker[tg]->computedTasks = 0; - while(worker[tg]->completedRegionQueue->queueSize(true) > 0) - { - r = worker[tg]->completedRegionQueue->removeRegion(true); - if(WorkerThread::isMyRegion(tg, r)) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - worker[tg]->totalTasks++; - for(int i=0; i<16; i++) - task[r]->state[i] = 0; - task[r]->init = true; - } - else - break; - } - okToReset[tg] = false; - } - } -} - -void RegionGraph::regionGraphMinReset(void) -{ - int nt; - int tg; - int r; - { - tg = WorkerThread::perilla_wid(); - nt = WorkerThread::perilla_wtid(); - if(perilla::isMasterThread()) - totalFinishes=0; - if(perilla::isMasterWorkerThread()) - { - while(worker[tg]->completedRegionQueue->queueSize(true) > 0) - { - r = worker[tg]->completedRegionQueue->removeRegion(true); - if(WorkerThread::isMyRegion(tg, r)) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - } - else - break; - } - okToReset[tg] = false; - } - } -} - - -void RegionGraph::enableAllRegions() -{ - int numfabs = numTasks; - int r; - int tg = WorkerThread::perilla_wid(); - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - if(perilla::isMasterWorkerThread()) - for(int f=0; funfireableRegionQueue->removeRegion(true); - worker[tg]->fireableRegionQueue->addRegion(r,true); - } - worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads -} - -void RegionGraph::disableRegion(int r) -{ - int tg = WorkerThread::perilla_wid(); - if(perilla::isMasterWorkerThread()) - if(WorkerThread::isMyRegion(tg, r)) - { - int rID = worker[tg]->fireableRegionQueue->removeRegion(true); - worker[tg]->unfireableRegionQueue->addRegion(rID,true); - } -} - -void RegionGraph::regionComputed(int r) -{ - int tg= perilla::wid(); - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1); - if(perilla::isMasterWorkerThread()) - if(WorkerThread::isMyRegion(tg, r)) - { - int rr = worker[tg]->fireableRegionQueue->removeRegion(); - if(r != rr) - { - std::cout << "ERROR: In computedeRegion" << std::endl; - exit(EXIT_FAILURE); - } - worker[tg]->computedRegionQueue->addRegion(rr); - worker[tg]->computedTasks++; - } - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1); -} - -void RegionGraph::finalizeRegion(int r) -{ - int tg= perilla::wid(); - int ntid=perilla::wtid(); - worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - if(perilla::isMasterWorkerThread()) - if(WorkerThread::isMyRegion(tg, r)) - { - int rr = worker[tg]->fireableRegionQueue->removeRegion(true); - if(r != rr) - { - std::cout << "ERROR: In completeRegion" << std::endl; - exit(EXIT_FAILURE); - } - worker[tg]->completedRegionQueue->addRegion(rr,true); - } - worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads -} - -void RegionGraph::finalizeRegionGraph() -{ - int tg= perilla::wid(); - pthread_mutex_lock(&finishLock); - totalFinishes++; - okToReset[tg]=true; - pthread_mutex_unlock(&finishLock); -} - -bool RegionGraph::isFireableRegion(int r) -{ - int myProc = ParallelDescriptor::MyProc(); - FabCopyAssoc *cpDst 
-bool RegionGraph::isFireableRegion(int r) -{ - int myProc = ParallelDescriptor::MyProc(); - FabCopyAssoc *cpDst = task[r]->cpAsc_dstHead; - if(lMap.size() > 0) - if(lMap[r]->l_con.firingRuleCnt != lMap[r]->l_con.ndcpy) - { - return false; - } - while(cpDst != 0) - { - if(cpDst->l_con.firingRuleCnt != cpDst->l_con.ndcpy) - { - return false; - } - cpDst = cpDst->next; - } - - if(srcLinkGraph != 0) - { - if(!task[r]->depTasksCompleted) - { - for(int i=0; i<task[r]->depTaskIDs.size(); i++){ - if(!srcLinkGraph->isFireableRegion(task[r]->depTaskIDs[i])) - return false; - } - task[r]->depTasksCompleted = true; - } - } - - if(ParallelDescriptor::NProcs() == 1) return true; - - if(lMap.size() > 0) - if(lMap[r]->r_con.firingRuleCnt != lMap[r]->r_con.nrcv) - { - return false; - } - - cpDst = task[r]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->r_con.firingRuleCnt != cpDst->r_con.nrcv) - { - return false; - } - cpDst = cpDst->next; - } - return true; -} - - -int RegionGraph::getFireableRegion(bool isSingleThread) -{ - int r = -1; - bool fireable; - int tg= perilla::wid(); - - if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0 && worker[tg]->fireableRegionQueue->queueSize() == 0) - { - fireable = false; - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r); - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - } - } - else if(worker[tg]->unfireableRegionQueue->queueSize(true)!=0) - { - int unfQsize = worker[tg]->unfireableRegionQueue->queueSize(true); - for(int i = 0; i < unfQsize; i++) - { - int tr = worker[tg]->unfireableRegionQueue->removeRegion(true); - if(isFireableRegion(tr)) - { - r = tr; - break; - } - else - worker[tg]->unfireableRegionQueue->addRegion(tr,true); - } - } - return r; -} - -void RegionGraph::setFireableRegion(int r) -{ - worker[perilla::wid()]->fireableRegionQueue->addRegion(r); -} - -int RegionGraph::getAnyFireableRegion() -{ - int myProc = ParallelDescriptor::MyProc(); - int tg = perilla::wid(); - int nt = perilla::wtid(); - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - if(nt == 0 && worker[tg]->fireableRegionQueue->queueSize()==0) - { - bool fireable = false; - int r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r); - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - else - worker[tg]->fireableRegionQueue->addRegion(r,true); - } - } - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - return worker[tg]->fireableRegionQueue->getFrontRegion(true); -} - -int RegionGraph::getAnyFireableRegion(RegionGraph& depGraph) -{ - int nt; - int tg; - int r; - bool fireable; - - int myProc = amrex::ParallelDescriptor::MyProc(); - - tg = perilla::wid(); - nt = perilla::wtid(); - if(nt == 0 && worker[tg]->fireableRegionQueue->queueSize()==0) - { - fireable = false; - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - while(!fireable) - { - fireable = isFireableRegion(r); - fireable &= depGraph.isFireableRegion(r); - if(!fireable) - { - worker[tg]->unfireableRegionQueue->addRegion(r,true); - r = worker[tg]->unfireableRegionQueue->removeRegion(true); - } - else - worker[tg]->fireableRegionQueue->addRegion(r,true); - } - } - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - r = worker[tg]->fireableRegionQueue->getFrontRegion(true); - return r; -} - - - -int RegionGraph::getPulledFireableRegion
-{ - bool fireable; - int myProc = ParallelDescriptor::MyProc(); - int tg = WorkerThread::perilla_wid(); - int nt = WorkerThread::perilla_wtid(); - if(nt == 0 && worker[tg]->fireableRegionQueue->queueSize()==0) - { - while(worker[tg]->fireableRegionQueue->queueSize()==0); - } - worker[tg]->l_barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS-1); - return worker[tg]->fireableRegionQueue->getFrontRegion(true); -} - -void RegionGraph::graphTeardown() -{ -#if 0 - MPI_Status status; - Package* package; - int tg= perilla::wid(); - int numfabs = numTasks; - -#if 0 - for(int f=0; fcpAsc_dstHead; - while(cpDst != 0) - { - cpDst->l_con.firingRuleCnt = 0; - - for(int i=0; il_con.ndcpy; i++) - { - while(cpDst->l_con.dcpy[i].pQueue.queueSize() >= 1) - { - package = cpDst->l_con.dcpy[i].pQueue.dequeue(); - //package->completed = false; - //package->served = false; - //package->request = MPI_REQUEST_NULL; - cpDst->l_con.dcpy[i].recycleQueue.enqueue(package); - } - } - - cpDst = cpDst->next; - } - } - } - - - for(int f=0; fcpAsc_srcHead; - while(cpSrc != 0) - { - //cpSrc->l_con.firingRuleCnt = 0; - - for(int i=0; il_con.nscpy; i++) - { - while(cpSrc->l_con.scpy[i].pQueue.queueSize() >= 1) - { - package = cpSrc->l_con.scpy[i].pQueue.dequeue(); - - FabCopyAssoc* cpDst = cpSrc->graphPartner->task[cpSrc->l_con.scpy[i].nd]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->graphPartner == this) //graphArray[g]) - break; - cpDst = cpDst->next; - } - //Package* sPackage = cpSrc->l_con.scpy[i].pQueue.dequeue(true); - pthread_mutex_lock(&(cpDst->l_con.dLock)); - int dPartner = cpSrc->l_con.scpy[i].dPartner; - Package* dPackage = cpDst->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - /* - for(int j=0; jbufSize; j++) - { - dPackage->databuf[j] = sPackage->databuf[j]; - } - */ - std::memcpy(dPackage->databuf, package->databuf, dPackage->bufSize * sizeof(double)); - //std::swap(dPackage->databuf, sPackage->databuf); - - cpDst->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(cpDst->l_con.dcpy[dPartner].pQueue.queueSize(true) == 1) - cpDst->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - //cpSrc->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - - //package->completed = false; - //package->served = false; - //package->request = MPI_REQUEST_NULL; - cpSrc->l_con.scpy[i].recycleQueue.enqueue(package); - } - } - - cpSrc = cpSrc->next; - } - } - } - - - - for(int f=0; f 0) - { - lMap[f]->l_con.firingRuleCnt = 0; - } - } - } -#endif - - if(ParallelDescriptor::NProcs() == 1) return; - - -#ifndef PERILLA_USE_UPCXX - -#if 1 - if(perilla::tid()!=0) return; - for(int f=0; fcpAsc_dstHead; - while(cpDst != 0) - { - cpDst->r_con.firingRuleCnt = 0; - for(int i=0; ir_con.nrcv; i++) - { - while(cpDst->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = cpDst->r_con.rcv[i].pQueue.dequeue(); - if(package->request != MPI_REQUEST_NULL) -{ - MPI_Cancel( &(package->request) ); -} - package->completed = false; - package->served = false; - package->request = MPI_REQUEST_NULL; - cpDst->r_con.rcv[i].recycleQueue.enqueue(package); - } - } - cpDst = cpDst->next; - } - } - } -#endif - -#if 1 - for(int f=0; fcpAsc_srcHead; - while(cpSrc != 0) - { - //cpSrc->r_con.firingRuleCnt = 0; - for(int i=0; ir_con.nsnd; i++) - { - while(cpSrc->r_con.snd[i].pQueue.queueSize() >= 1) - { - package = cpSrc->r_con.snd[i].pQueue.dequeue(); - MPI_Wait( &(package->request), &status ); - package->completed = false; - package->served = false; - package->request = MPI_REQUEST_NULL; - 
cpSrc->r_con.snd[i].recycleQueue.enqueue(package); - } - } - cpSrc = cpSrc->next; - } - } - } -#endif - - -#if 1 - if(tg == 0) - { - CopyMap* cpDst = rCopyMapHead; - while(cpDst != 0) - { - for(int f=0; fmap.size(); f++) - { - cpDst->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; imap[f]->r_con.nrcv; i++) - { - while(cpDst->map[f]->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = cpDst->map[f]->r_con.rcv[i].pQueue.dequeue(); - if(package->request != MPI_REQUEST_NULL) - MPI_Cancel( &(package->request) ); - package->completed = false; - package->served = false; - package->request = MPI_REQUEST_NULL; - cpDst->map[f]->r_con.rcv[i].recycleQueue.enqueue(package); - } - } - - } - - cpDst = cpDst->next; - } - - -#if 1 - CopyMap* cpSrc = sCopyMapHead; - while(cpSrc != 0) - { - for(int f=0; fmap.size(); f++) - { - for(int i=0; imap[f]->r_con.nsnd; i++) - { - while(cpSrc->map[f]->r_con.snd[i].pQueue.queueSize() >= 1) - { - - package = cpSrc->map[f]->r_con.snd[i].pQueue.dequeue(); - /* - int ns = cpSrc->map[f]->r_con.snd[i].ns; - int nd = cpSrc->map[f]->r_con.snd[i].nd; - int r_gid = cpSrc->map[f]->r_con.snd[i].r_gid; - int r_grids = cpSrc->map[f]->r_con.snd[i].r_grids; - //int tag = tagGen(ns, nd, r_gid-1, np*r_grids, nGraphs); - int tag = Perilla::myTagMap[r_gid][nd][ns][cpSrc->map[f]->r_con.snd[i].sz]; - - Package* sPackage = lMap[f]->r_con.snd[i].pQueue.getFront(true); - package->request = ParallelDescriptor::Asend(sPackage->databuf, - cpSrc->map[f]->r_con.snd[i].sz, - cpSrc->map[f]->r_con.snd[i].pr, tag).req(); // tag == SeqNum in c++ ver - - */ - MPI_Wait( &(package->request), &status ); - package->completed = false; - package->served = false; - package->request = MPI_REQUEST_NULL; - cpSrc->map[f]->r_con.snd[i].recycleQueue.enqueue(package); - } - } - } - cpSrc = cpSrc->next; - } -#endif - } - -#endif - -//ndefupcxx -#endif - - //if(WorkerThread::isTeamMasterThread(tid)) commented out b/c its already call by single thread in a team - //Perilla::globalBarrier->sync(perilla::NUM_THREAD_TEAMS); - -#if 0 - // Parallel Copy Reset on Local tg - for(int f=0; f 0) - { - lMap[f]->r_con.firingRuleCnt = 0; - - for(int i=0; ir_con.nsnd; i++) - while(lMap[f]->r_con.snd[i].pQueue.queueSize() >= 1) - { - package = lMap[f]->r_con.snd[i].pQueue.dequeue(); - package->completed = false; - package->served = false; - package->request = MPI_REQUEST_NULL; - lMap[f]->r_con.snd[i].recycleQueue.enqueue(package); - } - - for(int i=0; ir_con.nrcv; i++) - while(lMap[f]->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = lMap[f]->r_con.rcv[i].pQueue.dequeue(); - package->completed = false; - package->served = false; - package->request = MPI_REQUEST_NULL; - lMap[f]->r_con.rcv[i].recycleQueue.enqueue(package); - } - } - } - } - - // Fill boundary reset on local tg - if(tg == 0) - { - for(int f=0; f 0) - { - // if(WorkerThread::isMyRegion(tg,f)) - { - for(int i=0; i< rMap[f]->r_con.nrcv; i++) - while( rMap[f]->r_con.rcv[i].pQueue.queueSize() >= 1) - { - package = rMap[f]->r_con.rcv[i].pQueue.dequeue(); - if(package->request != MPI_REQUEST_NULL) - MPI_Cancel( &(package->request) ); -{ -printf("Canceling a message\n"); -} - package->completed = false; - package->served = false; - package->request = MPI_REQUEST_NULL; - rMap[f]->r_con.rcv[i].recycleQueue.enqueue(package); - } - for(int i=0; i< sMap[f]->r_con.nsnd; i++) - while( sMap[f]->r_con.snd[i].pQueue.queueSize() >= 1) - { - package = sMap[f]->r_con.snd[i].pQueue.dequeue(); - MPI_Wait( &(package->request), &status ); - package->completed = false; - package->served = 
false; - package->request = MPI_REQUEST_NULL; - sMap[f]->r_con.snd[i].recycleQueue.enqueue(package); - } - } - } - } -#endif -#endif -} - -void RegionGraph::workerTeardown() -{ - int numfabs = numTasks; - Package* package; - - regionGraphMinReset(); -} - -RegionGraph::~RegionGraph() -{ - delete[] okToReset; - for(int tg=0; tg -#include - -//////////////////////// class RegionQueue Declaration Start ///////////////////////////////////// -class RegionQueue -{ -private: - int* buffer; - int n; - int front; - int rear; - int max_size; - pthread_mutex_t queueLock; -public: - RegionQueue(); - RegionQueue(int numTasks); - ~RegionQueue(); - void addRegion(int r); - void addRegion(int r, bool canAvoidLock); - int removeRegion(); - int removeRegion(bool canAvoidLock); - int getFrontRegion(); - int getFrontRegion(bool canAvoidLock); - int queueSize(bool canAvoidLock); - int queueSize(); - void reallocate(); -}; -//////////////////////// class RegionQueue Declaration End ///////////////////////////////////// - -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/RegionQueue.cpp b/Src/AmrTask/rts_impls/runtime_common/RegionQueue.cpp deleted file mode 100755 index 894e116a120..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/RegionQueue.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include -#include - -//////////////////////// class RegionQueue Definition Start ///////////////////////////////////// -RegionQueue::RegionQueue(void) -{ - max_size= perilla::TASK_QUEUE_DEFAULT_SIZE; - buffer = new int[max_size]; - n = 0; - front = 0; - rear = 0; - queueLock=PTHREAD_MUTEX_INITIALIZER; -} - -RegionQueue::RegionQueue(int numTasks) -{ - buffer = new int[numTasks]; - n = 0; - max_size = numTasks; - front = 0; - rear = 0; - queueLock=PTHREAD_MUTEX_INITIALIZER; -} - -RegionQueue::~RegionQueue() -{ - delete[] buffer; -} - - -void RegionQueue::addRegion(int r) -{ - pthread_mutex_lock(&queueLock); - buffer[rear] = r; - rear = (rear+1)%max_size; - n++; - pthread_mutex_unlock(&queueLock); -} - -void RegionQueue::addRegion(int r, bool canAvoidLock) -{ - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - buffer[rear] = r; - rear = (rear+1)%max_size; - n++; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); -} - -int RegionQueue::removeRegion() -{ - int r; - pthread_mutex_lock(&queueLock); - r = buffer[front]; - front = (front+1)%max_size; - n--; - pthread_mutex_unlock(&queueLock); - return r; -} - -int RegionQueue::removeRegion(bool canAvoidLock) -{ - int r; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - r = buffer[front]; - front = (front+1)%max_size; - n--; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return r; -} - -int RegionQueue::getFrontRegion() -{ - return buffer[front]; -} - -int RegionQueue::getFrontRegion(bool canAvoidLock) -{ - int r; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - r = buffer[front]; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return r; -} - -int RegionQueue::queueSize() -{ - int size; - pthread_mutex_lock(&queueLock); - size = n; - pthread_mutex_unlock(&queueLock); - return size; -} - -int RegionQueue::queueSize(bool canAvoidLock) -{ - int size; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - size = n; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return size; -} -//////////////////////// class RegionQueue Definition End ///////////////////////////////////// diff --git a/Src/AmrTask/rts_impls/runtime_common/RemoteConnection.H b/Src/AmrTask/rts_impls/runtime_common/RemoteConnection.H deleted file mode 100755 index
23bc16e97f8..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/RemoteConnection.H +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef P_REMOTECONNECTION_H -#define P_REMOTECONNECTION_H - -#include -#include -#include - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include "PerillaMemCheck.H" -extern PerillaMemCheck memcheck; -#endif - - -class RemoteCommDescriptor -{ -public: - int ns, lns; // ! Source box in layout - int nd, lnd; //! Destination box in layout - int r_gid; - int r_grids; - int sz, pr; - Box sbx; // ! Sub-box for this copy - Box dbx; // ! Sub-box for this copy - PackageQueue pQueue; // !store incoming or outgoing messages, both fab and the runtime can access this queue - PackageQueue recycleQueue; //!store used messages, only fab can access this queue, no lock is required - int cnt; - RemoteCommDescriptor() : - ns(-1), lns(-1), - nd(-1), lnd(-1), - sz(0), pr(0), - cnt(0), r_gid(0), - r_grids(0) - { -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "RemoteCommDescriptor"); -#endif - } - - ~RemoteCommDescriptor(){ -#ifdef PERILLA_DEBUG - memcheck.remove(memcheck.genKey(this)); -#endif - } -}; - -class TransDescriptor -{ - int sz, pv, pr; -}; - -class RemoteConnection -{ -public: - int nsnd; - int nrcv; - bool remotePushReady; - bool remotePullDone; - int nrp, nsp; - pthread_mutex_t sndLock, rcvLock, ghostLock; - int firingRuleCnt; - RemoteCommDescriptor *snd; - RemoteCommDescriptor *rcv; - TransDescriptor *str; - TransDescriptor *rtr; - RemoteConnection() : - nsnd(0), - nrcv(0), - remotePushReady(false), - remotePullDone(false), - nrp(0), nsp(0), - firingRuleCnt(0), - snd(NULL), rcv(NULL), - str(NULL), rtr(NULL), - sndLock(PTHREAD_MUTEX_INITIALIZER), - rcvLock(PTHREAD_MUTEX_INITIALIZER), - ghostLock(PTHREAD_MUTEX_INITIALIZER) - { -// memcheck.add(memcheck.genKey(this), (void*)this, "RemoteCommDescriptor"); - } - - ~RemoteConnection() - { - if(snd) - delete[] snd; - if(rcv) - delete[] rcv; -// memcheck.remove(memcheck.genKey(this)); - } -}; - - -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/WorkerThread.H b/Src/AmrTask/rts_impls/runtime_common/WorkerThread.H deleted file mode 100755 index 7c20c796d4c..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/WorkerThread.H +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef P_WORKERTHREAD_H -#define P_WORKERTHREAD_H -#include -#include -#include - -namespace perilla{ - - class WorkerThread - { - //static void* team_shared_memory[perilla::NUM_THREAD_TEAMS]; - int tid; - public: - static void init(); - static Barrier* globalBarrier; - static Barrier* localBarriers[perilla::NUM_THREAD_TEAMS]; - static Barrier* localBarriers1[perilla::NUM_THREAD_TEAMS]; - static int perilla_tid(); - static int perilla_wtid(); - static int perilla_wid(); - static int perilla_nWorkerThreads(); - static int perilla_nWorkers(); - static int perilla_nTeamThreads(); - static bool perilla_isMasterWorkerThread(); - static bool perilla_isMasterThread(); - static bool perilla_isCommunicationThread(); - static bool isMyRegion(int workerID, int regionID); - static void setTeamSharedMemory(void* dummy, int tid, int tg); - static void* getTeamSharedMemory(int tg); - static void syncWorkers(); - static void syncWorkers(int tid); - static void syncThreads(); - static void syncComputeThreads(); - static void syncAllComputeThreads(); - static void syncTeamThreads(); - static void syncAllThreads(); - static void syncWorkerThreads(); - static void syncWorkerThreads(int numthreads); - static void syncComputeWorkerThreads(); - static void 
syncComputeWorkerThreads(int numthreads); - }; - - static int tid(){return WorkerThread::perilla_tid();} - static int wtid(){return WorkerThread::perilla_wtid();} - static int wid(){return WorkerThread::perilla_wid();} - static int nWorkerThreads(){return WorkerThread::perilla_nWorkerThreads();} - static int nWorkers(){return WorkerThread::perilla_nWorkers();} - static int nThreads(){return perilla::NUM_THREAD_TEAMS*perilla::NUM_THREADS_PER_TEAM;} - static bool isMasterWorkerThread(){return WorkerThread::perilla_isMasterWorkerThread();} - static bool isMasterThread(){return WorkerThread::perilla_isMasterThread();} - static bool isCommunicationThread(){return WorkerThread::perilla_isCommunicationThread();} - static bool isMyRegion(int workerID, int regionID){return WorkerThread::isMyRegion(workerID, regionID);} - //static void setTeamSharedMemory(void* dummy, int tid, int tg){WorkerThread::setTeamSharedMemory(dummy, tid, tg);} - //static void* getTeamSharedMemory(int tg){WorkerThread::getTeamSharedMemory(tg);} - static void syncWorkers(){WorkerThread::syncWorkers();} - static void syncWorkers(int tid){WorkerThread::syncWorkers(tid);} - static void syncThreads(){WorkerThread::syncThreads();} - static void syncComputeThreads(){WorkerThread::syncComputeThreads();} - static void syncAllComputeThreads(){WorkerThread::syncAllComputeThreads();} - static void syncComputeWorkerThreads(){WorkerThread::syncComputeWorkerThreads();} - static void syncComputeWorkerThreads(int numthreads){WorkerThread::syncComputeWorkerThreads(numthreads);} - static void syncWorkerThreads(){WorkerThread::syncWorkerThreads();} - static void syncWorkerThreads(int numthreads){WorkerThread::syncWorkerThreads(numthreads);} - static void syncAllThreads() - { - WorkerThread::syncAllThreads(); - } - static void syncAllWorkerThreads() - { - WorkerThread::syncWorkerThreads(); - WorkerThread::syncWorkers(); - } -}//end namespace - -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/WorkerThread.cpp b/Src/AmrTask/rts_impls/runtime_common/WorkerThread.cpp deleted file mode 100755 index db7aa169441..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/WorkerThread.cpp +++ /dev/null @@ -1,141 +0,0 @@ -#include -#include -#include -#include - -using namespace perilla; -//namespace perilla -//{ - //void* WorkerThread::team_shared_memory[perilla::NUM_THREAD_TEAMS]; - Barrier* WorkerThread::globalBarrier; - Barrier* WorkerThread::localBarriers[perilla::NUM_THREAD_TEAMS]; - Barrier* WorkerThread::localBarriers1[perilla::NUM_THREAD_TEAMS]; - - void WorkerThread::init(){ - WorkerThread::globalBarrier= new Barrier(perilla::NUM_THREAD_TEAMS); - for(int i=0; i<perilla::NUM_THREAD_TEAMS; i++){ - localBarriers[i]= new Barrier(perilla::NUM_THREADS_PER_TEAM); - localBarriers1[i]= new Barrier(perilla::NUM_THREADS_PER_TEAM-1); - } - } - - void WorkerThread::syncWorkers(){ - WorkerThread::globalBarrier->sync(perilla::NUM_THREAD_TEAMS); - } - - void WorkerThread::syncWorkers(int tid){ - if(perilla_wid()==tid) WorkerThread::globalBarrier->sync(perilla::NUM_THREAD_TEAMS); - } - - void WorkerThread::syncTeamThreads(){ - WorkerThread::localBarriers[perilla_wid()]->sync(perilla::NUM_THREADS_PER_TEAM); - } - - void WorkerThread::syncWorkerThreads(){ - WorkerThread::localBarriers1[perilla_wid()]->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - void WorkerThread::syncWorkerThreads(int numthreads){ - assert(numthreads== perilla::NUM_THREADS_PER_TEAM-1); - WorkerThread::localBarriers1[perilla_wid()]->sync(numthreads); - } - - -#if defined(USE_PERILLA_OMP) || defined(USE_PERILLA_ON_DEMAND) - void WorkerThread::syncAllThreads(){ - #pragma omp barrier - } -#else - void WorkerThread::syncAllThreads(){ - syncTeamThreads(); - syncWorkers(); - } -#endif
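// The sync(n) calls above and below rely on a counting barrier. A hedged, self-contained
// sketch of that pattern, assuming pthreads; BarrierSketch is illustrative, not the
// Perilla Barrier class itself.
#include <pthread.h>

class BarrierSketch {
    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    int count = 0, generation = 0;
public:
    // Block until n threads have arrived, then release them all; the generation
    // counter lets the barrier be reused immediately for the next sync round.
    void sync(int n) {
        pthread_mutex_lock(&m);
        int gen = generation;
        if (++count == n) { count = 0; ++generation; pthread_cond_broadcast(&cv); }
        else while (gen == generation) pthread_cond_wait(&cv, &m);
        pthread_mutex_unlock(&m);
    }
};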
 void WorkerThread::syncAllComputeThreads(){ - syncComputeWorkerThreads(); - syncWorkers(); - } - - void WorkerThread::syncThreads(){ - syncWorkerThreads(); - syncWorkers(); - } - - void WorkerThread::syncComputeWorkerThreads(){ - WorkerThread::localBarriers1[perilla_wid()]->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - void WorkerThread::syncComputeWorkerThreads(int numthreads){ - assert(numthreads== perilla::NUM_THREADS_PER_TEAM-1); - WorkerThread::localBarriers1[perilla_wid()]->sync(numthreads); - } - -#if defined(USE_PERILLA_OMP) || defined(USE_PERILLA_ON_DEMAND) - int WorkerThread::perilla_tid(){ - return omp_get_thread_num(); - } -#else - int WorkerThread::perilla_tid(){ - return Perilla::tid(); - } -#endif - - int WorkerThread::perilla_nTeamThreads(){ - return perilla::NUM_THREADS_PER_TEAM; - } - - int WorkerThread::perilla_nWorkerThreads(){ - return perilla::NUM_THREADS_PER_TEAM-1; - } - - int WorkerThread::perilla_nWorkers(){ - return perilla::NUM_THREAD_TEAMS; - } - - int WorkerThread::perilla_wtid() - { - int tid= perilla_tid(); - return (tid % perilla::NUM_THREADS_PER_TEAM) -1; - } - - int WorkerThread::perilla_wid() - { - int tid= perilla_tid(); - return tid / perilla::NUM_THREADS_PER_TEAM; - } - - bool WorkerThread::perilla_isMasterWorkerThread() - { - int tid= perilla_tid(); - if((tid % perilla::NUM_THREADS_PER_TEAM)==1) - return true; - else - return false; - } - - bool WorkerThread::perilla_isMasterThread(){ //pick the first one among master worker threads - return perilla_tid()==1; - } - - bool WorkerThread::perilla_isCommunicationThread() - { - int tid= perilla_tid(); - return (tid % perilla::NUM_THREADS_PER_TEAM)==0 ; - } - - bool WorkerThread::isMyRegion(int workerID, int regionID) - { - return ((regionID) % perilla::NUM_THREAD_TEAMS)==workerID; - } - -#if 0 - void WorkerThread::setTeamSharedMemory(void* dummy, int tid, int tg) - { - if((tid % perilla::NUM_THREADS_PER_TEAM)==1) - team_shared_memory[tg] = dummy; - } - - void* WorkerThread::getTeamSharedMemory(int tg) - { - return team_shared_memory[tg]; - } -#endif -//}//end namespace diff --git a/Src/AmrTask/rts_impls/runtime_common/mylock.h b/Src/AmrTask/rts_impls/runtime_common/mylock.h deleted file mode 100644 index bddb8ed6970..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/mylock.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef MYLOCK -#define MYLOCK - -#include <pthread.h> - -class MyLock -{ - private: - pthread_mutex_t _lock; - - public: - MyLock(){ - pthread_mutex_init(&_lock, NULL); - } - ~MyLock(){ - pthread_mutex_destroy(&_lock); - } - void lock() - { - pthread_mutex_lock(&_lock); - } - void unlock() - { - pthread_mutex_unlock(&_lock); - } -}; -#endif diff --git a/Src/AmrTask/rts_impls/runtime_common/perilla.mak b/Src/AmrTask/rts_impls/runtime_common/perilla.mak deleted file mode 100755 index 073260631ff..00000000000 --- a/Src/AmrTask/rts_impls/runtime_common/perilla.mak +++ /dev/null @@ -1,25 +0,0 @@ -CEXE_sources += Barrier.cpp -CEXE_sources += RGIter.cpp -CEXE_sources += RegionQueue.cpp -CEXE_sources += RegionGraph.cpp -CEXE_sources += WorkerThread.cpp -CEXE_sources += AsyncMultiFabUtil.cpp -CEXE_sources += PerillaMemCheck.cpp -CEXE_sources += Perilla_common.cpp - - -CEXE_headers += Barrier.H -CEXE_headers += Config.H -CEXE_headers += LocalConnection.H -CEXE_headers += PackageQueue.H -CEXE_headers += RegionGraph.H -CEXE_headers += RegionQueue.H -CEXE_headers += RegionGraph.H -CEXE_headers += RemoteConnection.H -CEXE_headers += WorkerThread.H -CEXE_headers += AsyncMultiFabUtil.H -CEXE_headers += PerillaMemCheck.H -CEXE_headers += Perilla.H - - -
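// MyLock above exposes bare lock()/unlock(); a small RAII guard sketch that pairs them
// automatically, assuming mylock.h is included; ScopedLock is hypothetical and was not
// part of the deleted header.
class ScopedLock {
    MyLock& l;
public:
    explicit ScopedLock(MyLock& lk) : l(lk) { l.lock(); }
    ~ScopedLock() { l.unlock(); }                 // released on every exit path
    ScopedLock(const ScopedLock&) = delete;
    ScopedLock& operator=(const ScopedLock&) = delete;
};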
diff --git a/Src/AmrTask/rts_impls/upcxx/Make.package b/Src/AmrTask/rts_impls/upcxx/Make.package deleted file mode 100644 index 6490fb8ef78..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/Make.package +++ /dev/null @@ -1,13 +0,0 @@ -PERILLA_LIB=EXE - -COMMON_DIR=$(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common - -C$(PERILLA_LIB)_sources += PackageQueue.cpp Perilla.cpp PerillaRts.cpp - -C$(PERILLA_LIB)_headers += $(COMMON_DIR)/Barrier.H Config.H $(COMMON_DIR)/LocalConnection.H PackageQueue.H $(COMMON_DIR)/RegionGraph.H $(COMMON_DIR)/RGIter.H $(COMMON_DIR)/RegionQueue.H $(COMMON_DIR)/RemoteConnection.H $(COMMON_DIR)/WorkerThread.H $(COMMON_DIR)/AsyncMultiFabUtil.H PerillaRts.H - -VPATH_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common -INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/runtime_common -VPATH_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/upcxx -INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/AmrTask/rts_impls/upcxx - diff --git a/Src/AmrTask/rts_impls/upcxx/PackageQueue.H b/Src/AmrTask/rts_impls/upcxx/PackageQueue.H deleted file mode 100755 index 9bcadc11d9a..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/PackageQueue.H +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef P_PACKAGEQUEUE_H -#define P_PACKAGEQUEUE_H - -#include -#include -#include -typedef upcxx::future<> CommRequest; - -class Package -{ -private: - int source, destination; -public: - upcxx::global_ptr<double> databuf; - pthread_mutex_t packageLock; - volatile int bufSize; - volatile bool completed; //message transfer is done - volatile bool served; //message transfer request has been served but may have not completed - CommRequest *request; //!for inter-process communication - int tag; - Package(); - ~Package(); - Package(int size); - Package(int src, int dest); - Package(int src, int dest, int size); - void setPackageSource(int src); - void setPackageDestination(int dest); - void completeRequest(void); - void completeRequest(bool canAvoidLock); - bool checkRequest(void); - void generatePackage(int size); -}; - -class PackageQueue -{ -private: - Package *buffer[perilla::MSG_QUEUE_DEFAULT_MAXSIZE]; - volatile int n; - volatile int front; - volatile int rear; - volatile int prear; - int max_size; -public: - pthread_mutex_t queueLock; - PackageQueue(); - ~PackageQueue(); - int queueSize(void); - int queueSize(bool canAvoidLock); - void enqueue(Package* package); - void enqueue(Package* package, bool canAvoidLock); - Package* dequeue(void); - Package* dequeue(bool canAvoidLock); - Package* getRear(void); - Package* getRear(bool canAvoidLock); - Package* getFront(void); - Package* getFront(bool canAvoidLock); - void emptyQueue(bool canAvoidLock); -}; -#endif diff --git a/Src/AmrTask/rts_impls/upcxx/PackageQueue.cpp b/Src/AmrTask/rts_impls/upcxx/PackageQueue.cpp deleted file mode 100755 index ac34408b0d5..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/PackageQueue.cpp +++ /dev/null @@ -1,274 +0,0 @@ -#include -#include - -#ifdef PERILLA_DEBUG -#include -using namespace perilla; -extern PerillaMemCheck memcheck; -#endif - -Package::Package() -{ - databuf = nullptr; - bufSize = 0; - source = 0; - destination = 0; - completed = false; - served = false; - request = 0; - tag=0; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::~Package() -{ - if(databuf != nullptr) - if(databuf.is_local()) - { - upcxx::delete_array(databuf); - } -#ifdef PERILLA_DEBUG - memcheck.remove(memcheck.genKey(this)); -#endif -}
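// Buffer life cycle implied by Package above: allocate from the UPC++ shared heap and
// free only if the calling rank owns the memory. A standalone sketch, assuming a
// UPC++ 2019.x installation; it is not the Perilla class itself.
#include <upcxx/upcxx.hpp>

int main() {
    upcxx::init();
    upcxx::global_ptr<double> buf = upcxx::new_array<double>(16); // shared-segment allocation
    if (buf.is_local())                                           // mirrors the ~Package() guard
        upcxx::delete_array(buf);
    upcxx::finalize();
    return 0;
}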
-Package::Package(int size) -{ - databuf = (upcxx::global_ptr<double>)upcxx::new_array<double>(size); - bufSize = size; - source = 0; - destination = 0; - completed = false; - served = false; - request = 0; - tag=0; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::Package(int src, int dest) -{ - databuf= nullptr; - bufSize = 0; - source = src; - destination = dest; - completed = false; - served = false; - request = 0; - tag=0; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -Package::Package(int src, int dest, int size) -{ - databuf = (upcxx::global_ptr<double>)upcxx::new_array<double>(size); - bufSize = size; - source = src; - destination = dest; - completed = false; - served = false; - request = 0; - tag=0; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -void Package::setPackageSource(int src) -{ - source = src; -} - -void Package::setPackageDestination(int dest) -{ - destination = dest; -} - -void Package::completeRequest(void) -{ - pthread_mutex_lock(&packageLock); - completed = true; - pthread_mutex_unlock(&packageLock); -} - -void Package::completeRequest(bool canAvoidLock) -{ - if(!canAvoidLock)pthread_mutex_lock(&packageLock); - completed = true; - if(!canAvoidLock)pthread_mutex_unlock(&packageLock); -} - -bool Package::checkRequest(void) -{ - return completed; -} - -void Package::generatePackage(int size) -{ - databuf = (upcxx::global_ptr<double>)upcxx::new_array<double>(size); - bufSize = size; - source = 0; - destination = 0; - completed = false; - served = false; - request = 0; - tag=0; - packageLock= PTHREAD_MUTEX_INITIALIZER; -#ifdef PERILLA_DEBUG - memcheck.add(memcheck.genKey(this), (void*)this, "Package"); -#endif -} - -PackageQueue::PackageQueue() -{ - n = 0; - front = 0; - rear = 0; - prear = -1; - max_size=perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - queueLock= PTHREAD_MUTEX_INITIALIZER; -} - -int PackageQueue::queueSize(void) -{ - int size; - pthread_mutex_lock(&queueLock); - size = n; - pthread_mutex_unlock(&queueLock); - return size; -} - -int PackageQueue::queueSize(bool canAvoidLock) -{ - int size; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - size = n; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return size; -} - -void PackageQueue::enqueue(Package* package) -{ - pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n==perilla::MSG_QUEUE_DEFAULT_MAXSIZE){ - printf("Failed to Enqueue: Queue Overflow\n"); - exit(0); - } -#endif - buffer[rear] = package; - prear = rear; - rear = (rear+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n++; - pthread_mutex_unlock(&queueLock); -} - -void PackageQueue::enqueue(Package* package, bool canAvoidLock) -{ - if(!canAvoidLock)pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n==perilla::MSG_QUEUE_DEFAULT_MAXSIZE){ - printf("Failed to Enqueue: Queue Overflow\n"); - exit(0); - } -#endif - buffer[rear] = package; - prear = rear; - rear = (rear+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n++; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); -} - -Package* PackageQueue::dequeue(void) -{ - Package* package = 0; - pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n<0){ - printf("Failed to Dequeue: Queue Empty\n"); - exit(0); - } -#endif - package = buffer[front]; - front = (front+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n--; - pthread_mutex_unlock(&queueLock); - return package; -}
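// The canAvoidLock flag in the queue methods lets a caller that already holds the
// public queueLock skip the internal locking. A hedged usage sketch, assuming
// PackageQueue.H above is included; drain() is a hypothetical helper:
#include <pthread.h>

void drain(PackageQueue& q) {
    pthread_mutex_lock(&q.queueLock);   // take the lock once for the whole batch
    while (q.queueSize(true) > 0)       // true == caller already holds the lock
        delete q.dequeue(true);
    pthread_mutex_unlock(&q.queueLock);
}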
-Package* PackageQueue::dequeue(bool canAvoidLock) -{ - Package* package = 0; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); -#ifdef PERILLA_DEBUG - if(n<0){ - printf("Failed to Dequeue: Queue Empty\n"); - exit(0); - } -#endif - package = buffer[front]; - front = (front+1)%perilla::MSG_QUEUE_DEFAULT_MAXSIZE; - n--; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getRear(void) -{ - Package* package = 0; - pthread_mutex_lock(&queueLock); - if(n) package = buffer[prear]; - pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getRear(bool canAvoidLock) -{ - Package* package = 0; - if(!canAvoidLock)pthread_mutex_lock(&queueLock); - if(n) package = buffer[prear]; - if(!canAvoidLock)pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getFront(void) -{ - Package* package = 0; - pthread_mutex_lock(&queueLock); - if(n) package = buffer[front]; - pthread_mutex_unlock(&queueLock); - return package; -} - -Package* PackageQueue::getFront(bool canAvoidLock) -{ - Package* package = 0; - if(!canAvoidLock) pthread_mutex_lock(&queueLock); - if(n) package = buffer[front]; - if(!canAvoidLock) pthread_mutex_unlock(&queueLock); - return package; -} - -void PackageQueue::emptyQueue(bool canAvoidLock){ - if(!canAvoidLock) pthread_mutex_lock(&queueLock); - while(n){ - Package* p= dequeue(true); - delete p; - } - if(!canAvoidLock) pthread_mutex_unlock(&queueLock); -} - -PackageQueue::~PackageQueue() -{ - emptyQueue(true); -} - diff --git a/Src/AmrTask/rts_impls/upcxx/Perilla.H b/Src/AmrTask/rts_impls/upcxx/Perilla.H deleted file mode 100755 index f4f17c59b51..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/Perilla.H +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef _PERILLA_ -#define _PERILLA_ - -#include -#include -#include -#include -#include - -#include - -//#define USE_PERILLA_PTHREADS - -using namespace std; - -namespace amrex{ - class Perilla - { - static int tagGen(int src, int dest, int channelID, int nFabs, int nChannels); - - public: - static int uTags; - static bool genTags; - static int max_step; - - static std::map<int, std::map<int,int>> pTagCnt; - static std::map<int, std::map<int, std::map<int, std::map<int, std::map<int,int>>>>> tagMap; - static std::map<int, std::map<int, std::map<int, std::map<int,int>>>> myTagMap; - static void clearTagMap(); - static void clearMyTagMap(); - static void communicateTags(); - static void registerId(int tid); - static int tid(); - static volatile int numTeamsFinished; - static volatile int updateMetadata_request; - static volatile int updateMetadata_noticed; - static volatile int updateMetadata_done; - static Barrier * globalBarrier; - static void syncProcesses(); - static void multifabBuildFabCon(RegionGraph* graph, const MultiFab& mf, const Periodicity& period); - static void serviceLocalRequests(RegionGraph *graph, int tg); - static void serviceRemoteRequests(RegionGraph *graph, int graphID, int nGraphs); - static void serviceRemoteRequests(RegionGraph *graph); - //static void serviceSingleGraphComm(RegionGraph* graph, int tid); - //static void serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, bool cpyAcross, int tid); - static void serviceMultipleGraphCommDynamic(std::vector<RegionGraph*> graphArray, bool cpyAcross, int tid); - static void flattenGraphHierarchy(std::vector<std::vector<RegionGraph*> >graphArray, std::vector<RegionGraph*> &flattenedGraphArray); - //static void serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, int tid); - static void fillBoundaryPush(RegionGraph* graph, MultiFab* mf, int f); - static void fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f, bool singleT); - - void multifabExtractCopyAssoc(void* threadInfo); - static void multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& dmf, const MultiFab& smf, int nc, int ng, int ngSrc, const Periodicity& period); - static void multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& dmf, const MultiFab& smf, const Periodicity& period); - static void multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void multifabCopyPushAsync(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, bool singleT); - static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* dmf, MultiFab* smf, int f, bool singleT); - static void serviceLocalGridCopyRequests(std::vector<RegionGraph*> graphArray, int g, int tg); - static void serviceRemoteGridCopyRequests(std::vector<RegionGraph*> graphArray, int g, int nGraph, int tg); - static void resetRemoteGridCopyRequests(std::vector<RegionGraph*> graphArray, int g, int nGraph, int tg); - - - - static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void multifabCopyPush(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, bool singleT); - - static void multifabCopyPush_1Team(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* dmf, amrex::MultiFab* smf, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT); - static void fillBoundaryPull_1Team(RegionGraph *graph, amrex::MultiFab& mf, int f); - - static void fillBoundaryPush(amrex::RGIter& rgi, amrex::MultiFab& mf); - static void fillBoundaryPull(amrex::RGIter& rgi, amrex::MultiFab& mf, bool singleT); - static void fillBoundaryPush(amrex::RGIter& rgi, RegionGraph *graph, amrex::MultiFab& mf); - static void fillBoundaryPull(amrex::RGIter& rgi, RegionGraph *graph, amrex::MultiFab& mf, bool singleT); - - }; // class Perilla -} -#endif
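// Sketch of the nested tag-map lookup reconstructed above, assuming the innermost
// value is an integer message tag addressed as tagMap[pr][graphID][nd][ns][sz];
// TagMap and lookupTag are illustrative names, not the Perilla API.
#include <map>

using TagMap = std::map<int, std::map<int, std::map<int, std::map<int, std::map<int,int>>>>>;

inline int& lookupTag(TagMap& tm, int pr, int gid, int nd, int ns, int sz)
{
    return tm[pr][gid][nd][ns][sz];   // operator[] default-creates the missing levels
}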
diff --git a/Src/AmrTask/rts_impls/upcxx/Perilla.cpp b/Src/AmrTask/rts_impls/upcxx/Perilla.cpp deleted file mode 100755 index 186bbd0a8f9..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/Perilla.cpp +++ /dev/null @@ -1,3036 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; -using namespace amrex; -using namespace perilla; -using namespace upcxx; - - -struct sMsgMap_t{ - std::map< int, std::map< int, std::list< Package* > > > map; - volatile int size=0; - pthread_mutex_t lock= PTHREAD_MUTEX_INITIALIZER; -}sMsgMap; - -struct rMsgMap_t{ - std::map< int, std::map< int, std::list< Package* > > > map; - volatile int size=0; - pthread_mutex_t lock= PTHREAD_MUTEX_INITIALIZER; -}rMsgMap; - -struct getReq_t{ - int src; - int tag; - upcxx::global_ptr<double> sbuf; - int size; - getReq_t(int _src, int _tag, upcxx::global_ptr<double> _sbuf, int _size):src(_src), tag(_tag), sbuf(_sbuf), size(_size){} -};
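// A hedged sketch of the pull protocol implemented below: the sender publishes a
// global_ptr to its payload via rpc, and the receiver issues a one-sided rget into
// its own buffer; pull_payload is illustrative, assuming UPC++.
#include <upcxx/upcxx.hpp>

upcxx::future<> pull_payload(upcxx::global_ptr<double> sbuf, double* localbuf, int n)
{
    // copy n doubles from the sender's shared segment; the runtime below instead keeps
    // the returned future in Package::request and polls it from the communication thread
    return upcxx::rget(sbuf, localbuf, n);
}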
-struct pendingGetList_t{ - std::list< getReq_t* > _pendingGets; - pthread_mutex_t lock= PTHREAD_MUTEX_INITIALIZER; - void add(getReq_t* req){ - pthread_mutex_lock(&lock); - _pendingGets.push_back(req); - pthread_mutex_unlock(&lock); - } - void process(){ - if(_pendingGets.size()==0) return; - pthread_mutex_lock(&(rMsgMap.lock)); - pthread_mutex_lock(&lock); - std::list< getReq_t* >::iterator it= _pendingGets.begin(); - while(it != _pendingGets.end()){ - double* localbuf= NULL; - int src= (*it)->src; - int tag= (*it)->tag; - if(rMsgMap.map.find(src) != rMsgMap.map.end()){ - if(rMsgMap.map[src].find(tag) != rMsgMap.map[src].end()){ - if(rMsgMap.map[src][tag].size() >0){ - rMsgMap.map[src][tag].front()->tag= tag; - localbuf= (rMsgMap.map[src][tag].front()->databuf).local(); //(double*) (static_cast > (rMsgMap.map[src][tag].front()->databuf).local()); - *(rMsgMap.map[src][tag].front()->request)= upcxx::rget((*it)->sbuf, localbuf, (*it)->size); - rMsgMap.map[src][tag].pop_front(); - rMsgMap.size--; - std::list< getReq_t* >::iterator it1= it; - it++; - delete (*it1); - _pendingGets.erase(it1); - }else it++; - }else it++; - }else it++; - } - pthread_mutex_unlock(&lock); - pthread_mutex_unlock(&(rMsgMap.lock)); - } -} pendingGetList; - - -void Perilla::syncProcesses(){ - upcxx::barrier(); -} - - -void Perilla::serviceLocalRequests(RegionGraph* rg, int tg) -{ - int numfabs = rg->lMap.size(); - for(int f=0; f<numfabs; f++) - { - int fg = f % perilla::NUM_THREAD_TEAMS; - if(tg == fg) - { - bool anyReq=false; - for(int i=0; i<rg->lMap[f]->l_con.nscpy; i++) - if(rg->lMap[f]->l_con.scpy[i].pQueue.queueSize(true)>0){ - anyReq=true; - break; - } - if(anyReq) - { - pthread_mutex_lock(&(rg->lMap[f]->l_con.sLock)); - for(int i=0; i<rg->lMap[f]->l_con.nscpy; i++){ - if(rg->lMap[f]->l_con.scpy[i].pQueue.queueSize(true)>0) - { - Package *sPackage = rg->lMap[f]->l_con.scpy[i].pQueue.dequeue(true); - pthread_mutex_lock(&(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - int dPartner = rg->lMap[f]->l_con.scpy[i].dPartner; - Package *dPackage = rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - std::memcpy(dPackage->databuf.local(), sPackage->databuf.local(), dPackage->bufSize * sizeof(double)); - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dcpy[dPartner].pQueue.queueSize(true)==1) - rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(rg->lMap[rg->lMap[f]->l_con.scpy[i].nd]->l_con.dLock)); - rg->lMap[f]->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - } - } - pthread_mutex_unlock(&(rg->lMap[f]->l_con.sLock)); - } - }// if(tg==fg) - }// for(f<numfabs) -} - -void Perilla::serviceRemoteRequests(RegionGraph* rg, int graphID, int nGraphs) -{ - int numfabs = rg->rMap.size(); - - for(int f=0; f<numfabs; f++) - { - bool nextrReq = false; - //int lockSucceeded = pthread_mutex_trylock(&(rg->rMap[f]->r_con.rcvLock)); - //if(lockSucceeded != 0) - { - //if(pthread_mutex_trylock(&(rg->lMap[f]->r_con.rcvLock)) != 0) - { - for(int i=0; i<rg->lMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 0) //!no message has been received or all received messages have been claimed - nextrReq = true; - else - { - Package *rearPackage = rg->rMap[f]->r_con.rcv[i].pQueue.getRear(true);//!CHECK THIS POINT LATER - if(rearPackage->completed && rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) //!latest receive request has been completed - nextrReq = true; - else //!expected message is still on the way - nextrReq = false; - } - if(nextrReq) //!take a message from recycle pool and post a receive - { - pthread_mutex_lock(&(rg->rMap[f]->r_con.rcvLock)); - pthread_mutex_lock(&(rg->lMap[f]->r_con.rcvLock)); - int ns = rg->rMap[f]->r_con.rcv[i].ns; - int nd = rg->rMap[f]->r_con.rcv[i].nd; - int lnd = rg->rMap[f]->r_con.rcv[i].lnd; - int r_grids = rg->rMap[f]->r_con.rcv[i].r_grids; - //!create a package to keep track of receive requests - Package *rMetaPackage = rg->rMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - 
//!extract a package from the recycle pool at the destination NUMA node to buffer incoming data - Package *rPackage = rg->lMap[f]->r_con.rcv[i].recycleQueue.dequeue(true); - int tag = tagMap[rg->rMap[f]->r_con.rcv[i].pr][graphID][nd][ns][rg->rMap[f]->r_con.rcv[i].sz]; - - rPackage->request = new future<>; - rPackage->tag = tag; - rg->lMap[f]->r_con.rcv[i].pQueue.enqueue(rPackage,true); //!this is not done yet - rg->rMap[f]->r_con.rcv[i].pQueue.enqueue(rMetaPackage,true); //!this is not done yet - pthread_mutex_lock(&(rMsgMap.lock)); - rMsgMap.map[rg->rMap[f]->r_con.rcv[i].pr][tag].push_back(rPackage); - rMsgMap.size++; - pthread_mutex_unlock(&(rMsgMap.lock)); - pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - pthread_mutex_unlock(&(rg->rMap[f]->r_con.rcvLock)); - } - } - //pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - }// if(omp_test_lock) - //pthread_mutex_unlock(&(rg->rMap[f]->r_con.rcvLock)); - }// if(lockSucceeded) - }// for(fsMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) == 0) //then !no message has been issued or all send requests have been fulfilled - nextsReq = false; - else - nextsReq = true; - if(nextsReq) - { - Package *sMetaPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(!sMetaPackage->served) - { - Package *sPackage = rg->lMap[f]->r_con.snd[i].pQueue.getFront(true); - sMetaPackage->completed = false; - sMetaPackage->served = true; - int ns = rg->sMap[f]->r_con.snd[i].ns; - int nd = rg->sMap[f]->r_con.snd[i].nd; - int r_gid = rg->sMap[f]->r_con.snd[i].r_gid; - int r_grids = rg->sMap[f]->r_con.snd[i].r_grids; - int tag = Perilla::myTagMap[r_gid][nd][ns][rg->sMap[f]->r_con.snd[i].sz]; - int src= upcxx::rank_me(); - //register send request so that the receiver can send back confirmation upon pull completion - sPackage->completed = false; - pthread_mutex_lock(&(sMsgMap.lock)); - sMsgMap.map[rg->sMap[f]->r_con.snd[i].pr][tag].push_back(sPackage); - sMsgMap.size++; - pthread_mutex_unlock(&(sMsgMap.lock)); - int size= rg->sMap[f]->r_con.snd[i].sz; - upcxx::global_ptr sbuf= sPackage->databuf; //static_cast >((double*)sPackage->databuf); - int dst= rg->sMap[f]->r_con.snd[i].pr; - upcxx::rpc(dst, - [=](){ - //at destination rank, look up recv buffer and pull remote data and store data in the buffer - bool posted_recv=false; - double* localbuf= NULL; - pthread_mutex_lock(&(rMsgMap.lock)); - if(rMsgMap.map.find(src) != rMsgMap.map.end()){ - if(rMsgMap.map[src].find(tag) != rMsgMap.map[src].end()) - if(rMsgMap.map[src][tag].size() >0){ - posted_recv=true; - localbuf= (rMsgMap.map[src][tag].front()->databuf).local(); //(double*) static_cast > (rMsgMap.map[src][tag].front()->databuf).local(); - *(rMsgMap.map[src][tag].front()->request)= upcxx::rget(sbuf, localbuf, size); - rMsgMap.map[src][tag].pop_front(); - rMsgMap.size--; - } - } - pthread_mutex_unlock(&(rMsgMap.lock)); - //save pull request for later when recv buffer is posted - if(posted_recv==false){ - getReq_t *req= new getReq_t(src, tag, sbuf, size); - pendingGetList.add(req); - } - } - ); - } - } - } // for(irMap[f]->r_con.nrcv; i++) - { - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) > 0) //!all messages before rear have completed - { - //if(pthread_mutex_trylock(&(rg->lMap[f]->r_con.rcvLock)) != 0) // 0-Fail, otherwise-Succeed - { - Package *rearPackage = rg->lMap[f]->r_con.rcv[i].pQueue.getRear(true); - if(!rearPackage->completed) - { - bool flag = false; - int ret_flag; - if(rearPackage->request->ready()) - { - 
pthread_mutex_lock(&(rg->lMap[f]->r_con.rcvLock)); - int ns = rg->lMap[f]->r_con.rcv[i].ns; - int nd = rg->lMap[f]->r_con.rcv[i].nd; - int lnd = rg->lMap[f]->r_con.rcv[i].lnd; - int r_grids = rg->lMap[f]->r_con.rcv[i].r_grids; - int tag = rearPackage->tag; - //int tag = tagMap[ rg->lMap[f]->r_con.rcv[i].pr][graphID][nd][ns][ rg->lMap[f]->r_con.rcv[i].sz]; - int dst = upcxx::rank_me(); - int src= rg->lMap[f]->r_con.rcv[i].pr; - upcxx::rpc(src, - [=](){ - pthread_mutex_lock(&(sMsgMap.lock)); - sMsgMap.map[dst][tag].front()->completed=true; - sMsgMap.map[dst][tag].pop_front(); - sMsgMap.size--; - pthread_mutex_unlock(&(sMsgMap.lock)); - } - ); - - delete rearPackage->request; - rearPackage->completeRequest(); - rg->lMap[f]->r_con.rcv[i].pQueue.getRear()->completeRequest(); - if(rg->rMap[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) - rg->lMap[f]->r_con.firingRuleCnt++; - pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - } - } - //pthread_mutex_unlock(&(rg->lMap[f]->r_con.rcvLock)); - } // if(omp_test_lock) - } // if(queueSize > 0) - } // for(ilMap[f]->r_con.nsnd; i++) - { - if(rg->sMap[f]->r_con.snd[i].pQueue.queueSize(true) > 0) - { - Package *frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.getFront(true); - if(frontPackage->served) //!latest receive request has NOT been completed - { - bool flag = false; - int ret_flag; - if(frontPackage->request==0) - { - pthread_mutex_lock(&(rg->sMap[f]->r_con.sndLock)); - frontPackage = rg->sMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = 0; - frontPackage->tag = 0; - rg->sMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(rg->sMap[f]->r_con.sndLock)); - - pthread_mutex_lock(&(rg->lMap[f]->r_con.sndLock)); - frontPackage = rg->lMap[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = 0; - frontPackage->tag = 0; - rg->lMap[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(rg->lMap[f]->r_con.sndLock)); - } - } - } // if(queueSize > 0) - } // for(itotalFinishes < perilla::NUM_THREAD_TEAMS) - { - serviceLocalRequests(graph, tg); - if((np>1) & (tg==0)) - serviceRemoteRequests(graph); - } - else - { - if(tg==0) - { - while(graph->totalFinishes < perilla::NUM_THREAD_TEAMS) - { - } - //call parallel_barrier() ---???????? - ParallelDescriptor::Barrier("serviceSingleGraph-1"); - graph->graphTeardown(); - graph->workerTeardown(); - //call parallel_barrier() ------????????? 
- ParallelDescriptor::Barrier("serviceSingleGraph-2"); - } - break; - } - } // while(true) -} //serviceSingleGraphComm - -void Perilla::serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, bool cpyAcross, int tid) -{ - int tg = WorkerThread::perilla_wid(); - int np = ParallelDescriptor::NProcs(); - int graphFinishCnt = 0; - while(true) - { - for(int g=0; g 1) - if(tg==0) - { - serviceRemoteRequests(&graphArray[g],g,nGraphs); - //if(cpyAcross) - //serviceRemoteGridCopyRequests(graphArray,g,nGraphs,tg); - } - } - } - //!check if we have finished all the graph execution - bool noMoreWork = true; - for(int g=0; g graphArray, bool cpyAcross, int tid) -{ - int tg = WorkerThread::perilla_wid(); - int np = ParallelDescriptor::NProcs(); - int nGraphs = graphArray.size(); - - for(int g=0; g 1)//if(tg==0) - { - if(tg==0){ - serviceRemoteRequests(graphArray[g],g,nGraphs); - if(cpyAcross) - serviceRemoteGridCopyRequests(graphArray,g,nGraphs,tg); - } - } - } -} //serviceMultipleGraphCommDynamic -#endif - -#if 0 -void Perilla::serviceMultipleGraphComm(RegionGraph graphArray[], int nGraphs, int tid) -{ - serviceMultipleGraphComm(graphArray,nGraphs,false,tid); -} // serviceMultipleGraphComm -#endif - -void Perilla::fillBoundaryPush(RegionGraph* graph, MultiFab* mf, int f) -{ - - int nComp = mf->nComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - //if(graph->graphID == 1 && f == 1) - //std::cout << "fillBPush for gID 1 f 1 ntid "<< ntid <lMap[f]->l_con.sLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(perilla::PACKING_FINEGRAIN) - {} - else - { - for(int i=0; ilMap[f]->l_con.nscpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - - //if(graph->graphID == 1 && graph->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout << "fillBPush for gID 1 nd 1 pQenQ f " << f << " i " << i <lMap[f]->l_con.scpy[i].recycleQueue.getFront(true); - mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->l_con.scpy[i].sbx,0,nComp,sPackage->databuf.local()); - - //for(int d=0; dbufSize; d++) - //if(sPackage->databuf[d] == 0) - //{ - //std::cout<< "in fbPush Sending 0 from f "<< f <databuf[d] != 0); - //} - //if(graph->lMap[f]->l_con.scpy[i].sbx.smallEnd() == graph->lMap[f]->l_con.scpy[i].sbx.bigEnd()) - //if(graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(0)==7 && graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(1)==7 && graph->lMap[f]->l_con.scpy[i].sbx.smallEnd(2)==4) - // std::cout<< "Corner Push for f "<< f << " data0 " <databuf[0]<< " size " <bufSize << " se "<< graph->lMap[f]->l_con.scpy[i].sbx.smallEnd() <worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - //if(graph->graphID == 1 && f == 1) - //std::cout << "fillBPush for gID 1 f 1 pQ enQ" <lMap[f]->l_con.nscpy; i++) - { - //if(graph->graphID == 1 && graph->lMap[f]->l_con.scpy[i].nd == 1) - //std::cout << "fillBPush for gID 1 nd 1 pQ enQ from f "<< f <lMap[f]->l_con.scpy[i].pQueue.enqueue( graph->lMap[f]->l_con.scpy[i].recycleQueue.dequeue(true),true ); - } - pthread_mutex_unlock(&(graph->lMap[f]->l_con.sLock)); - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - } // if(LAZY_PUSH) - else - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - pthread_mutex_lock(&(graph->lMap[f]->r_con.sndLock)); - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; 
ilMap[f]->r_con.nsnd; i++) - if((i%(perilla::NUM_THREADS_PER_TEAM-1))==ntid) - { - //std::cout << "RQS " << graph->lMap[f]->r_con.snd[i].recycleQueue.queueSize() << std::endl; - - Package *sndPackage = graph->lMap[f]->r_con.snd[i].recycleQueue.dequeue(true); - mf->m_fabs_v[f]->copyToMem(graph->lMap[f]->r_con.snd[i].sbx,0,nComp,sndPackage->databuf.local()); - graph->lMap[f]->r_con.snd[i].pQueue.enqueue( sndPackage,true ); - //!the local message handler will detect the change and notify the remote message handler =>read access - //!the remote message handler first modifies the front item of this queue, then it push this item back to the message pool - } - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - if(ntid==0) - { - pthread_mutex_unlock(&(graph->lMap[f]->r_con.sndLock)); - pthread_mutex_lock(&(graph->sMap[f]->r_con.sndLock)); - for(int i=0; ilMap[f]->r_con.nsnd; i++) - graph->sMap[f]->r_con.snd[i].pQueue.enqueue( graph->sMap[f]->r_con.snd[i].recycleQueue.dequeue(true),true ); - pthread_mutex_unlock(&(graph->sMap[f]->r_con.sndLock)); - } - -} // fillBoundaryPush - -void Perilla::fillBoundaryPull(RegionGraph* graph, MultiFab* mf, int f, bool singleT) -{ - - int nComp = mf->nComp(); - int tg= WorkerThread::perilla_wid(); - int ntid = WorkerThread::perilla_wtid(); - - if(ntid==0) - pthread_mutex_lock(&(graph->lMap[f]->l_con.dLock)); - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(perilla::LAZY_PUSH) - { } - else - { - if(perilla::UNPACKING_FINEGRAIN) - {} - else - { - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *dPackage = graph->lMap[f]->l_con.dcpy[i].pQueue.getFront(true); - mf->m_fabs_v[f]->copyFromMem(graph->lMap[f]->l_con.dcpy[i].dbx,0,nComp,dPackage->databuf.local()); - } - } // if(UNPACKING_FINEGRAIN) - else - } // if(LAZY_PUSH) - else - - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - if(ntid==0) - { - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - graph->lMap[f]->l_con.dcpy[i].recycleQueue.enqueue( graph->lMap[f]->l_con.dcpy[i].pQueue.dequeue(true),true ); - - graph->lMap[f]->l_con.firingRuleCnt = graph->lMap[f]->l_con.firingRuleCnt - graph->lMap[f]->l_con.ndcpy; - - graph->lMap[f]->l_con.scpyCnt = 0; - for(int i=0; ilMap[f]->l_con.ndcpy; i++) - if(graph->lMap[f]->l_con.dcpy[i].pQueue.queueSize(true) >= 1) - graph->lMap[f]->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(graph->lMap[f]->l_con.dLock)); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - int np = ParallelDescriptor::NProcs(); - if (np==1) return; - - if(ntid==0) - { - pthread_mutex_lock(&(graph->rMap[f]->r_con.rcvLock)); - pthread_mutex_lock(&(graph->lMap[f]->r_con.rcvLock)); - } - if(!singleT) - graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads - - for(int i=0; ilMap[f]->r_con.nrcv; i++) - if( (i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - { - Package *rcvMetaPackage = graph->rMap[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = 0; - graph->rMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - Package *rcvPackage = graph->lMap[f]->r_con.rcv[i].pQueue.dequeue(true); - 
-            mf->m_fabs_v[f]->copyFromMem(graph->lMap[f]->r_con.rcv[i].dbx,0,nComp,rcvPackage->databuf.local());
-            rcvPackage->completed = false;
-            graph->lMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true);
-        }
-    if(!singleT)
-        graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); // Barrier to synchronize team threads
-
-    if(ntid==0)
-    {
-        graph->lMap[f]->r_con.firingRuleCnt = graph->lMap[f]->r_con.firingRuleCnt - graph->lMap[f]->r_con.nrcv;
-        for(int i=0; i<graph->lMap[f]->r_con.nrcv; i++)
-            if(graph->lMap[f]->r_con.rcv[i].pQueue.queueSize(true) >= 1)
-                if(graph->lMap[f]->r_con.rcv[i].pQueue.getFront(true)->checkRequest())
-                    graph->lMap[f]->r_con.firingRuleCnt++;
-        pthread_mutex_unlock(&(graph->lMap[f]->r_con.rcvLock));
-        pthread_mutex_unlock(&(graph->rMap[f]->r_con.rcvLock));
-    }
-
-} // fillBoundaryPull
-
-void Perilla::fillBoundaryPull(amrex::RGIter& rgi, RegionGraph* rg, amrex::MultiFab& mf, bool singleT)
-{
-    if(rgi.currentItr != 1)
-        return;
-
-    int f = rgi.currentRegion;
-    fillBoundaryPull(rg, &mf, f, singleT);
-}
-
-void Perilla::fillBoundaryPull(amrex::RGIter& rgi, amrex::MultiFab& mf, bool singleT)
-{
-    if(rgi.currentItr != 1)
-        return;
-
-    int f = rgi.currentRegion;
-    fillBoundaryPull(rgi.itrGraph, &mf, f, singleT);
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-Array<const FabArrayBase::CopyComTagsContainer*> send_cctc;
-Array<int> send_pr;
-Array<const FabArrayBase::CopyComTagsContainer*> recv_cctc;
-Array<int> recv_pr;
-
-
-void Perilla::multifabExtractCopyAssoc(RegionGraph* gDst, RegionGraph* gSrc, const MultiFab& mfDst, const MultiFab& mfSrc, int nc, int ng, int ngSrc, const Periodicity& period)
-{
-#if 1
-    int myProc = ParallelDescriptor::MyProc();
-    int np = ParallelDescriptor::NProcs();
-    try{
-        if(true)//if(!(*mfSrc == *mfDst))
-        {
-#ifdef USE_PERILLA_PTHREADS
-            //    if(perilla::isMasterThread())
-#endif
-            {
-                if(ng > mfDst.nGrow()) cout <<"MULTIFAB_COPY_C: ng > mfDst.nGrow not supported in parallel copy"<< endl;
-                if(ngSrc > mfSrc.nGrow()) cout <<"MULTIFAB_COPY_C: ngSrc > mfSrc.nGrow"<< endl;
-                if(ngSrc > 0)
-                {
-                    // To be implemented
-                    //do i = 1, nboxes(msrc%la)
-                    //  call push_back(bl, grow(box_nodalize(get_box(msrc%la,i),msrc%nodal),lngsrc))
-                    //end do
-                    //call build(batmp, bl, sort = .false.)
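// [Sketch, not Perilla source] The Fortran-era pseudocode in this comment
// block grows each source box by ngSrc and builds a zero-ghost temporary
// MultiFab on the same distribution to stand in for mfSrc. Assuming plain
// cell-centered boxes, a C++ AMReX equivalent would be roughly:
//
//     amrex::BoxArray batmp = mfSrc.boxArray();
//     batmp.grow(ngSrc);
//     amrex::MultiFab msrctmp(batmp, mfSrc.DistributionMap(), nc, 0);
//
// The path is currently unreachable (note the assert(false) below).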
- //call destroy(bl) - //call build(lasrctmp, batmp, boxarray_bbox(batmp), explicit_mapping = get_proc(msrc%la)) - //call destroy(batmp) - //call build(msrctmp, lasrctmp, nc = lnc, ng = 0) - //pmfsrc => msrctmp - assert(false); - } - if(np > 1) - { - if(gSrc->sCopyMapHead == 0) - gSrc->sCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gSrc->sCopyMapHead; - gSrc->sCopyMapHead = tmpCopyMap; - } - if(gDst->rCopyMapHead == 0) - gDst->rCopyMapHead = new CopyMap(); - else - { - CopyMap *tmpCopyMap = new CopyMap(); - tmpCopyMap->next = gDst->rCopyMapHead; - gDst->rCopyMapHead = tmpCopyMap; - } - //gSrc->sCopyMapHead->map.reserve(mfSrc.size()); - //gDst->rCopyMapHead->map.reserve(mfDst.size()); - gSrc->sCopyMapHead->alloc_CopyMap(mfSrc); - gDst->rCopyMapHead->alloc_CopyMap(mfDst); - } - - //if(gSrc->numTasks != mfSrc.IndexArray().size()) - // std::cout<< "before " <numTasks << " now " <graphID << std::endl; - - gSrc->numFabs = mfSrc.size(); - gDst->numFabs = mfDst.size(); - gSrc->numTasks = mfSrc.IndexArray().size(); - gDst->numTasks = mfDst.IndexArray().size(); - } -#ifdef USE_PERILLA_PTHREADS - // perilla::syncAllThreads(); -#endif - const FabArrayBase::CPC *TheCPC= &mfDst.getCPC(ng, mfSrc, ngSrc, period);; - - int nfabsSrc = mfSrc.IndexArray().size(); - int nfabsDst = mfDst.IndexArray().size(); - - const int nloc_cpAsc = TheCPC->m_LocTags->size(); - const int nsnds_cpAsc = TheCPC->m_SndTags->size(); - const int nrcvs_cpAsc = TheCPC->m_RcvTags->size(); -#ifdef USE_PERILLA_PTHREADS - // perilla::syncAllThreads(); -#endif - - if(np > 1){ -#ifdef USE_PERILLA_PTHREADS - // if(perilla::isMasterThread()) -#endif - { - send_cctc.reserve(nsnds_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC->m_SndTags->begin(), - m_End = TheCPC->m_SndTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // Not destined to me. 
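// [Sketch, not Perilla source] This send-side loop, and the matching
// receive-side loop below, reduce to collecting the non-local partner
// ranks and pointers to their CopyComTag containers out of the CPC's
// per-rank maps:
//
//     template <typename Container>
//     void collectPartners(const std::map<int, Container>& tagsByRank,
//                          int myRank, std::vector<int>& ranks,
//                          std::vector<const Container*>& containers)
//     {
//         for (const auto& kv : tagsByRank)
//             if (kv.first != myRank) {        // skip self
//                 ranks.push_back(kv.first);
//                 containers.push_back(&kv.second);
//             }
//     }
//
// send_pr/send_cctc and recv_pr/recv_cctc are exactly these two vectors.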
- { - send_pr.push_back(m_it->first); - send_cctc.push_back(&(m_it->second)); - } - } - - recv_cctc.reserve(nrcvs_cpAsc); - - for (FabArrayBase::MapOfCopyComTagContainers::const_iterator m_it = TheCPC->m_RcvTags->begin(), - m_End = TheCPC->m_RcvTags->end(); - m_it != m_End; - ++m_it) - { - if(m_it->first != myProc) // I am not the source for this receipt - { - recv_pr.push_back(m_it->first); - recv_cctc.push_back(&(m_it->second)); - } - } - } - } -#ifdef USE_PERILLA_PTHREADS - // perilla::syncAllThreads(); -#endif - - //#ifndef USE_PERILLA_PTHREADS -#pragma omp parallel shared(gSrc, gDst, mfSrc, mfDst, nfabsSrc, nfabsDst) - //#endif - { - int tid = omp_get_thread_num();//perilla::tid();//omp_get_thread_num(); - int tg = tid/perilla::NUM_THREADS_PER_TEAM;//perilla::wid();//WorkerThread::perilla_wid(); - int nt= tid%perilla::NUM_THREADS_PER_TEAM; - int fg; - //std::cout<<"thread "<< tid<<"group "<graphID << " numTask " << gDst->numTasks << " numFabs " << gDst->numFabs <graphID > 25) - //std::cout<< "Inside parallel Generating Send at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead == 0) - { - gSrc->task[f]->cpAsc_srcHead = new FabCopyAssoc(); - cpSrc = gSrc->task[f]->cpAsc_srcHead; - } - else - { - cpSrc = new FabCopyAssoc(); - cpSrc->next = gSrc->task[f]->cpAsc_srcHead; - gSrc->task[f]->cpAsc_srcHead = cpSrc; - } - - cpSrc->graphPartner = gDst; - cpSrc->l_con.nscpy = 0; - for(int i=0; im_LocTags)[i]; - //if(f == tag.srcIndex) - if(mfSrc.IndexArray()[f] == tag.srcIndex) - cpSrc->l_con.nscpy++; - } - cpSrc->l_con.scpy = new LocalCopyDescriptor[cpSrc->l_con.nscpy]; - - //if(gDst->graphID == 4 && tag.dstIndex == 60 ) - //std::cout<< "Inside parallel Generating Local Copy send at tid " << tid << " f " << f << " gID " << gDst->graphID <<" num local connections"<< nloc_cpAsc << std::endl; - - for(int i=0; im_LocTags)[i]; - //if(f == tag.srcIndex) - if(mfSrc.IndexArray()[f] == tag->srcIndex) - { - cpSrc->l_con.scpy[scnt].ns = mfSrc.localindex(tag->srcIndex); - cpSrc->l_con.scpy[scnt].nd = mfDst.localindex(tag->dstIndex); - cpSrc->l_con.scpy[scnt].sbx = tag->sbox; - cpSrc->l_con.scpy[scnt].dbx = tag->dbox; - int psize = tag->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - //std::cout<< " gSrc ID "<< gSrc->graphID << " f "<databuf[j] = 0; - cpSrc->l_con.scpy[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.scpy[scnt].recycleQueue.enqueue(cpSrc->l_con.scpy[scnt].pQueue.dequeue()); - scnt++; - } - } - - if(np > 1) - { - cpSrc->r_con.nsnd = 0; - cpSrc->r_con.remotePushReady = false; - cpSrc->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - cpSrc->r_con.nsnd++; - } - } // for(ir_con.snd = new RemoteCommDescriptor[cpSrc->r_con.nsnd]; - scnt = 0; - for(int i=0; isrcIndex) - { - cpSrc->r_con.snd[scnt].ns = it->srcIndex; - cpSrc->r_con.snd[scnt].nd = it->dstIndex; - cpSrc->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - cpSrc->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - cpSrc->r_con.snd[scnt].sbx = it->sbox; - cpSrc->r_con.snd[scnt].dbx = it->dbox; - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? 
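// ----------------------------------------------------------------------
// [Sketch, not Perilla source] Message sizing and buffer pre-generation
// as used by the descriptor setup above: each payload is
// psize = box.numPts() * nComp doubles, and a fixed number of zero-filled
// Packages is minted up front and parked on the recycle (free) queue.
// Pool size and names below are illustrative; cleanup is elided.
#include <vector>

struct PoolPkg {
    std::vector<double> databuf;
    explicit PoolPkg(long n) : databuf(static_cast<size_t>(n), 0.0) {}
};

std::vector<PoolPkg*> pregeneratePackages(long boxNumPts, int nComp, int poolSize)
{
    const long psize = boxNumPts * nComp;        // doubles per message
    std::vector<PoolPkg*> freeList;
    freeList.reserve(poolSize);
    for (int p = 0; p < poolSize; ++p)
        freeList.push_back(new PoolPkg(psize));  // buffer arrives zeroed
    return freeList;                             // becomes the recycleQueue
}
// ----------------------------------------------------------------------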
- - for(int p=0; pdatabuf[j] = 0; - cpSrc->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.snd[scnt].recycleQueue.enqueue(cpSrc->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // if(fg==tg) - - //perilla::syncAllThreads(); -#pragma omp barrier - if(np > 1) - { - //if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - if(tid==0) - { - - // std::cout<< "Inside parallel Generating Remote Send tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <sCopyMapHead->map[f]->r_con.nsnd = 0; - gSrc->sCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; isrcIndex) - gSrc->sCopyMapHead->map[f]->r_con.nsnd++; - } - } // for(isCopyMapHead->map[f]->r_con.snd = new RemoteCommDescriptor[gSrc->sCopyMapHead->map[f]->r_con.nsnd]; - int scnt = 0; - for(int i=0; isrcIndex) - { - - //if(gDst->graphID == 31 && (it->dstIndex == 519)) - //std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " i "<< scnt << " tg " <sCopyMapHead->map[f]->r_con.snd[scnt].ns = it->srcIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].nd = it->dstIndex; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_gid = gDst->graphID-1; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lns = mfSrc.localindex(it->srcIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].lnd = mfDst.localindex(it->dstIndex); - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sbx = it->sbox; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].dbx = it->dbox; - - int psize = it->sbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].sz = psize; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pr = send_pr[i]; - - for(int p=0; pdatabuf[j] = 0; - gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; psCopyMapHead->map[f]->r_con.snd[scnt].recycleQueue.enqueue(gSrc->sCopyMapHead->map[f]->r_con.snd[scnt].pQueue.dequeue()); - scnt++; - } - } - } // for(i 1) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead == 0) - { - gDst->task[f]->cpAsc_dstHead = new FabCopyAssoc(); - cpDst = gDst->task[f]->cpAsc_dstHead; - } - else - { - cpDst = new FabCopyAssoc(); - cpDst->next = gDst->task[f]->cpAsc_dstHead; - gDst->task[f]->cpAsc_dstHead = cpDst; - } - cpDst->graphPartner = gSrc; - cpDst->l_con.ndcpy = 0; - cpDst->l_con.firingRuleCnt = 0; - cpDst->l_con.dcpyCnt = 0; - for(int i=0; im_LocTags)[i]; - //if(f == tag.dstIndex) - if(mfDst.IndexArray()[f] == tag->dstIndex) - cpDst->l_con.ndcpy++; - } - cpDst->l_con.dcpy = new LocalCopyDescriptor[cpDst->l_con.ndcpy]; - int dcnt = 0; - - //if(gDst->graphID > 25) - //std::cout<< "Inside parallel Generating Local copy recive at tid " << tid << " f " << f << " gID " << gDst->graphID <graphID ==27 && f == 633) - //std::cout<< "tid " << tid << " f " << f << " gID " << gDst->graphID << " numReciv " << nloc_cpAsc << " ndcpy " << cpDst->l_con.ndcpy <m_LocTags)[i]; - //if(f == tag->dstIndex) - if(mfDst.IndexArray()[f] == tag->dstIndex) - { - - //if(gDst->graphID == 4 && (tag->dstIndex == 60 || tag->dstIndex == 59)) - //std::cout<< "dcpy tid " << tid << " f " << f << " i " << i << " dcnt " << dcnt << " ns "<srcIndex << " nd "<dstIndex << " lo " << tag->dbox.smallEnd() << " hi " << tag->dbox.bigEnd() <l_con.dcpy[dcnt].ns = 
mfSrc.localindex(tag->srcIndex); - cpDst->l_con.dcpy[dcnt].nd = mfDst.localindex(tag->dstIndex); - cpDst->l_con.dcpy[dcnt].sbx = tag->sbox; - cpDst->l_con.dcpy[dcnt].dbx = tag->dbox; - - // if(gDst->graphID > 25 && f == 633) - //std::cout<< " Generating Package tid " << tid << " i " << i <dbox.numPts() * mfSrc.nComp(); //---------------------------------------------------------------???????????????? - cpDst->l_con.dcpy[dcnt].sz = psize; - - if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf[j] = 0; - cpDst->l_con.dcpy[dcnt].pQueue.enqueue(tmpPkg); - } - - // if(gDst->graphID > 25 && f == 633) - //std::cout<< " Generating now in reQ Package tid " << tid << " i " << i <l_con.dcpy[dcnt].recycleQueue.enqueue(cpDst->l_con.dcpy[dcnt].pQueue.dequeue()); - - //if(gDst->graphID > 25 && f == 633) - // std::cout<< " Generated Package tid " << tid << " i " << i <graphID > 25 && f > 630) - //std::cout<< "Safe now tid " << tid << " f " << f << " gID " << gDst->graphID << " numReciv " << nloc_cpAsc <srcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; il_con.ndcpy ; i++) - { - for(int p=0; pl_con.dcpy[i].sz; - Package *tmpPkg = new Package(psize); - for(int j=0; jdatabuf[j] = 0; - cpdDst->l_con.dcpy[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pl_con.dcpy[i].recycleQueue.enqueue(cpdDst->l_con.dcpy[i].pQueue.dequeue()); - } - } - - if(np > 1) - { - cpDst->r_con.nrcv = 0; - cpDst->r_con.remotePullDone = false; - cpDst->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - cpDst->r_con.nrcv++; - } - } // for(ir_con.rcv = new RemoteCommDescriptor[cpDst->r_con.nrcv]; - dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - cpDst->r_con.rcv[dcnt].nd = it->dstIndex; - cpDst->r_con.rcv[dcnt].ns = it->srcIndex; - cpDst->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - cpDst->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - cpDst->r_con.rcv[dcnt].sbx = it->sbox; - cpDst->r_con.rcv[dcnt].dbx = it->dbox; - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? 
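// ----------------------------------------------------------------------
// [Sketch, not Perilla source] The firingRuleCnt bookkeeping that recurs
// above: a region-graph task becomes runnable ("fires") only when every
// one of its connections has a pending package, and remote receives also
// require the transfer request to have completed. One way to read it:
#include <vector>

struct ConnState {
    int  queued      = 0;     // pQueue.queueSize()
    bool remote      = false;
    bool requestDone = false; // pQueue.getFront()->checkRequest()
};

bool readyToFire(const std::vector<ConnState>& conns)
{
    int firingRuleCnt = 0;
    for (const ConnState& c : conns)
        if (c.queued >= 1 && (!c.remote || c.requestDone))
            ++firingRuleCnt;
    return firingRuleCnt == static_cast<int>(conns.size());
}
// The real code maintains the counter incrementally under the connection
// locks rather than recomputing it, but this is the condition it tracks.
// ----------------------------------------------------------------------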
- cpDst->r_con.rcv[dcnt].sz = psize; - - if(!gDst->isDepGraph) - { - for(int p=0; pdatabuf[j] = 0; - cpDst->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[dcnt].recycleQueue.enqueue(cpDst->r_con.rcv[dcnt].pQueue.dequeue()); - } - - dcnt++; - } - } - }// for(isrcLinkGraph; - for(int df=0; df < gDst->task[f]->depTaskIDs.size(); df++) - { - int dfi = gDst->task[f]->depTaskIDs[df]; - FabCopyAssoc *cpdDst = depGraph->task[dfi]->cpAsc_dstHead; - for(int i=0; ir_con.nrcv ; i++) - { - for(int p=0; pr_con.rcv[i].sz; - Package *tmpPkg = new Package(psize); - for(int j=0; jdatabuf[j] = 0; - cpdDst->r_con.rcv[i].pQueue.enqueue(tmpPkg); - } - for(int p=0; pr_con.rcv[i].recycleQueue.enqueue(cpdDst->r_con.rcv[i].pQueue.dequeue()); - } - } - - - } // if(np > 1) - }// if(fg==tg) - - //perilla::syncAllThreads(); -#pragma omp barrier - - if(np > 1) - { - //if(WorkerThread::perilla_isMasterWorkerThread() && tg==0) - if(tid==0) - { - // std::cout<< "Inside parallel Generating Remote Recive tg 0 at tid " << tid << " f " << f << " gID " << gDst->graphID <rCopyMapHead->map[f]->r_con.nrcv = 0; - gDst->rCopyMapHead->map[f]->r_con.firingRuleCnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - gDst->rCopyMapHead->map[f]->r_con.nrcv++; - } - } - gDst->rCopyMapHead->map[f]->r_con.rcv = new RemoteCommDescriptor[gDst->rCopyMapHead->map[f]->r_con.nrcv]; - int dcnt = 0; - for(int i=0; idstIndex) - if(mfDst.IndexArray()[f] == it->dstIndex) - { - - // if(myProc==54 && gDst->graphID == 25 && f == 10) - // std::cout <<"myP " <dstIndex << " ns "<< it->srcIndex << " f " << f << " sgID "<< gSrc->graphID <<" tg "<rCopyMapHead->map[f]->r_con.rcv[dcnt].nd = it->dstIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].ns = it->srcIndex; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = mfDst.localindex(it->dstIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lns = mfSrc.localindex(it->srcIndex); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].r_grids = (gDst->numFabs > gSrc->numFabs ? gDst->numFabs : gSrc->numFabs); - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sbx = it->sbox; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].dbx = it->dbox; - - int psize = it->dbox.numPts() * mfDst.nComp(); //---------------------------------------------------------------???????????????? 
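// ----------------------------------------------------------------------
// [Sketch, not Perilla source] Each remote transfer is mirrored twice:
// the FabCopyAssoc queues (cpSrc/cpDst) drained by worker threads carry
// the payload, while the graph's sCopyMapHead/rCopyMapHead CopyMap queues
// carry matching metadata (served/completed/request) for the service
// thread. When a message is consumed, both mirrors recycle in lockstep:
#include <queue>

struct MetaPkg { bool served = false, completed = false; void* request = nullptr; };
struct DataPkg { /* payload buffer elided */ };

void recycleReceivedPair(std::queue<MetaPkg*>& metaInFlight, std::queue<MetaPkg*>& metaFree,
                         std::queue<DataPkg*>& dataInFlight, std::queue<DataPkg*>& dataFree)
{
    MetaPkg* m = metaInFlight.front(); metaInFlight.pop();
    m->served = false; m->completed = false; m->request = nullptr;
    metaFree.push(m);                       // meta record reset for reuse

    DataPkg* d = dataInFlight.front(); dataInFlight.pop();
    dataFree.push(d);                       // payload buffer back to pool
}
// Compare the rcvMetaPackage / rcvPackage pairs in fillBoundaryPull above.
// ----------------------------------------------------------------------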
- - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].sz = psize; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pr = recv_pr[i]; - - BL_ASSERT(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd == f); - - if(Perilla::genTags) - { - try{ - std::map::iterator itr = tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].find(psize); - if( itr != tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex].end()) - { - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = itr->second; - } - else - { - tagMap[recv_pr[i]][gDst->graphID-1][it->dstIndex][it->srcIndex][psize] = Perilla::uTags++; - //gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].lnd = Perilla::uTags++; - std::map::iterator itr2 = pTagCnt[recv_pr[i]].find(gDst->graphID-1); - if(itr2 != pTagCnt[recv_pr[i]].end()) - pTagCnt[recv_pr[i]][gDst->graphID-1] = pTagCnt[recv_pr[i]][gDst->graphID-1] + 1; - else - pTagCnt[recv_pr[i]][gDst->graphID-1] = 1; - } - } - catch(std::exception& e) - { - std::cout <<"Inside tagGeneration gID "<< gDst->graphID <<" "<< e.what() << '\n'; - } - } - //tagMap[recv_pr[i]][gDst->graphID][it->dstIndex][it->srcIndex] = pTagCnt[recv_pr[i]]; - - - for(int p=0; pdatabuf[j] = 0; - gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.enqueue(tmpPkg); - } - for(int p=0; prCopyMapHead->map[f]->r_con.rcv[dcnt].recycleQueue.enqueue(gDst->rCopyMapHead->map[f]->r_con.rcv[dcnt].pQueue.dequeue()); - dcnt++; - } - } - } // for(i 1) - //} //if(fg==tg) - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Send partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_srcHead->l_con.nscpy; i++) - { - int lnd = gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].nd; - for(int j=0; jtask[ lnd ]->cpAsc_dstHead->l_con.ndcpy; j++) - if(gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dbx == gDst->task[ lnd ]->cpAsc_dstHead->l_con.dcpy[j].dbx) - gSrc->task[f]->cpAsc_srcHead->l_con.scpy[i].dPartner = j; - } - } - } // for(fgraphID > 25) - //std::cout<< "Inside parallel Generating Recive partners at tid " << tid << " f " << f << " gID " << gDst->graphID <task[f]->cpAsc_dstHead->l_con.ndcpy; i++) - { - int lns = gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].ns; - for(int j=0; jtask[ lns ]->cpAsc_srcHead->l_con.nscpy; j++) - if(gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].dbx == gSrc->task[ lns ]->cpAsc_srcHead->l_con.scpy[j].dbx) - gDst->task[f]->cpAsc_dstHead->l_con.dcpy[i].sPartner = j; - } - } - } // for(fgraphID <<" "<< e.what() << '\n'; -} - - -//std::cout<< "All done safely at gID " << gDst->graphID <assocMF; - // MultiFab* mfSrc = srcGraph->assocMF; - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpSrc = srcGraph->task[f]->cpAsc_srcHead; - - //if(srcGraph->graphID==18 && f ==316 && ntid == 0) - //std::cout << "srgG chk see " << srcGraph << " " <graphPartner == destGraph) - break; - cpSrc = cpSrc->next; - } - if(cpSrc == 0) cout <<"Metadata for across grid copy not found"<< endl; - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - 
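// ----------------------------------------------------------------------
// [Sketch, not Perilla source] The genTags block above mints point-to-
// point tags from a nested map keyed by (partner rank, graph id, dest
// fab, source fab, message size) plus a running counter (Perilla::uTags),
// so repeated lookups for the same connection reuse one tag. Simplified:
#include <map>

using TagMap =
    std::map<int, std::map<int, std::map<int, std::map<int, std::map<int,int>>>>>;

int lookupOrMintTag(TagMap& tagMap, int rank, int gid, int dstFab, int srcFab,
                    int psize, int& uTags)
{
    auto& bySize = tagMap[rank][gid][dstFab][srcFab];
    auto it = bySize.find(psize);
    if (it != bySize.end())
        return it->second;        // tag already assigned for this message
    bySize[psize] = uTags;        // first sighting: mint a fresh tag
    return uTags++;
}
// The per-rank pTagCnt bookkeeping above just counts how many tags each
// (rank, graph) pair has consumed.
// ----------------------------------------------------------------------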
mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf.local()); - } - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - else - { - if(ntid == 0) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - //if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf.local()); - } - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - //if(ntid==0) - //{ - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - int np = ParallelDescriptor::NProcs(); - if(np == 1) - return; - - //if(myProc==26 && srcGraph->graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd <graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd <graphID==18 && f ==316) - //BL_ASSERT(cpSrc->r_con.nsnd == 177); - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - { - - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf.local()); - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - } - - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - - cpSrc->r_con.remotePushReady = true; - ///* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - } - else - { - if(ntid == 0) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - //if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - for(int i=0; ir_con.nsnd; i++) - { - - // if(myProc==4 && srcGraph->graphID==2 && (f ==0 || f ==2)) - //std::cout << " Pushing 2 316 164"<r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf.local()); - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - - } - - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - //if(ntid==0) - //{ - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - - cpSrc->r_con.remotePushReady = true; - ///* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - //*/ - } - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } // if(!(*mfDst == *mfSrc)) -} // multifabCopyPushAsync - - -void Perilla::multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, 
bool singleT) -{ - int myProc = ParallelDescriptor::MyProc(); - - int ntid = WorkerThread::perilla_wtid(); - int tg = WorkerThread::perilla_wid(); - //MultiFab* mfDst = destGraph->assocMF; - //MultiFab* mfSrc = srcGraph->assocMF; - if(nc<1) cout <<"MULTIFAB_COPY_C: nc must be >= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - //if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - //if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpDst = destGraph->task[f]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->graphPartner == srcGraph) - break; - cpDst = cpDst->next; - } - if(cpDst == 0) cout <<"Metadata for across grid copy not found"<< endl; - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - if(singleT) - { - pthread_mutex_lock(&(cpDst->l_con.dLock)); - for(int i=0; il_con.ndcpy; i++) - { - Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf.local()); - } - for(int i=0; il_con.ndcpy; i++) - cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue(true),true); // corrected from pQ to recycleQ and from recycleQ to pQ - cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - } - else - { - if(ntid==0) - { - pthread_mutex_lock(&(cpDst->l_con.dLock)); - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - //if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - for(int i=0; il_con.ndcpy; i++) - { - Package* rcvPackage = cpDst->l_con.dcpy[i].pQueue.getFront(true); // corrected from recycleQ to pQ - mfDst->m_fabs_v[f]->copyFromMem(cpDst->l_con.dcpy[i].dbx,dstcomp,nc,rcvPackage->databuf.local()); - } - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - //if(ntid == 0) - //{ - for(int i=0; il_con.ndcpy; i++) - cpDst->l_con.dcpy[i].recycleQueue.enqueue(cpDst->l_con.dcpy[i].pQueue.dequeue(true),true); // corrected from pQ to recycleQ and from recycleQ to pQ - cpDst->l_con.firingRuleCnt = cpDst->l_con.firingRuleCnt - cpDst->l_con.ndcpy; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - } - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - - int np = ParallelDescriptor::NProcs(); - if(np == 1) - return; - - if(singleT) - { - //pthread_mutex_lock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - for(int i=0; ir_con.nrcv; i++) - { - ///* - //Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); - mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf.local()); - rcvPackage->completed = false; - rcvPackage->served = false; - rcvPackage->request = 0; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage, true); - //destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - //rcvPackage->completed = false; - //cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ 
to recycleQ - //*/ - - //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ - //mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - } - cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv; - - cpDst->r_con.remotePullDone = true; - ///* - for(int i=0; ir_con.nrcv; i++) - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - //*/ - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - //pthread_mutex_unlock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - - } - else - { - if(ntid==0) - { - //pthread_mutex_lock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - //} - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - //if((i%(perilla::NUM_THREADS_PER_TEAM-1)) == ntid) - for(int i=0; ir_con.nrcv; i++) - { - ///* - //Package *rcvMetaPackage = destGraph->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); - mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf.local()); - rcvPackage->completed = false; - rcvPackage->served = false; - rcvPackage->request = 0; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage, true); - //destGraph->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - //rcvPackage->completed = false; - //cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - //*/ - - //Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.getFront(true); // corrected from recycleQ to pQ - //mfDst->m_fabs_v[f]->copyFromMem(cpDst->r_con.rcv[i].dbx,dstcomp,nc,rcvPackage->databuf); - - } - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - - //if(ntid==0) - //{ - cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - cpDst->r_con.nrcv; - - cpDst->r_con.remotePullDone = true; - ///* - for(int i=0; ir_con.nrcv; i++) - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - //*/ - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - //pthread_mutex_unlock(&(destGraph->rCopyMapHead->map[f]->r_con.rcvLock)); - } - //destGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-1); - } - } // if(!(*mfDst == *mfSrc)) - -} // multifabCopyPull - -void Perilla::multifabCopyPull(RegionGraph* destGraph, RegionGraph* srcGraph, MultiFab* mfDst, MultiFab* mfSrc, int f, bool singleT) -{ - multifabCopyPull(destGraph, srcGraph, mfDst, mfSrc, f, 1, 1, 1, 0, 0,singleT); -} - - -void Perilla::serviceLocalGridCopyRequests(std::vector graphArray, int g, int tg) -{ - int nfabs = graphArray[g]->numTasks; - for(int f=0; ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - bool anyReq=false; - for(int i=0; il_con.nscpy; i++) - if(cpSrc->l_con.scpy[i].pQueue.queueSize(true)>0){ - anyReq=true; - break; - } - if(anyReq) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - if(cpSrc->l_con.scpy[i].pQueue.queueSize(true)>0) - { - FabCopyAssoc* cpDst = cpSrc->graphPartner->task[cpSrc->l_con.scpy[i].nd]->cpAsc_dstHead; - while(cpDst != 0) - { - if(cpDst->graphPartner == graphArray[g]) - break; - cpDst = cpDst->next; - } - Package* sPackage = 
cpSrc->l_con.scpy[i].pQueue.dequeue(true); - pthread_mutex_lock(&(cpDst->l_con.dLock)); - int dPartner = cpSrc->l_con.scpy[i].dPartner; - Package* dPackage = cpDst->l_con.dcpy[dPartner].recycleQueue.dequeue(true); - std::memcpy(dPackage->databuf.local(), sPackage->databuf.local(), dPackage->bufSize * sizeof(double)); - cpDst->l_con.dcpy[dPartner].pQueue.enqueue(dPackage,true); - if(cpDst->l_con.dcpy[dPartner].pQueue.queueSize(true) == 1) - cpDst->l_con.firingRuleCnt++; - pthread_mutex_unlock(&(cpDst->l_con.dLock)); - cpSrc->l_con.scpy[i].recycleQueue.enqueue(sPackage,true); - } - } // for - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - }//anyReq - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // if(tg==fg) - } // for(f graphArray, int g, int nGraphs, int tg) -{ - bool nextsReq, nextrReq; - int np = ParallelDescriptor::NProcs(); - int myProc = ParallelDescriptor::MyProc(); - int numfabs = graphArray[g]->numTasks; - int graphID = graphArray[g]->graphID; - - for(int f=0; ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - //if(pthread_mutex_trylock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)) != 0) - { - //if(pthread_mutex_trylock(&(cpDst->r_con.rcvLock)) != 0) - { - for(int i=0; ir_con.nrcv; i++) - { - if(cpDst->r_con.rcv[i].pQueue.queueSize(true)==0) - { - nextrReq = true; - } - else - { - Package *rearPackage = cpDst->r_con.rcv[i].pQueue.getRear(true); - // Also check the recycle queue because when rear is completed it may cause unlimited recv posts - if(rearPackage->completed && cpDst->r_con.rcv[i].pQueue.queueSize(true) == 1) //!latest receive request has been completed - { - nextrReq = true; - } - else //!expected message is still on the way - nextrReq = false; - } - if(nextrReq) //!take a message from recycle pool and post a receive - { - //!create a package to keep track of receive requests - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - //Package *rMetaPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.dequeue(true); - //!extract a package from the recycle pool at the destination NUMA node to buffer incoming data - int ns = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].ns; - int nd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].nd; - int lnd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].lnd; - int r_grids = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].r_grids; - Package *rPackage = cpDst->r_con.rcv[i].recycleQueue.dequeue(true); - int tag = tagMap[graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr][g][nd][ns][graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz]; - - rPackage->request = new future<>; - rPackage->completed=false; - rPackage->tag = tag; - cpDst->r_con.rcv[i].pQueue.enqueue(rPackage,true); //!this is not done yet - //graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.enqueue(rMetaPackage,true); //!this is not done yet - - pthread_mutex_lock(&(rMsgMap.lock)); - rMsgMap.map[graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr][tag].push_back(rPackage); - rMsgMap.size++; - pthread_mutex_unlock(&(rMsgMap.lock)); - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } - } // for (ir_con.nrcv) - //pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } // if(ga locked) - //pthread_mutex_unlock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)); - } // if(mf locked) - cpDst = cpDst->next; - } // while(cpDst != 0) - } // for(ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - for(int i=0; ir_con.nsnd; i++) - { - //if(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.queueSize(true) == 0) //!no message has been 
received or all received messages have been claimed - if(cpSrc->r_con.snd[i].pQueue.queueSize(true) == 0) - nextrReq = false; - else - nextrReq = true; - - if(nextrReq) //!take a message from recycle pool and post a receive - { - //Package *sMetaPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.getFront(true); - Package *sPackage = cpSrc->r_con.snd[i].pQueue.getFront(true); - if(!sPackage->served) - { - sPackage->completed = false; - sPackage->served = true; - //sMetaPackage->request = new future<>; - int ns = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].ns; - int nd = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].nd; - int r_gid = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_gid; - int r_grids = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].r_grids; - //int tag = tagGen(ns, nd, r_gid-1, np*r_grids, nGraphs); - int tag = Perilla::myTagMap[r_gid][nd][ns][graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz]; - int src= upcxx::rank_me(); - int dst= graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pr; - int size= graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].sz; - - //sPackage->request = new future<>; - pthread_mutex_lock(&(sMsgMap.lock)); - sMsgMap.map[dst][tag].push_back(sPackage); - sMsgMap.size++; - pthread_mutex_unlock(&(sMsgMap.lock)); - upcxx::global_ptr sbuf= sPackage->databuf; //static_cast >((double*)sPackage->databuf); - - upcxx::rpc(dst, - [=](){ - //at destination rank, look up recv buffer and pull remote data and store data in the buffer - bool posted_recv=false; - double* localbuf= NULL; - pthread_mutex_lock(&(rMsgMap.lock)); - if(rMsgMap.map.find(src) != rMsgMap.map.end()){ - if(rMsgMap.map[src].find(tag) != rMsgMap.map[src].end()) - if(rMsgMap.map[src][tag].size() >0){ - posted_recv=true; - localbuf= (rMsgMap.map[src][tag].front()->databuf).local();//(double*) (static_cast > (rMsgMap.map[src][tag].front()->databuf).local()); - rMsgMap.map[src][tag].front()->tag= tag; - if(localbuf){ - *(rMsgMap.map[src][tag].front()->request)= upcxx::rget(sbuf, localbuf, size); - rMsgMap.map[src][tag].pop_front(); - rMsgMap.size--; - } - } - } - pthread_mutex_unlock(&(rMsgMap.lock)); - //save pull request for later when recv buffer is posted - if(posted_recv==false){ - getReq_t *req= new getReq_t(src, tag, sbuf, size); - pendingGetList.add(req); - } - //store send request to notify sender later upon completion - //sFutureMap[fu]= sMetaPackage->request; - } - ); - } //served - }//nextReq - } // for (ir_con.nsnd) - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // for(ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - for(int i=0; ir_con.nrcv; i++) - { - //if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) > 0) //!all messages before rear have completed - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) > 0) - { - //if(pthread_mutex_trylock(&(cpDst->r_con.rcvLock)) != 0) - { - Package *rearPackage = cpDst->r_con.rcv[i].pQueue.getRear(true); //graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.getRear(true); - if(!rearPackage->completed) - { - if(rearPackage->request->ready()) - { - pthread_mutex_lock(&(cpDst->r_con.rcvLock)); - int ns = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].ns; - int nd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].nd; - int lnd = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].lnd; - int r_grids = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].r_grids; - int tag = rearPackage->tag; - //int tag = 
tagMap[graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr][g][nd][ns][graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].sz]; - int dst = upcxx::rank_me(); - int src= graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pr; - upcxx::rpc(src, - [=](){ - pthread_mutex_lock(&(sMsgMap.lock)); - //upcxx::future<> *ft= sMsgMap.map[dst][tag].front()->request; - //delete ft;//so that sender know - sMsgMap.map[dst][tag].front()->completed = true; - sMsgMap.map[dst][tag].pop_front(); - sMsgMap.size--; - pthread_mutex_unlock(&(sMsgMap.lock)); - } - ); - - delete rearPackage->request; - rearPackage->completed=true; - //cpDst->r_con.rcv[i].pQueue.getRear()->completeRequest(); - //graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.getRear()->completeRequest(); - - //if(graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.queueSize(true) == 1) - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) == 1) - { - cpDst->r_con.firingRuleCnt++; - } - pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } - } - //pthread_mutex_unlock(&(cpDst->r_con.rcvLock)); - } // if(ga locked) - } // if(pQueue.queueSize(true) > 0) - } // for (ir_con.nrcv) - cpDst = cpDst->next; - } // while(cpDst != 0) - } // for(ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - for(int i=0; ir_con.nsnd; i++) - { - //if(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.queueSize(true) > 0) - if(cpSrc->r_con.snd[i].pQueue.queueSize(true) >0) - { - //Package *frontPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.getFront(true); - Package *frontPackage = cpSrc->r_con.snd[i].pQueue.getFront(true); - if(frontPackage->served /*&& !frontPackage->completed*/) //!latest receive request has NOT been completed - { - bool flag = false; - int ret_flag; - //if(frontPackage->request==NULL)//data have been received by receiver - if(frontPackage->completed)//data have been received by receiver - { -/* - pthread_mutex_lock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); - frontPackage = graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = 0; - frontPackage->tag = 0; - graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); -*/ - - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - frontPackage = cpSrc->r_con.snd[i].pQueue.dequeue(true); - frontPackage->completed = false; - frontPackage->served = false; - frontPackage->request = 0; - frontPackage->tag = 0; - cpSrc->r_con.snd[i].recycleQueue.enqueue(frontPackage,true); - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - } - } - } // if(queueSize > 0) - } // for (ir_con.nsnd) - cpSrc = cpSrc->next; - } // while(cpSrc != 0) - } // for(f graphArray, int g, int nGraphs, int tg) -{ - int np = ParallelDescriptor::NProcs(); - int myProc = ParallelDescriptor::MyProc(); - int numfabs = graphArray[g]->numTasks; - //MultiFab* mf = graphArray[g]->assocMF; - int graphID = graphArray[g]->graphID; - - for(int f=0; ftask[f]->cpAsc_srcHead; - while(cpSrc != 0) - { - if(cpSrc->r_con.remotePushReady) - { - pthread_mutex_lock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; ir_con.nsnd; i++) - { - graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(graphArray[g]->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - } - pthread_mutex_unlock(&(graphArray[g]->sCopyMapHead->map[f]->r_con.sndLock)); - cpSrc->r_con.remotePushReady = 
false; - }// if remotepushready - cpSrc = cpSrc->next; - } - }// ismyRegion - }//for ftask[f]->cpAsc_dstHead; - while(cpDst != 0) - { - if(pthread_mutex_trylock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)) != 0) - { - if(pthread_mutex_trylock(&(cpDst->r_con.rcvLock)) != 0) - { - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpDst->r_con.remotePullDone <r_con.remotePullDone) - { - for(int i=0; ir_con.nrcv; i++) - { - - Package *rcvMetaPackage = graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = 0; - graphArray[g]->rCopyMapHead->map[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - rcvPackage->completed = false; - cpDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - - //cpDst->r_con.firingRuleCnt = cpDst->r_con.firingRuleCnt - 1; - - if(cpDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpDst->r_con.firingRuleCnt++; - - - } // for (ir_con.nrcv) - - cpDst->r_con.remotePullDone = false; - - //if(f==1 && g==26 && myProc == 54) - // std::cout<<"Completed Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpDst->r_con.remotePullDone <r_con.rcvLock)); - } // if(ga locked) - pthread_mutex_unlock(&(graphArray[g]->rCopyMapHead->map[f]->r_con.rcvLock)); - } // if(mf locked) - cpDst = cpDst->next; - } // while(cpDst != 0) - /* - if(false) - for(int id=0; idtask[f]->depTaskIDs.size(); id++) - { - int df = graphArray[g]->task[f]->depTaskIDs[id]; - if(WorkerThread::isMyRegion(0,df)) - { - int lgID = graphArray[g]->srcLinkGraph->graphID-1; - - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Dep Push f " << df << " gID " << lgID+1 << " myP " << myProc <task[df]->cpAsc_dstHead; - while(cpdDst != 0) - { - if(omp_test_lock(graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcvLock) != 0) - { - if(omp_test_lock(cpdDst->r_con.rcvLock) != 0) - { - //if(f==1 && g==26 && myProc == 54) - //std::cout<<"Completing Push f " << f << " gID " << g+1 << " myP " << myProc << " PDone "<< cpdDst->r_con.remotePullDone <r_con.remotePullDone) - { - for(int i=0; ir_con.nrcv; i++) - { - - Package *rcvMetaPackage = graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = MPI_REQUEST_NULL; - graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - - Package* rcvPackage = cpdDst->r_con.rcv[i].pQueue.dequeue(true); // corrected from recycleQ to pQ - rcvPackage->completed = false; - cpdDst->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); // corrected from pQ to recycleQ - - //cpdDst->r_con.firingRuleCnt = cpdDst->r_con.firingRuleCnt - 1; - - if(cpdDst->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(cpdDst->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - cpdDst->r_con.firingRuleCnt++; - - - } // for (ir_con.nrcv) - - cpdDst->r_con.remotePullDone = false; - - //if(df==10 && lgID==24 && myProc == 54) - // std::cout<<"Completed Push f " << df << " gID " << lgID+1 << " myP " << myProc << " PDone "<< cpdDst->r_con.remotePullDone <r_con.rcvLock); - } // if(ga locked) - 
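// [Editorial note] A semantics hazard in the locking above: the live code
// tests `if(pthread_mutex_trylock(&m) != 0)` before entering the guarded
// section, but pthread_mutex_trylock() returns 0 on SUCCESS and non-zero
// (EBUSY) on failure -- the opposite convention of the omp_test_lock()
// calls still visible in this disabled block, which return non-zero when
// the lock is acquired. As written, the pthread version enters the
// critical section exactly when the lock was NOT obtained and later
// unlocks a mutex it never locked. A faithful port would be:
//
//     if (pthread_mutex_trylock(&m) == 0) {   // 0 == acquired
//         ... guarded work ...
//         pthread_mutex_unlock(&m);
//     }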
omp_unset_lock(graphArray[lgID]->rCopyMapHead->map[df]->r_con.rcvLock); - } // if(mf locked) - cpdDst = cpdDst->next; - } // while(cpdDst != 0) - - - } // if tg==0 region - - - } // for all dependents - */ - - - - } - } // for(f= 1"<< endl; - if(mfDst->nComp() < (dstcomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for dst multifab"<< endl; - if(mfSrc->nComp() < (srccomp-1)) cout <<"MULTIFAB_COPY_C: nc too large for src multifab"<< endl; - - //mTeams = false; - -// if(np==1) - //multifabCopyPush_1Team(destGraph,srcGraph,mfDst,mfSrc,f,dstcomp,srccomp,nc,ng,ngsrc,singleT); -/* else if(mTeams) - { - if(WorkerThread::isLocPPTID(tid)) - multifabCopyLocPush(destGraph,srcGraph,mfDst,mfSrc,f,tid,dstcomp,srccomp,nc,ng,ngsrc); - else - multifabCopyRmtPush(destGraph,srcGraph,mfDst,mfSrc,f,tid,dstcomp,srccomp,nc,ng,ngsrc); - } - else - multifabCopyPush_1Team(destGraph,srcGraph,mfDst,mfSrc,f,tid,dstcomp,srccomp,nc,ng,ngsrc,singleT); -*/ - - multifabCopyPush_1Team(destGraph,srcGraph,mfDst,mfSrc,f,dstcomp,srccomp,nc,ng,ngsrc,singleT); - //if(!singleT) - //srcGraph->worker[perilla::wid()]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - //double end_time_wtime = omp_get_wtime(); - //if(ntid==0) - //Perilla::getPPPTimeSplit[2] += end_time_wtime - start_time_wtime; -} - - void Perilla::multifabCopyPush_1Team(RegionGraph* destGraph, RegionGraph* srcGraph, amrex::MultiFab* mfDst, amrex::MultiFab* mfSrc, int f, int dstcomp, int srccomp, int nc, int ng, int ngsrc, bool singleT) - { - int ntid = perilla::wtid();// - perilla::NUM_COMM_THREADS; - int tg = perilla::wid(); - int myProc = amrex::ParallelDescriptor::MyProc(); - - if(true)//if(!(*mfDst == *mfSrc)) - { - if(ng > mfDst->nGrow()) cout <<"MULTIFAB_COPY_C: ng > 0 not supported in parallel copy"<< endl; - if(ngsrc > mfSrc->nGrow()) cout <<"MULTIFAB_COPY_C: ngsrc > msrc%ng"<< endl; - FabCopyAssoc* cpSrc = srcGraph->task[f]->cpAsc_srcHead; - - while(cpSrc != 0) - { - if(cpSrc->graphPartner == destGraph) - break; - cpSrc = cpSrc->next; - } - if(cpSrc == 0) cout <<"Metadata for across grid copy not found"<< endl; - - if(singleT) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf.local()); - } - for(int i=0;il_con.nscpy; i++) - cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - else - { - if(ntid == 0) - { - pthread_mutex_lock(&(cpSrc->l_con.sLock)); - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - //std::ofstream fout; - //fout.open(std::to_string(myProc)+ "_" + std::to_string(tid) + ".txt", std::fstream::app); - //if((i%(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS)) == ntid) - for(int i=0; il_con.nscpy; i++) - { - Package* sndPackage = cpSrc->l_con.scpy[i].recycleQueue.getFront(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->l_con.scpy[i].sbx,srccomp,nc,sndPackage->databuf.local()); - /* - for(int ii=0; ii < sndPackage->bufSize; ii++) - if(sndPackage->databuf[ii] == 0) - fout << "MFCPush loc zero at " << f << " i " << i << " ii " << ii << " sbx "<< cpSrc->l_con.scpy[i].sbx << std::endl; - */ - } - - //fout.close(); - - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - //if(ntid==0) - //{ - for(int i=0;il_con.nscpy; i++) - 
cpSrc->l_con.scpy[i].pQueue.enqueue(cpSrc->l_con.scpy[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(cpSrc->l_con.sLock)); - } - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - } - - int np = amrex::ParallelDescriptor::NProcs(); - if(np == 1) - return; - - //if(myProc==26 && srcGraph->graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd << std::endl; - //if(myProc==26 && srcGraph->graphID==18 && ntid == 0) - //std::cout << "Notw its sgID 18,"<< f <<" turn lets see " << cpSrc->r_con.nsnd << std::endl; - //if(srcGraph->graphID==18 && f ==316) - //BL_ASSERT(cpSrc->r_con.nsnd == 177); - if(singleT) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - for(int i=0; i<cpSrc->r_con.nsnd; i++) - { - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf.local()); - sndPackage->served = false; - sndPackage->completed = false; - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - } - cpSrc->r_con.remotePushReady = true; - - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - - /* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; i<cpSrc->r_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - */ - } - else - { - if(ntid == 0) - { - pthread_mutex_lock(&(cpSrc->r_con.sndLock)); - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - - //if((i%(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS)) == ntid) - for(int i=0; i<cpSrc->r_con.nsnd; i++) - { - Package* sndPackage = cpSrc->r_con.snd[i].recycleQueue.dequeue(true); - mfSrc->m_fabs_v[f]->copyToMem(cpSrc->r_con.snd[i].sbx,srccomp,nc,sndPackage->databuf.local()); - sndPackage->served = false; - sndPackage->completed = false; - cpSrc->r_con.snd[i].pQueue.enqueue(sndPackage,true); - } - - //fout.close(); - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - //if(ntid==0) - //{ - cpSrc->r_con.remotePushReady = true; - /* - pthread_mutex_lock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - for(int i=0; i<cpSrc->r_con.nsnd; i++) - srcGraph->sCopyMapHead->map[f]->r_con.snd[i].pQueue.enqueue(srcGraph->sCopyMapHead->map[f]->r_con.snd[i].recycleQueue.dequeue(true),true); - pthread_mutex_unlock(&(srcGraph->sCopyMapHead->map[f]->r_con.sndLock)); - */ - pthread_mutex_unlock(&(cpSrc->r_con.sndLock)); - } - //srcGraph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); - } - } // if(!(*mfDst == *mfSrc)) - } // multifabCopyPush - - - - void Perilla::fillBoundaryPull_1Team(RegionGraph* graph, amrex::MultiFab& mf, int f) - { - int myProc = amrex::ParallelDescriptor::MyProc(); - int mfi = mf.IndexArray()[f]; - - int nComp = mf.nComp(); - int tg= perilla::wid(); - int ntid = perilla::wtid();//-perilla::NUM_COMM_THREADS; - - if(ntid==0) - { - pthread_mutex_lock(&(graph->lMap[f]->l_con.dLock)); - //graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - - if(perilla::LAZY_PUSH) - { } - else - { - if(perilla::UNPACKING_FINEGRAIN) - {} - else - { - //if( (i%(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS)) == ntid) - for(int i=0; i<graph->lMap[f]->l_con.ndcpy; i++) - { - Package *dPackage = graph->lMap[f]->l_con.dcpy[i].pQueue.getFront(true); - /* - for(int d=0; d<dPackage->bufSize; d++) - if(dPackage->databuf[d] == 0) - { - //std::cout<< "in fbPull Reciving 0 for f "<< f << std::endl; - BL_ASSERT(dPackage->databuf[d] != 0); - } - */ - /* - if(f==0) - //if(graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd() == graph->lMap[f]->l_con.dcpy[i].dbx.bigEnd()) - //if(graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(0)==-1 && graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(1)==-1 && graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd(2)==4) - std::cout<< "Corner Pull for f "<< f << " data0 " << dPackage->databuf[0] << " size " << dPackage->bufSize <<" se " << graph->lMap[f]->l_con.dcpy[i].dbx.smallEnd() << std::endl; - { - for(int d=0; d<dPackage->bufSize; d++) - std::cout << dPackage->databuf[d] << " "; - std::cout << std::endl; - } - */ - mf.m_fabs_v[f]->copyFromMem(graph->lMap[f]->l_con.dcpy[i].dbx,0,nComp,dPackage->databuf.local()); - } - } // if(UNPACKING_FINEGRAIN) - else - } // if(LAZY_PUSH) - else - - //graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - - //if(ntid==0) - //{ - for(int i=0; i<graph->lMap[f]->l_con.ndcpy; i++) - { - graph->lMap[f]->l_con.dcpy[i].recycleQueue.enqueue( graph->lMap[f]->l_con.dcpy[i].pQueue.dequeue(true),true ); - } - - graph->lMap[f]->l_con.firingRuleCnt = graph->lMap[f]->l_con.firingRuleCnt - graph->lMap[f]->l_con.ndcpy; - - - graph->lMap[f]->l_con.scpyCnt = 0; - for(int i=0; i<graph->lMap[f]->l_con.ndcpy; i++) - if(graph->lMap[f]->l_con.dcpy[i].pQueue.queueSize(true) >= 1) - { - graph->lMap[f]->l_con.firingRuleCnt++; - } - - pthread_mutex_unlock(&(graph->lMap[f]->l_con.dLock)); - } - //graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - - int np = amrex::ParallelDescriptor::NProcs(); - if (np==1) return; - if(ntid==0) - { - pthread_mutex_lock(&(graph->rMap[f]->r_con.rcvLock)); - pthread_mutex_lock(&(graph->lMap[f]->r_con.rcvLock)); - //} - //graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - - //if( (i%(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS)) == ntid) - for(int i=0; i<graph->lMap[f]->r_con.nrcv; i++) - { - Package *rcvMetaPackage = graph->rMap[f]->r_con.rcv[i].pQueue.dequeue(true); - rcvMetaPackage->completed = false; - rcvMetaPackage->served = false; - rcvMetaPackage->request = 0; - graph->rMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvMetaPackage,true); - Package *rcvPackage = graph->lMap[f]->r_con.rcv[i].pQueue.dequeue(true); - - mf.m_fabs_v[f]->copyFromMem(graph->lMap[f]->r_con.rcv[i].dbx,0,nComp,rcvPackage->databuf.local()); - rcvPackage->completed = false; - graph->lMap[f]->r_con.rcv[i].recycleQueue.enqueue(rcvPackage,true); - } - //graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - - //if(ntid==0) - //{ - graph->lMap[f]->r_con.firingRuleCnt = graph->lMap[f]->r_con.firingRuleCnt - graph->lMap[f]->r_con.nrcv; - for(int i=0; i<graph->lMap[f]->r_con.nrcv; i++) - if(graph->lMap[f]->r_con.rcv[i].pQueue.queueSize(true) >= 1) - if(graph->lMap[f]->r_con.rcv[i].pQueue.getFront(true)->checkRequest()) - graph->lMap[f]->r_con.firingRuleCnt++; - pthread_mutex_unlock(&(graph->lMap[f]->r_con.rcvLock)); - pthread_mutex_unlock(&(graph->rMap[f]->r_con.rcvLock)); - } - //graph->worker[tg]->barr->sync(perilla::NUM_THREADS_PER_TEAM-perilla::NUM_COMM_THREADS); // Barrier to synchronize team threads - } // fillBoundaryPull - diff --git a/Src/AmrTask/rts_impls/upcxx/PerillaConfig.H b/Src/AmrTask/rts_impls/upcxx/PerillaConfig.H deleted file mode 100755 index 8b2609001e1..00000000000 --- 
a/Src/AmrTask/rts_impls/upcxx/PerillaConfig.H +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef P_PERILLACONFIG_H -#define P_PERILLACONFIG_H - -//class PerillaConfig -namespace perilla -{ - static const int NUM_THREAD_TEAMS=4; - static const int NUM_THREADS_PER_TEAM=4; - static const int NUM_COMM_THREADS=1; - static const int NUM_PREGENERATED_PACKAGES=3; - static const int MSG_QUEUE_DEFAULT_MAXSIZE=3; // same as num of pregenerated packages because these get swapped between pQ and recycleQ - static const int TASK_QUEUE_DEFAULT_SIZE=512; - static const int MAX_SQRT_TAG=512; //sqrt(512*512) - static const bool NUMA_AWARE_MESSAGEHANDLER=true; - static const int LOCK_GRANULARITY=1; //!0 is queue level, 1 is region level - - static const bool LAZY_PUSH=false; - static const bool LAZY_PULL=false; - static const bool PACKING_FINEGRAIN=false; //!PACKING_FINEGRAIN = TRUE is effective when LAZY_PUSH=FALSE - static const bool UNPACKING_FINEGRAIN=false; //!is effective when LAZY_PUSH=FALSE -}; -#endif diff --git a/Src/AmrTask/rts_impls/upcxx/PerillaRts.H b/Src/AmrTask/rts_impls/upcxx/PerillaRts.H deleted file mode 100644 index 23d4676dfdf..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/PerillaRts.H +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include - - -namespace perilla{ - -struct _workerThreadInfo{ - int _tid; //thread id in local group - int _size; //number of threads in the group -}; - -struct _threadInfo{ - bool _isComm; //whether this thread handles communication - int _wtid; //worker thread id (-1 if this thread is dedicated to communication) - int _nWts; //number of thread groups -}; - -class RTS -{ - private: - int _nWrks; - void RTS_Init(); - int _rank, _nProcs; - - public: - RTS(){ - _nWrks=1; - char* nWrks= getenv("NWORKERS"); - if(nWrks) _nWrks= atoi(nWrks); - } - RTS(int nWrks):_nWrks(nWrks){} - int ProcCount(); - int MyProc(); - int WorkerThreadCount(); - int MyWorkerThread(); - void Init(); //Build the runtime system from scratch - void Init(int rank, int nProcs);//Build the runtime system on pre-existing MPI processes - void Iterate(void *graph, int max_step, Real stop_time); - void Finalize(); -// double Time(); - void Barrier(); - void runAMR(Amr* amrptr, int max_step, Real stop_time); -}; - -} diff --git a/Src/AmrTask/rts_impls/upcxx/PerillaRts.cpp b/Src/AmrTask/rts_impls/upcxx/PerillaRts.cpp deleted file mode 100644 index 7dae4f39e11..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/PerillaRts.cpp +++ /dev/null @@ -1,211 +0,0 @@ -//Question? 
email tannguyen@lbl.gov -//Created 07-19-2017 -//Last modification 08-14-2017 -#include -#include -#include -#include -#include -#include -#include "PerillaRts.H" - -#include -#include -using namespace std; -#include - -using namespace perilla; -#ifdef PERILLA_DEBUG -#include -PerillaMemCheck memcheck; -#endif - -namespace perilla{ - Amr* amrptr; - struct RtsDomain{ - pthread_t *_threads; - int _size; - MyLock _lock; - RtsDomain():_threads(NULL), _size(0){}; - ~RtsDomain(){ - free(_threads); - } - }; - int numa_nodes; - RtsDomain *dom; - MyLock _l; - volatile char startSignal=0; - pthread_mutex_t startLock= PTHREAD_MUTEX_INITIALIZER; - - int RTS::ProcCount(){ - return _nProcs; - } - - int RTS::MyProc(){ - return _rank; - } - - int RTS::WorkerThreadCount(){ - return _nWrks; - } - - int RTS::MyWorkerThread(){ - return 0; - } - - struct argT { - int numaID; - int tid; - int g_tid; - int nThreads; - int nTotalThreads; - int max_step; - Real stop_time; - RTS* thisRTS; - }; - - void RTS::runAMR(Amr* amr, int max_step, Real stop_time){ - while (amr->okToContinue() && - (amr->levelSteps(0) < max_step || max_step < 0) && - (amr->cumTime() < stop_time || stop_time < 0.0) ) - - { - // Do a coarse timestep, which calls one or multiple timestep updates (i.e. timeStep()) at each AMR level - amr->coarseTimeStep(stop_time); - } - } - -#ifdef USE_PERILLA_PTHREADS - void run(void* threadInfo){ - argT *args= (argT*)threadInfo; - int numaID= args->numaID; - int tid= args->tid; - int g_tid= args->g_tid; - int nThreads= args->nThreads; - int nTotalThreads= args->nTotalThreads; - int max_step= args->max_step; - Real stop_time= args->stop_time; - RTS* rts= args->thisRTS; - Perilla::registerId(g_tid); - //done with thread id setup, now wait for the start signal from master - pthread_mutex_lock(&startLock); - startSignal++; - pthread_mutex_unlock(&startLock); - while(startSignal!= nTotalThreads){} - rts->runAMR(amrptr, max_step, stop_time); - } -#endif - - void InitializeMPI(){ - int provided; - MPI_Init_thread(0, 0, MPI_THREAD_FUNNELED, &provided); - if(provided == MPI_THREAD_SINGLE){//with this MPI, process can't spawn threads - cerr << "Spawning threads is not allowed by the MPI implementation" << std::endl;; - } - } - - void RTS::RTS_Init(){ - amrptr= NULL; - } - - void RTS::Init(){ - InitializeMPI(); - MPI_Comm_rank(MPI_COMM_WORLD, &_rank); - MPI_Comm_size(MPI_COMM_WORLD, &_nProcs); - RTS_Init(); - } - - void RTS::Init(int rank, int nProcs){ - _rank= rank; - _nProcs= nProcs; - RTS_Init(); - } - - void RTS::Finalize(){ -#ifdef PERILLA_DEBUG - memcheck.report(); -#endif - } - - void RTS::Iterate(void* amrGraph, int max_step, Real stop_time){ - assert(amrGraph); - Perilla::max_step= max_step; - amrptr= (Amr*)amrGraph; - WorkerThread::init(); -#ifndef USE_PERILLA_PTHREADS - runAMR(amrptr, max_step, stop_time); -#else - int numa_nodes= perilla::NUM_THREAD_TEAMS; - int worker_per_numa = perilla::NUM_THREADS_PER_TEAM; - int _nWrks= numa_nodes*worker_per_numa; - int base=0; - int localID=-1; - //create a list of persistent threads for each NUMA node - cpu_set_t cpuset; - pthread_attr_t attr; - pthread_attr_init(&attr); - dom= new RtsDomain[numa_nodes]; - for(int i=0; inumaID= domNo; - arg->tid= localID; - arg->g_tid= domNo*worker_per_numa+localID; - arg->nThreads= worker_per_numa; - arg->nTotalThreads= _nWrks; - arg->thisRTS= this; - arg->max_step= max_step; - arg->stop_time= stop_time; - int err = pthread_create(&(dom[domNo]._threads[localID]), &attr, (void*(*)(void*))run, arg); - }else{ //master thread - 
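/* The spawn loop above creates one persistent pthread per worker on each
   NUMA domain; the matching handshake is the startSignal counter: each
   thread (including the master, below) increments it under startLock and
   then spin-waits until all nTotalThreads have checked in before entering
   runAMR, so no worker begins stepping the AMR hierarchy early. The shape
   of the pattern, as used in run() above (sketch):

       pthread_mutex_lock(&startLock);
       startSignal++;
       pthread_mutex_unlock(&startLock);
       while (startSignal != nTotalThreads) {}   // busy-wait barrier
       rts->runAMR(amrptr, max_step, stop_time);
*/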
dom[domNo]._threads[localID]= pthread_self(); - Perilla::registerId(0); - //enable worker threads to start computing - pthread_mutex_lock(&startLock); - startSignal++; - pthread_mutex_unlock(&startLock); - } - dom[domNo]._size++; - if(localID == (worker_per_numa-1)){ - localID=-1; - base+= worker_per_numa; - } - } - while(startSignal!= _nWrks){}//wait until all threads have done the setup phase - runAMR(amrptr, max_step, stop_time); - for(int i=1; i<_nWrks; i++) pthread_join(dom[i/worker_per_numa]._threads[i%worker_per_numa], NULL); -#endif - } - -#if 0 - const double kMicro = 1.0e-6; - double RTS::Time() - { - struct timeval TV; - - const int RC = gettimeofday(&TV, NULL); - if(RC == -1) - { - printf("ERROR: Bad call to gettimeofday\n"); - return(-1); - } - return( ((double)TV.tv_sec) + kMicro * ((double)TV.tv_usec) ); - } -#endif - - void RTS::Barrier(){ - upcxx::barrier(); - } - -}//end namespace - diff --git a/Src/AmrTask/rts_impls/upcxx/perilla.mak b/Src/AmrTask/rts_impls/upcxx/perilla.mak deleted file mode 100755 index 757d77878da..00000000000 --- a/Src/AmrTask/rts_impls/upcxx/perilla.mak +++ /dev/null @@ -1,20 +0,0 @@ -CEXE_sources += Barrier.cpp -CEXE_sources += PackageQueue.cpp -CEXE_sources += Perilla.cpp -CEXE_sources += RGIter.cpp -CEXE_sources += RegionGraph.cpp -CEXE_sources += WorkerThread.cpp -CEXE_sources += AsyncMultiFabUtil.cpp - - -CEXE_headers += Barrier.H -CEXE_headers += Config.H -CEXE_headers += LocalConnection.H -CEXE_headers += PackageQueue.H -CEXE_headers += RegionGraph.H -CEXE_headers += RemoteConnection.H -CEXE_headers += WorkerThread.H -CEXE_headers += AsyncMultiFabUtil.H - - - diff --git a/Src/AmrTask/todolist b/Src/AmrTask/todolist deleted file mode 100644 index 3e52aba3d1b..00000000000 --- a/Src/AmrTask/todolist +++ /dev/null @@ -1 +0,0 @@ --Make the graph instantiation NUMA aware (important when there are tasks that allocate data) diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index 8e2b33fb4fa..f56250ab4fc 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -7,7 +7,9 @@ #include #include #include +#include #include +#include #include #include @@ -88,41 +90,77 @@ namespace amrex void ExecOnFinalize (PTR_TO_VOID_FUNC); void ExecOnInitialize (PTR_TO_VOID_FUNC); + //! This shuts up the compiler about unused variables + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void ignore_unused (const Ts&...) {} + //! Print out message to cerr and exit via amrex::Abort(). void Error (const std::string& msg); - namespace detail { void Error_host_doit (const char * msg); } - AMREX_GPU_HOST_DEVICE inline - void Error (const char * msg = 0) { + + void Error_host (const char* msg); + +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) + AMREX_GPU_EXTERNAL AMREX_GPU_DEVICE AMREX_NO_INLINE + void Error_device (const char * msg); +#endif + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void Error (const char* msg = 0) { #if AMREX_DEVICE_COMPILE - if (msg) AMREX_DEVICE_PRINTF("Error %s\n", msg); - AMREX_DEVICE_ASSERT(0); +#ifdef NDEBUG + amrex::ignore_unused(msg); +#else + Error_device(msg); +#endif #else - detail::Error_host_doit(msg); + Error_host(msg); #endif } //! Print out warning message to cerr. 
void Warning (const std::string& msg); - namespace detail { void Warning_host_doit (const char * msg); } - AMREX_GPU_HOST_DEVICE inline + + void Warning_host (const char * msg); + +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) + AMREX_GPU_EXTERNAL AMREX_GPU_DEVICE AMREX_NO_INLINE + void Warning_device (const char * msg); +#endif + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Warning (const char * msg) { #if AMREX_DEVICE_COMPILE - if (msg) AMREX_DEVICE_PRINTF("Warning %s\n", msg); +#ifdef NDEBUG + amrex::ignore_unused(msg); #else - detail::Warning_host_doit(msg); + Warning_device(msg); +#endif +#else + Warning_host(msg); #endif } //! Print out message to cerr and exit via abort(). void Abort (const std::string& msg); - namespace detail { void Abort_host_doit (const char * msg); } - AMREX_GPU_HOST_DEVICE inline + + void Abort_host (const char * msg); + +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) + AMREX_GPU_EXTERNAL AMREX_GPU_DEVICE AMREX_NO_INLINE + void Abort_device (const char * msg); +#endif + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Abort (const char * msg = 0) { #if AMREX_DEVICE_COMPILE - if (msg) AMREX_DEVICE_PRINTF("Abort %s\n", msg); - AMREX_DEVICE_ASSERT(0); +#ifdef NDEBUG + amrex::ignore_unused(msg); #else - detail::Abort_host_doit(msg); + Abort_device(msg); +#endif +#else + Abort_host(msg); #endif } @@ -131,32 +169,27 @@ namespace amrex * via abort(). Intended for use by the BL_ASSERT() macro * in . */ - namespace detail { void Assert_host_doit (const char* EX, const char* file, int line, - const char* msg); } - AMREX_GPU_HOST_DEVICE inline + + void Assert_host (const char* EX, const char* file, int line, const char* msg); + +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) + AMREX_GPU_EXTERNAL AMREX_GPU_DEVICE AMREX_NO_INLINE + void Assert_device (const char* EX, const char* file, int line, const char* msg); +#endif + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) { #if AMREX_DEVICE_COMPILE - if (msg) { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", - EX, file, line, msg); - } else { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", - EX, file, line); - } - AMREX_DEVICE_ASSERT(0); +#ifdef NDEBUG + amrex::ignore_unused(EX,file,line,msg); +#else + Assert_device(EX,file,line,msg); +#endif #else - detail::Assert_host_doit(EX, file, line, msg); + Assert_host(EX,file,line,msg); #endif } - /** - * \brief Prints out an out-of-memory message and aborts. It is - * called by various routines when a call to new fails. 
- * - * Called as amrex::OutOfMemory(__FILE__, __LINE__); - */ - void OutOfMemory (const char* file, int line); - /** * \brief This is used by amrex::Error(), amrex::Abort(), and amrex::Assert() * to ensure that when writing the message to stderr, that no additional diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index a727099aad0..9b2f9b47e99 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -1,15 +1,4 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include #include #include #include @@ -20,7 +9,9 @@ #include #include #include - +#include +#include +#include #include #ifdef AMREX_USE_CUPTI @@ -53,10 +44,22 @@ #include #endif -#include -#include +#if defined(__APPLE__) +#include +#endif -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace amrex { @@ -145,7 +148,7 @@ amrex::write_to_stderr_without_buffering (const char* str) } } -static +namespace { void write_lib_id(const char* msg) { @@ -158,9 +161,28 @@ write_lib_id(const char* msg) fwrite("::", 2, 1, stderr); } } +} void -amrex::detail::Error_host_doit (const char* msg) +amrex::Error (const std::string& msg) +{ + Error(msg.c_str()); +} + +void +amrex::Abort (const std::string& msg) +{ + Abort(msg.c_str()); +} + +void +amrex::Warning (const std::string& msg) +{ + Warning(msg.c_str()); +} + +void +amrex::Error_host (const char * msg) { if (system::error_handler) { system::error_handler(msg); @@ -169,18 +191,46 @@ amrex::detail::Error_host_doit (const char* msg) } else { write_lib_id("Error"); write_to_stderr_without_buffering(msg); +#ifdef _OPENMP +#pragma omp critical (amrex_abort_omp_critical) +#endif ParallelDescriptor::Abort(); } } +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) +#if AMREX_DEVICE_COMPILE +AMREX_GPU_DEVICE void -amrex::Error (const std::string& msg) +amrex::Error_device (const char * msg) { - Error(msg.c_str()); + if (msg) AMREX_DEVICE_PRINTF("Error %s\n", msg); + AMREX_DEVICE_ASSERT(0); +} +#endif +#endif + +void +amrex::Warning_host (const char * msg) +{ + if (msg) { + amrex::Print(Print::AllProcs,amrex::ErrorStream()) << msg << '!' << '\n'; + } +} + +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) +#if AMREX_DEVICE_COMPILE +AMREX_GPU_DEVICE +void +amrex::Warning_device (const char * msg) +{ + if (msg) AMREX_DEVICE_PRINTF("Warning %s\n", msg); } +#endif +#endif void -amrex::detail::Abort_host_doit (const char* msg) +amrex::Abort_host (const char * msg) { if (system::error_handler) { system::error_handler(msg); @@ -196,29 +246,20 @@ amrex::detail::Abort_host_doit (const char* msg) } } +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) +#if AMREX_DEVICE_COMPILE +AMREX_GPU_DEVICE void -amrex::Abort (const std::string& msg) +amrex::Abort_device (const char * msg) { - Abort(msg.c_str()); -} - -void -amrex::detail::Warning_host_doit (const char* msg) -{ - if (msg) - { - amrex::Print(Print::AllProcs,amrex::ErrorStream()) << msg << '!' 
<< '\n'; - } -} - -void -amrex::Warning (const std::string& msg) -{ - Warning(msg.c_str()); + if (msg) AMREX_DEVICE_PRINTF("Abort %s\n", msg); + AMREX_DEVICE_ASSERT(0); } +#endif +#endif void -amrex::detail::Assert_host_doit (const char* EX, const char* file, int line, const char* msg) +amrex::Assert_host (const char* EX, const char* file, int line, const char* msg) { const int N = 512; @@ -247,10 +288,31 @@ amrex::detail::Assert_host_doit (const char* EX, const char* file, int line, con throw RuntimeError(buf); } else { write_to_stderr_without_buffering(buf); +#ifdef _OPENMP +#pragma omp critical (amrex_abort_omp_critical) +#endif ParallelDescriptor::Abort(); } } +#if defined(AMREX_USE_GPU) && !defined(NDEBUG) +#if AMREX_DEVICE_COMPILE +AMREX_GPU_DEVICE +void +amrex::Assert_device (const char* EX, const char* file, int line, const char* msg) +{ + if (msg) { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", + EX, file, line, msg); + } else { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", + EX, file, line); + } + AMREX_DEVICE_ASSERT(0); +} +#endif +#endif + namespace { std::stack<PTR_TO_VOID_FUNC> The_Finalize_Function_Stack; @@ -315,13 +377,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, if (argc > 0) { if (argv[0][0] != '/') { - constexpr int bufSize = 1024; - char temp[bufSize]; - char *rCheck = getcwd(temp, bufSize); - if(rCheck == 0) { - amrex::Abort("**** Error: getcwd buffer too small."); - } - system::exename = temp; + system::exename = FileSystem::CurrentPath(); system::exename += "/"; } system::exename += argv[0]; @@ -333,7 +389,7 @@ } } -#if defined(PERILLA_USE_UPCXX) || defined(AMREX_USE_UPCXX) +#if defined(AMREX_USE_UPCXX) upcxx::init(); #endif @@ -433,7 +489,8 @@ if (invalid) curr_fpe_excepts |= FE_INVALID; if (divbyzero) curr_fpe_excepts |= FE_DIVBYZERO; if (overflow) curr_fpe_excepts |= FE_OVERFLOW; -#if !defined(__PGI) || (__PGIC__ >= 16) +#if !defined(AMREX_USE_DPCPP) && (!defined(__PGI) || (__PGIC__ >= 16)) + // xxxxx DPCPP todo: fpe trap prev_fpe_excepts = fegetexcept(); if (curr_fpe_excepts != 0) { feenableexcept(curr_fpe_excepts); // trap floating point exceptions @@ -482,6 +539,9 @@ BL_PROFILE_INITPARAMS(); #endif machine::Initialize(); +#ifdef AMREX_USE_GPU + Gpu::Fuser::Initialize(); +#endif if (system::verbose > 0) { @@ -611,7 +671,7 @@ amrex::Finalize (amrex::AMReX* pamrex) Gpu::Device::Finalize(); #endif -#if defined(PERILLA_USE_UPCXX) || defined(AMREX_USE_UPCXX) +#if defined(AMREX_USE_UPCXX) upcxx::finalize(); #endif diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H index a9aa54d6038..06d3055a9ed 100644 --- a/Src/Base/AMReX_Algorithm.H +++ b/Src/Base/AMReX_Algorithm.H @@ -6,6 +6,7 @@ #include #include #include +#include namespace amrex { @@ -55,6 +56,50 @@ namespace amrex t1 = std::move(t2); t2 = std::move(temp); } + + template <typename T> + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + constexpr const T& Clamp (const T& v, const T& lo, const T& hi) + { + return (v < lo) ? lo : (hi < v) ? hi : v; + } + + template <typename T, typename F> + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T bisect (T lo, T hi, F f, T tol=1e-12, int max_iter=100) + { + AMREX_ASSERT_WITH_MESSAGE(hi > lo, + "Error - calling bisect but lo and hi don't describe a reasonable interval."); + + T flo = f(lo); + T fhi = f(hi); + + if (flo == T(0)) return lo; + if (fhi == T(0)) return hi; + + AMREX_ASSERT_WITH_MESSAGE(flo * fhi <= T(0), + "Error - calling bisect but lo and hi don't bracket a root."); + + T mi = (lo + hi) / T(2); + T fmi = 0.0; + int n = 1; + while (n <= max_iter) + { + if (hi - lo < tol) break; + mi = (lo + hi) / T(2); + fmi = f(mi); + if (fmi == T(0)) break; + fmi*flo < T(0) ? hi = mi : lo = mi; + flo = f(lo); + fhi = f(hi); + ++n; + } + + AMREX_ASSERT_WITH_MESSAGE(n < max_iter, + "Error - maximum number of iterations reached in bisect."); + + return mi; + } } #endif diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index 7cfa8d4c209..cc230e41a86 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -89,8 +89,6 @@ public: static void PrintUsage (); static void Finalize (); -protected: - #if 0 union Word { @@ -104,6 +102,8 @@ protected: static const std::size_t align_size = 16; +protected: + ArenaInfo arena_info; void* allocate_system (std::size_t nbytes); diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index 51344845933..044cb86fc7b 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -11,7 +11,17 @@ #include #include +#ifdef _WIN32 +///#include +//#define AMREX_MLOCK(x,y) VirtualLock(x,y) +//#define AMREX_MUNLOCK(x,y) VirtualUnlock(x,y) +#define AMREX_MLOCK(x,y) ((void)0) +#define AMREX_MUNLOCK(x,y) ((void)0) +#else #include +#define AMREX_MLOCK(x,y) mlock(x,y) +#define AMREX_MUNLOCK(x,y) munlock(x,y) +#endif namespace amrex { @@ -27,6 +37,11 @@ namespace { bool use_buddy_allocator = false; Long buddy_allocator_size = 0L; Long the_arena_init_size = 0L; +#ifdef AMREX_USE_HIP + bool the_arena_is_managed = false; // xxxxx HIP FIX HERE +#else + bool the_arena_is_managed = true; +#endif bool abort_on_out_of_gpu_memory = false; } @@ -54,7 +69,7 @@ Arena::allocate_system (std::size_t nbytes) if (arena_info.use_cpu_memory) { p = std::malloc(nbytes); - if (p && arena_info.device_use_hostalloc) mlock(p, nbytes); + if (p && arena_info.device_use_hostalloc) AMREX_MLOCK(p, nbytes); } else if (arena_info.device_use_hostalloc) { @@ -76,9 +91,9 @@ if (arena_info.device_use_managed_memory) { AMREX_HIP_OR_CUDA_OR_DPCPP - (AMREX_HIP_SAFE_CALL(hipMalloc(&p, nbytes));, + (AMREX_HIP_SAFE_CALL(hipMallocManaged(&p, nbytes));, AMREX_CUDA_SAFE_CALL(cudaMallocManaged(&p, nbytes));, - p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext());); + p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext())); if (arena_info.device_set_readonly) { Gpu::Device::mem_advise_set_readonly(p, nbytes); @@ -94,12 +109,12 @@ AMREX_HIP_OR_CUDA_OR_DPCPP (AMREX_HIP_SAFE_CALL ( hipMalloc(&p, nbytes));, AMREX_CUDA_SAFE_CALL(cudaMalloc(&p, nbytes));, - p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext());); + p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext())); } } #else p = std::malloc(nbytes); - if (p && arena_info.device_use_hostalloc) mlock(p, nbytes); + if (p && arena_info.device_use_hostalloc) AMREX_MLOCK(p, nbytes); #endif if (p == nullptr) amrex::Abort("Sorry, malloc failed"); 
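/* A minimal usage sketch for the Clamp and bisect utilities added to
   AMReX_Algorithm.H above (illustrative only; assumes amrex::Real is double
   and uses the default tol and max_iter):

       int ic = amrex::Clamp(i, 0, n-1);              // clamp an index into [0, n-1]

       amrex::Real root = amrex::bisect(0.0, 2.0,
           [] (amrex::Real x) { return x*x - 1.0; }); // converges to ~1.0

   bisect requires f(lo) and f(hi) to bracket a root (flo*fhi <= 0), which
   the assertions above enforce in debug builds. */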
return p; @@ -111,7 +126,7 @@ Arena::deallocate_system (void* p, std::size_t nbytes) #ifdef AMREX_USE_GPU if (arena_info.use_cpu_memory) { - if (p && arena_info.device_use_hostalloc) munlock(p, nbytes); + if (p && arena_info.device_use_hostalloc) AMREX_MUNLOCK(p, nbytes); std::free(p); } else if (arena_info.device_use_hostalloc) @@ -126,10 +141,10 @@ Arena::deallocate_system (void* p, std::size_t nbytes) AMREX_HIP_OR_CUDA_OR_DPCPP (AMREX_HIP_SAFE_CALL ( hipFree(p));, AMREX_CUDA_SAFE_CALL(cudaFree(p));, - sycl::free(p,Gpu::Device::syclContext());); + sycl::free(p,Gpu::Device::syclContext())); } #else - if (p && arena_info.device_use_hostalloc) munlock(p, nbytes); + if (p && arena_info.device_use_hostalloc) AMREX_MUNLOCK(p, nbytes); std::free(p); #endif } @@ -150,6 +165,7 @@ Arena::Initialize () pp.query("use_buddy_allocator", use_buddy_allocator); pp.query("buddy_allocator_size", buddy_allocator_size); pp.query("the_arena_init_size", the_arena_init_size); + pp.query("the_arena_is_managed", the_arena_is_managed); pp.query("abort_on_out_of_gpu_memory", abort_on_out_of_gpu_memory); #ifdef AMREX_USE_GPU @@ -165,13 +181,21 @@ Arena::Initialize () } std::size_t chunk = 512*1024*1024; buddy_allocator_size = (buddy_allocator_size/chunk) * chunk; - the_arena = new DArena(buddy_allocator_size, 512, ArenaInfo().SetPreferred()); + if (the_arena_is_managed) { + the_arena = new DArena(buddy_allocator_size, 512, ArenaInfo().SetPreferred()); + } else { + the_arena = new DArena(buddy_allocator_size, 512, ArenaInfo().SetDeviceMemory()); + } } else #endif { #if defined(BL_COALESCE_FABS) || defined(AMREX_USE_GPU) - the_arena = new CArena(0, ArenaInfo().SetPreferred()); + if (the_arena_is_managed) { + the_arena = new CArena(0, ArenaInfo().SetPreferred()); + } else { + the_arena = new CArena(0, ArenaInfo().SetDeviceMemory()); + } #ifdef AMREX_USE_GPU if (the_arena_init_size <= 0) { #ifdef AMREX_USE_DPCPP diff --git a/Src/Base/AMReX_Array.H b/Src/Base/AMReX_Array.H index 6b2154542f1..f53ad13429c 100644 --- a/Src/Base/AMReX_Array.H +++ b/Src/Base/AMReX_Array.H @@ -39,25 +39,29 @@ namespace amrex { T& operator [] (int i) noexcept { return arr[i]; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - const T* data () const noexcept { return arr; }; + const T* data () const noexcept { return arr; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - T* data () noexcept { return arr; }; + T* data () noexcept { return arr; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - std::size_t size() const noexcept { return N; }; + std::size_t size() const noexcept { return N; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - const T* begin() const noexcept { return arr; }; + const T* begin() const noexcept { return arr; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - const T* end() const noexcept { return arr + N; }; + const T* end() const noexcept { return arr + N; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - T* begin() noexcept { return arr; }; + T* begin() noexcept { return arr; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - T* end() noexcept { return arr + N; }; + T* end() noexcept { return arr + N; } + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void fill( const T& value ) noexcept + { for (std::size_t i = 0; i < N; ++i) arr[i] = value; } T arr[amrex::max(N,std::size_t{1})]; }; @@ -75,11 +79,13 @@ namespace amrex { { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i) const noexcept { + AMREX_ASSERT(i >= XLO && i <= XHI); return arr[i-XLO]; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i) noexcept { + AMREX_ASSERT(i >= 
XLO && i <= XHI); return arr[i-XLO]; } @@ -94,6 +100,7 @@ namespace amrex { typename std::enable_if::value,int>::type=0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { + AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); return arr[i+j*(XHI-XLO+1)-(YLO*(XHI-XLO+1)+XLO)]; } @@ -101,6 +108,7 @@ namespace amrex { typename std::enable_if::value,int>::type=0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { + AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); return arr[i+j*(XHI-XLO+1)-(YLO*(XHI-XLO+1)+XLO)]; } @@ -108,6 +116,7 @@ namespace amrex { typename std::enable_if::value,int>::type=0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { + AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); return arr[j+i*(YHI-YLO+1)-(XLO*(YHI-YLO+1)+YLO)]; } @@ -115,6 +124,7 @@ namespace amrex { typename std::enable_if::value,int>::type=0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { + AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); return arr[j+i*(YHI-YLO+1)-(XLO*(YHI-YLO+1)+YLO)]; } @@ -166,31 +176,31 @@ namespace amrex template std::array GetArrOfPtrs (std::array& a) noexcept { - return {AMREX_D_DECL(&a[0], &a[1], &a[2])}; + return {{AMREX_D_DECL(&a[0], &a[1], &a[2])}}; } template std::array GetArrOfPtrs (const std::array,AMREX_SPACEDIM>& a) noexcept { - return {AMREX_D_DECL(a[0].get(), a[1].get(), a[2].get())}; + return {{AMREX_D_DECL(a[0].get(), a[1].get(), a[2].get())}}; } - + template std::array GetArrOfConstPtrs (const std::array& a) noexcept { - return {AMREX_D_DECL(&a[0], &a[1], &a[2])}; + return {{AMREX_D_DECL(&a[0], &a[1], &a[2])}}; } template std::array GetArrOfConstPtrs (const std::array& a) noexcept { - return {AMREX_D_DECL(a[0], a[1], a[2])}; + return {{AMREX_D_DECL(a[0], a[1], a[2])}}; } template std::array GetArrOfConstPtrs (const std::array,AMREX_SPACEDIM>& a) noexcept { - return {AMREX_D_DECL(a[0].get(), a[1].get(), a[2].get())}; + return {{AMREX_D_DECL(a[0].get(), a[1].get(), a[2].get())}}; } } @@ -210,4 +220,3 @@ namespace amrex } #endif - diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H index a23fd8ed1ff..3fd9ad9c62a 100644 --- a/Src/Base/AMReX_Array4.H +++ b/Src/Base/AMReX_Array4.H @@ -10,17 +10,17 @@ namespace amrex { struct Array4 { T* AMREX_RESTRICT p; - Long jstride; - Long kstride; - Long nstride; - Dim3 begin; - Dim3 end; // end is hi + 1 - int ncomp; + Long jstride = 0; + Long kstride = 0; + Long nstride = 0; + Dim3 begin{1,1,1}; + Dim3 end{0,0,0}; // end is hi + 1 + int ncomp=0; AMREX_GPU_HOST_DEVICE constexpr Array4 () noexcept : p(nullptr) {} - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE constexpr Array4 (Array4::type> const& rhs) noexcept : p(rhs.p), @@ -44,9 +44,9 @@ namespace amrex { {} template ::type, - typename std::remove_const::type>::value>::type > + typename std::remove_const::type>::value,int>::type = 0> AMREX_GPU_HOST_DEVICE constexpr Array4 (Array4 const& rhs, int start_comp) noexcept : p((T*)(rhs.p+start_comp*rhs.nstride)), @@ -61,7 +61,7 @@ namespace amrex { AMREX_GPU_HOST_DEVICE explicit operator bool() const noexcept { return p != nullptr; } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -70,7 +70,7 @@ namespace amrex { return 
p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride]; } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -79,7 +79,7 @@ namespace amrex { return p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride]; } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (int i, int j, int k) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -88,7 +88,7 @@ namespace amrex { return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride); } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (int i, int j, int k, int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) @@ -97,7 +97,7 @@ namespace amrex { return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride); } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (IntVect const& iv) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -109,7 +109,7 @@ namespace amrex { #endif } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE U& operator() (IntVect const& iv, int n) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -121,7 +121,7 @@ namespace amrex { #endif } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (IntVect const& iv) const noexcept { #if (AMREX_SPACEDIM == 1) @@ -133,7 +133,7 @@ namespace amrex { #endif } - template ::value>::type > + template ::value,int>::type = 0> AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T* ptr (IntVect const& iv, int n) const noexcept { #if (AMREX_SPACEDIM == 1) diff --git a/Src/Base/AMReX_AsyncOut.H b/Src/Base/AMReX_AsyncOut.H index ed02e2f799e..523d6027df7 100644 --- a/Src/Base/AMReX_AsyncOut.H +++ b/Src/Base/AMReX_AsyncOut.H @@ -1,7 +1,6 @@ #ifndef AMREX_ASYNCOUT_H_ #define AMREX_ASYNCOUT_H_ -#include #include namespace amrex { diff --git a/Src/Base/AMReX_AsyncOut.cpp b/Src/Base/AMReX_AsyncOut.cpp index 66aa3594680..4d42552f9ad 100644 --- a/Src/Base/AMReX_AsyncOut.cpp +++ b/Src/Base/AMReX_AsyncOut.cpp @@ -1,49 +1,36 @@ #include +#include +#include #include #include +#include #include -#include -#include -#include -#include -#include -#include namespace amrex { namespace AsyncOut { namespace { +#if defined(AMREX_USE_DPCPP) || defined(AMREX_USE_HIP) +int s_asyncout = true; // Have this on by default for DPC++ for now so that + // I/O writing plotfile does not depend on unified + // memory. 
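/* AsyncOut offloads plotfile writing to a background thread. A typical
   inputs-file configuration (sketch; these are the ParmParse keys queried
   in Initialize() below):

       amrex.async_out = 1          # enable asynchronous output
       amrex.async_out_nfiles = 64  # cap on the number of output files

   When async_out_nfiles is smaller than the number of MPI ranks, the MPI
   library must provide MPI_THREAD_MULTIPLE at runtime; Initialize() below
   aborts with a message otherwise. */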
+#else int s_asyncout = false; +#endif int s_noutfiles = 64; MPI_Comm s_comm = MPI_COMM_NULL; -std::unique_ptr s_thread; -std::mutex s_mutx; -std::condition_variable s_cond; -static std::queue > s_func; -static bool s_finalizing = false; +std::unique_ptr s_thread; WriteInfo s_info; -void do_job () -{ - while (true) - { - std::unique_lock lck(s_mutx); - s_cond.wait(lck, [] () -> bool { return not s_func.empty(); }); - auto f = s_func.front(); - s_func.pop(); - lck.unlock(); - f(); - if (s_finalizing) break; - } -} - } void Initialize () { + amrex::ignore_unused(s_comm,s_info); + ParmParse pp("amrex"); pp.query("async_out", s_asyncout); pp.query("async_out_nfiles", s_noutfiles); @@ -51,19 +38,24 @@ void Initialize () int nprocs = ParallelDescriptor::NProcs(); s_noutfiles = std::min(s_noutfiles, nprocs); +#ifdef AMREX_USE_MPI if (s_asyncout and s_noutfiles < nprocs) { -#ifdef AMREX_MPI_THREAD_MULTIPLE + int provided = -1; + MPI_Query_thread(&provided); + if (provided < MPI_THREAD_MULTIPLE) + amrex::Abort("AsyncOut with " + std::to_string(s_noutfiles) + " and " + + std::to_string(nprocs) + " processes requires " + + "MPI_THREAD_MULTIPLE at runtime, but got " + + ParallelDescriptor::mpi_level_to_string(provided)); + int myproc = ParallelDescriptor::MyProc(); s_info = GetWriteInfo(myproc); MPI_Comm_split(ParallelDescriptor::Communicator(), s_info.ifile, myproc, &s_comm); -#else - amrex::Abort("AsyncOut with " + std::to_string(s_noutfiles) + " and " - +std::to_string(nprocs) + " processes requires MPI_THREAD_MULTIPLE"); -#endif } +#endif - if (s_asyncout) s_thread.reset(new std::thread(do_job)); + if (s_asyncout) s_thread.reset(new BackgroundThread()); ExecOnFinalize(Finalize); } @@ -71,8 +63,6 @@ void Initialize () void Finalize () { if (s_thread) { - Submit([] () { s_finalizing = true; }); - s_thread->join(); s_thread.reset(); } @@ -109,25 +99,17 @@ WriteInfo GetWriteInfo (int rank) void Submit (std::function&& a_f) { - std::lock_guard lck(s_mutx); - s_func.emplace(std::move(a_f)); - s_cond.notify_one(); + s_thread->Submit(std::move(a_f)); } void Submit (std::function const& a_f) { - std::lock_guard lck(s_mutx); - s_func.emplace(a_f); - s_cond.notify_one(); + s_thread->Submit(a_f); } void Finish () { - if (s_thread) { - Submit([] () { s_finalizing = true; }); - s_thread->join(); - s_thread.reset(new std::thread(do_job)); - } + s_thread->Finish(); } void Wait () diff --git a/Src/Base/AMReX_BCUtil.cpp b/Src/Base/AMReX_BCUtil.cpp index 033e81fc71a..3a950d29d4f 100644 --- a/Src/Base/AMReX_BCUtil.cpp +++ b/Src/Base/AMReX_BCUtil.cpp @@ -6,11 +6,11 @@ namespace amrex namespace { -void dummy_cpu_fill_extdir (Box const& bx, Array4 const& dest, - const int dcomp, const int numcomp, - GeometryData const& geom, const Real time, - const BCRec* bcr, const int bcomp, - const int orig_comp) +void dummy_cpu_fill_extdir (Box const& /*bx*/, Array4 const& /*dest*/, + const int /*dcomp*/, const int /*numcomp*/, + GeometryData const& /*geom*/, const Real /*time*/, + const BCRec* /*bcr*/, const int /*bcomp*/, + const int /*orig_comp*/) { // do something for external Dirichlet (BCType::ext_dir) if there are } @@ -18,11 +18,11 @@ void dummy_cpu_fill_extdir (Box const& bx, Array4 const& dest, struct dummy_gpu_fill_extdir { AMREX_GPU_DEVICE - void operator() (const IntVect& iv, Array4 const& dest, - const int dcomp, const int numcomp, - GeometryData const& geom, const Real time, - const BCRec* bcr, const int bcomp, - const int orig_comp) const + void operator() (const IntVect& /*iv*/, Array4 const& /*dest*/, + const int 
/*dcomp*/, const int /*numcomp*/, + GeometryData const& /*geom*/, const Real /*time*/, + const BCRec* /*bcr*/, const int /*bcomp*/, + const int /*orig_comp*/) const { // do something for external Dirichlet (BCType::ext_dir) if there are } diff --git a/Src/Base/AMReX_BLBackTrace.H b/Src/Base/AMReX_BLBackTrace.H index ab2335e133a..df9029ceff1 100644 --- a/Src/Base/AMReX_BLBackTrace.H +++ b/Src/Base/AMReX_BLBackTrace.H @@ -5,31 +5,17 @@ #include #include #include - #include #include -#include -#if defined(__APPLE__) -#include -#endif - -#include -#include - #ifdef _OPENMP #include #endif -#ifdef AMREX_BACKTRACING #define BL_PASTE2(x, y) x##y #define BL_PASTE(x, y) BL_PASTE2(x, y) #define BL_BACKTRACE_PUSH( S ) amrex::BLBTer BL_PASTE( bl_bter, __COUNTER__ )( S, __FILE__, __LINE__ ) #define BL_BACKTRACE_POP() amrex::BLBackTrace::bt_stack.pop() -#else -#define BL_BACKTRACE_PUSH(S) ((void)0) -#define BL_BACKTRACE_POP(S) ((void)0) -#endif namespace amrex { @@ -41,16 +27,13 @@ struct BLBackTrace //! Non-abort backtrace. Prints to specified file and continues. static void print_backtrace_info (const std::string& filename); -#ifdef AMREX_BACKTRACING static std::stack > bt_stack; // threadprivate here doesn't work with Cray and Intel -#if defined(_OPENMP) && !defined(_CRAYC) && !defined(__INTEL_COMPILER) +#if defined(_OPENMP) && !defined(_CRAYC) && !defined(__INTEL_COMPILER) && !defined(__PGI) #pragma omp threadprivate(bt_stack) #endif -#endif // AMREX_BACKTRACING }; -#ifdef AMREX_BACKTRACING class BLBTer { public: @@ -60,7 +43,6 @@ private: std::string line_file; void pop_bt_stack(); }; -#endif // AMREX_BACKTRACING } diff --git a/Src/Base/AMReX_BLBackTrace.cpp b/Src/Base/AMReX_BLBackTrace.cpp index 77ccba2bbd7..e78cf7549e9 100644 --- a/Src/Base/AMReX_BLBackTrace.cpp +++ b/Src/Base/AMReX_BLBackTrace.cpp @@ -1,22 +1,23 @@ -#include -#include -#include -#include -#include - -#include - #include #include #include #include #include #include +#include #ifdef AMREX_TINY_PROFILING #include #endif +#include +#include +#include +#include +#include +#include +#include + #if defined(AMREX_EXPORT_DYNAMIC) && defined(__APPLE__) #include #include @@ -25,11 +26,14 @@ #define AMREX_BACKTRACE_SUPPORTED 1 #endif +#ifndef _WIN32 +#include +#include +#endif + namespace amrex { -#ifdef AMREX_BACKTRACING std::stack > BLBackTrace::bt_stack; -#endif void BLBackTrace::handler(int s) @@ -78,7 +82,6 @@ BLBackTrace::handler(int s) amrex::ErrorStream() << "See " << errfilename << " file for details" << std::endl; -#ifdef AMREX_BACKTRACING if (!bt_stack.empty()) { std::ofstream errfile; errfile.open(errfilename.c_str(), std::ofstream::out | std::ofstream::app); @@ -92,7 +95,6 @@ BLBackTrace::handler(int s) errfile << std::endl; } } -#endif #ifdef AMREX_TINY_PROFILING { @@ -107,7 +109,7 @@ BLBackTrace::handler(int s) #endif if (ParallelDescriptor::NProcs() > 1) { - sleep(3); + amrex::Sleep(3); } #endif @@ -292,8 +294,6 @@ BLBackTrace::print_backtrace_info (FILE* f) #endif } -#ifdef AMREX_BACKTRACING - BLBTer::BLBTer(const std::string& s, const char* file, int line) { std::ostringstream ss; @@ -353,6 +353,4 @@ BLBTer::pop_bt_stack() } } -#endif - } diff --git a/Src/Base/AMReX_BLFort.H b/Src/Base/AMReX_BLFort.H index b2dc6a41f50..b7311f7fe9e 100644 --- a/Src/Base/AMReX_BLFort.H +++ b/Src/Base/AMReX_BLFort.H @@ -110,7 +110,7 @@ // Define macros for doing reductions. 
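/* With the AMREX_BACKTRACING guard removed, BL_BACKTRACE_PUSH is always
   available. A usage sketch for annotating a region so that it appears in
   the Backtrace file written by the handler (the function name here is a
   placeholder):

       void advance_hydro ()
       {
           BL_BACKTRACE_PUSH("advance_hydro");
           // ... work that might call amrex::Abort() ...
       }

   The macro creates a scoped BLBTer object, so the matching pop happens
   automatically when the scope exits; BL_BACKTRACE_POP() is also available
   for popping bt_stack explicitly. */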
-#ifdef AMREX_USE_GPU +#if defined(AMREX_USE_GPU) && defined(AMREX_USE_GPU_PRAGMA) #define AMREX_MFITER_REDUCE_SUM(var) mfi.add_reduce_value(var, amrex::MFIter::SUM) #define AMREX_MFITER_REDUCE_MIN(var) mfi.add_reduce_value(var, amrex::MFIter::MIN) #define AMREX_MFITER_REDUCE_MAX(var) mfi.add_reduce_value(var, amrex::MFIter::MAX) diff --git a/Src/Base/AMReX_BLProfiler.H b/Src/Base/AMReX_BLProfiler.H index cb4eff409b2..e242751018e 100644 --- a/Src/Base/AMReX_BLProfiler.H +++ b/Src/Base/AMReX_BLProfiler.H @@ -525,17 +525,17 @@ namespace amrex { class BLProfiler { public: - explicit BLProfiler(const std::string &funcname) { } + explicit BLProfiler(const std::string &/*funcname*/) { } static void Initialize() { } static void InitParams() { } static void Finalize() { } - static void WriteStats(std::ostream &os) { } + static void WriteStats(std::ostream &/*os*/) { } static void WriteCommStats() { } void start() { } void stop() { } - static void InitParams(const Real ptl, const bool writeall, - const bool writefabs) { } - static void AddStep(const int snum) { } + static void InitParams(const Real /*ptl*/, const bool /*writeall*/, + const bool /*writefabs*/) { } + static void AddStep(const int /*snum*/) { } }; } diff --git a/Src/Base/AMReX_BLProfiler.cpp b/Src/Base/AMReX_BLProfiler.cpp index 8ba85b83477..21bfd2a2faa 100644 --- a/Src/Base/AMReX_BLProfiler.cpp +++ b/Src/Base/AMReX_BLProfiler.cpp @@ -513,11 +513,13 @@ namespace BLProfilerUtils { void WriteHeader(std::ostream &ios, const int colWidth, const Real maxlen, const bool bwriteavg) { + int maxlenI = int(maxlen); + if(bwriteavg) { - ios << std::setfill('-') << std::setw(maxlen+4 + 7 * (colWidth+2)) + ios << std::setfill('-') << std::setw(maxlenI+4 + 7 * (colWidth+2)) << std::left << "Total times " << '\n'; ios << std::right << std::setfill(' '); - ios << std::setw(maxlen + 2) << "Function Name" + ios << std::setw(maxlenI + 2) << "Function Name" << std::setw(colWidth + 2) << "NCalls" << std::setw(colWidth + 2) << "Min" << std::setw(colWidth + 2) << "Avg" @@ -527,10 +529,10 @@ void WriteHeader(std::ostream &ios, const int colWidth, << std::setw(colWidth + 4) << "Percent %" << '\n'; } else { - ios << std::setfill('-') << std::setw(maxlen+4 + 3 * (colWidth+2)) + ios << std::setfill('-') << std::setw(maxlenI+4 + 3 * (colWidth+2)) << std::left << "Total times " << '\n'; ios << std::right << std::setfill(' '); - ios << std::setw(maxlen + 2) << "Function Name" + ios << std::setw(maxlenI + 2) << "Function Name" << std::setw(colWidth + 2) << "NCalls" << std::setw(colWidth + 2) << "Time" << std::setw(colWidth + 4) << "Percent %" @@ -544,6 +546,7 @@ void WriteRow(std::ostream &ios, const std::string &fname, const int colWidth, const Real maxlen, const bool bwriteavg) { + int maxlenI = int(maxlen); int numPrec(4), pctPrec(2); Real stdDev(0.0), coeffVariation(0.0); if(pstats.variance > 0.0) { @@ -555,7 +558,7 @@ void WriteRow(std::ostream &ios, const std::string &fname, if(bwriteavg) { ios << std::right; - ios << std::setw(maxlen + 2) << fname << " " + ios << std::setw(maxlenI + 2) << fname << " " << std::setw(colWidth) << pstats.nCalls << " " << std::setprecision(numPrec) << std::fixed << std::setw(colWidth) << pstats.minTime << " " @@ -570,7 +573,7 @@ void WriteRow(std::ostream &ios, const std::string &fname, << std::setprecision(pctPrec) << std::fixed << std::setw(colWidth) << percent << " %" << '\n'; } else { - ios << std::setw(maxlen + 2) << fname << " " + ios << std::setw(maxlenI + 2) << fname << " " << std::setw(colWidth) << pstats.nCalls << " " << 
std::setprecision(numPrec) << std::fixed << std::setw(colWidth) << pstats.totalTime << " " @@ -770,7 +773,7 @@ void WriteStats(std::ostream &ios, } // end namespace BLProfilerUtils void BLProfiler::WriteBaseProfile(bool bFlushing, bool memCheck) { // ---- write basic profiling data - + amrex::ignore_unused(memCheck); // --------------------------------------- gather global stats Real baseProfStart(amrex::second()); // time the timer const int nProcs(ParallelDescriptor::NProcs()); @@ -1162,8 +1165,9 @@ void BLProfiler::WriteCallTrace(bool bFlushing, bool memCheck) { // ---- write -void BLProfiler::WriteCommStats(bool bFlushing, bool memCheck) { - +void BLProfiler::WriteCommStats(bool bFlushing, bool memCheck) +{ + amrex::ignore_unused(bFlushing); Real wcsStart(amrex::second()); bool bAllCFTypesExcluded(OnExcludeList(AllCFTypes)); if( ! bAllCFTypesExcluded) { @@ -1461,6 +1465,7 @@ void BLProfiler::AddAllReduce(const CommFuncType cft, const int size, void BLProfiler::AddWait(const CommFuncType cft, const MPI_Request &req, const MPI_Status &status, const bool beforecall) { + amrex::ignore_unused(req); #ifdef BL_USE_MPI if(OnExcludeList(cft)) { return; @@ -1481,6 +1486,7 @@ void BLProfiler::AddWaitsome(const CommFuncType cft, const Vector & const int completed, const Vector &status, const bool beforecall) { + amrex::ignore_unused(reqs); #ifdef BL_USE_MPI if(OnExcludeList(cft)) { return; @@ -1658,28 +1664,28 @@ namespace amrex { BL_FORT_PROC_DECL(BL_PROFFORTFUNCSTART_CPP,bl_proffortfuncstart_cpp) ( - const int istr[], const int *NSTR + const int /*istr*/[], const int * /*NSTR*/ ) { } BL_FORT_PROC_DECL(BL_PROFFORTFUNCSTOP_CPP,bl_proffortfuncstop_cpp) ( - const int istr[], const int *NSTR + const int /*istr*/[], const int * /*NSTR*/ ) { } BL_FORT_PROC_DECL(BL_PROFFORTFUNCSTART_CPP_INT,bl_proffortfuncstart_cpp_int) ( - int i + int /*i*/ ) { } BL_FORT_PROC_DECL(BL_PROFFORTFUNCSTOP_CPP_INT,bl_proffortfuncstop_cpp_int) ( - int i + int /*i*/ ) { } diff --git a/Src/Base/AMReX_BackgroundThread.H b/Src/Base/AMReX_BackgroundThread.H new file mode 100644 index 00000000000..33283c60d49 --- /dev/null +++ b/Src/Base/AMReX_BackgroundThread.H @@ -0,0 +1,39 @@ +#ifndef AMREX_BACKGROUND_THREAD_H_ +#define AMREX_BACKGROUND_THREAD_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace amrex { + +class BackgroundThread +{ +public: + BackgroundThread (); + ~BackgroundThread (); + + void Submit (std::function&& a_f); + void Submit (std::function const& a_f); + + void Finish (); // Not required. Call this if you want all jobs to finish. 
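/* Usage sketch (illustrative; do_work is a placeholder):

       BackgroundThread bt;
       bt.Submit([] () { do_work(); });  // enqueue and return immediately
       bt.Finish();                      // optional: block until the queue drains

   The destructor enqueues a finalizing job and then joins, so everything
   submitted earlier is guaranteed to run before the thread exits. */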
+ +private: + void do_job (); + + std::unique_ptr m_thread; + std::mutex m_mutx; + std::condition_variable m_job_cond; + std::condition_variable m_done_cond; + std::queue > m_func; + bool m_clearing = false; + bool m_finalizing = false; +}; + +} + +#endif diff --git a/Src/Base/AMReX_BackgroundThread.cpp b/Src/Base/AMReX_BackgroundThread.cpp new file mode 100644 index 00000000000..31a2568d337 --- /dev/null +++ b/Src/Base/AMReX_BackgroundThread.cpp @@ -0,0 +1,63 @@ +#include + +namespace amrex { + +BackgroundThread::BackgroundThread () +{ + m_thread.reset(new std::thread(&BackgroundThread::do_job, this)); +} + +BackgroundThread::~BackgroundThread () +{ + if (m_thread) { + Submit([this] () { m_finalizing = true; }); + m_thread->join(); + m_thread.reset(); + } +} + +void BackgroundThread::do_job () +{ + while (true) + { + std::unique_lock lck(m_mutx); + m_job_cond.wait(lck, [this] () -> bool { return !m_func.empty(); }); + auto f = m_func.front(); + m_func.pop(); + lck.unlock(); + f(); + if (m_clearing) { // All jobs before this have finished. + m_done_cond.notify_one(); + } + if (m_finalizing) { + break; + } + } +} + +void BackgroundThread::Submit (std::function&& a_f) +{ + std::lock_guard lck(m_mutx); + m_func.emplace(std::move(a_f)); + m_job_cond.notify_one(); +} + +void BackgroundThread::Submit (std::function const& a_f) +{ + std::lock_guard lck(m_mutx); + m_func.emplace(a_f); + m_job_cond.notify_one(); +} + +void BackgroundThread::Finish () +{ + if (m_thread) { + Submit([this] () { m_clearing = true; }); + std::unique_lock lck(m_mutx); + m_done_cond.wait(lck, [this] () -> bool { return m_func.empty(); }); + m_clearing = false; + lck.unlock(); + } +} + +} diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index d9660dcf635..b33884ba933 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -33,11 +33,6 @@ #include #include -#ifdef USE_PERILLA -#include -#include -#endif - namespace amrex { @@ -278,7 +273,7 @@ public: */ void resize (const Box& b, int N = 1); - template ::value>::type > + template ::value,int>::type = 0> Elixir elixir () noexcept; /** @@ -298,7 +293,7 @@ public: } //! Returns bytes used in the Box for those components - std::size_t nBytes (const Box& bx, int start_comp, int ncomps) const noexcept + std::size_t nBytes (const Box& bx, int ncomps) const noexcept { return bx.numPts() * sizeof(T) * ncomps; } //! Returns the number of components @@ -1610,15 +1605,6 @@ protected: Long truesize = 0L; //!< nvar*numpts that was allocated on heap. bool ptr_owner = false; //!< Owner of T*? bool shared_memory = false; //!< Is the memory allocated in shared memory? 
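/* Sketch of the elixir() idiom declared above, for temporaries consumed by
   asynchronous GPU kernels (illustrative; FArrayBox is BaseFab<Real>):

       FArrayBox tmpfab(bx, ncomp);
       Elixir eli = tmpfab.elixir();   // takes over the lifetime of the data
       auto const& tmp = tmpfab.array();
       amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
       {
           tmp(i,j,k) = 0.0;           // use the temporary on the device
       });
       // tmpfab may now go out of scope; the Elixir keeps the allocation
       // alive until kernels on the current stream have completed.
*/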
- -#ifdef USE_PERILLA -public: - LocalConnection l_con; - RemoteConnection r_con; - bool fireable; - int padding[1024]; -#endif - }; template @@ -1659,12 +1645,14 @@ BaseFab::prefetchToHost () const noexcept // auto& q = Gpu::Device::streamQueue(); // q.submit([&] (sycl::handler& h) { h.prefetch(this->dptr, s); }); #elif defined(AMREX_USE_CUDA) - std::size_t s = sizeof(T)*this->nvar*this->domain.numPts(); - AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s, - cudaCpuDeviceId, - Gpu::gpuStream())); + if (Gpu::Device::devicePropMajor() >= 6) { + std::size_t s = sizeof(T)*this->nvar*this->domain.numPts(); + AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s, + cudaCpuDeviceId, + Gpu::gpuStream())); + } #elif defined(AMREX_USE_HIP) - // HIP FIX HERE + // xxxxx HIP FIX HERE after managed memory is supported #endif } #endif @@ -1681,12 +1669,14 @@ BaseFab::prefetchToDevice () const noexcept auto& q = Gpu::Device::streamQueue(); q.submit([&] (sycl::handler& h) { h.prefetch(this->dptr, s); }); #elif defined(AMREX_USE_CUDA) - std::size_t s = sizeof(T)*this->nvar*this->domain.numPts(); - AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s, - Gpu::Device::deviceId(), - Gpu::gpuStream())); + if (Gpu::Device::devicePropMajor() >= 6) { + std::size_t s = sizeof(T)*this->nvar*this->domain.numPts(); + AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s, + Gpu::Device::deviceId(), + Gpu::gpuStream())); + } #elif defined(AMREX_USE_HIP) - // HIP FIX HERE + // xxxxx HIP FIX HERE after managed memory is supported #endif } #endif @@ -2036,7 +2026,7 @@ BaseFab::resize (const Box& b, int n) } template -template +template ::value,int>::type> Elixir BaseFab::elixir () noexcept { @@ -2470,7 +2460,7 @@ BaseFab::maxabs (const Box& subbox, int comp) const noexcept #endif { T r = 0; - amrex::Loop(subbox, [=,&r] (int i, int j, int k) AMREX_NOEXCEPT + amrex::Loop(subbox, [=,&r] (int i, int j, int k) noexcept { r = amrex::max(r, amrex::Math::abs(a(i,j,k))); }); @@ -3943,7 +3933,7 @@ BaseFab::sum (const Box& bx, DestComp dcomp, NumComps ncomp) const noexcept } else #endif { - amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) AMREX_NOEXCEPT + amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) noexcept { r += a(i,j,k,n+dcomp.i); }); @@ -3983,7 +3973,7 @@ BaseFab::dot (const BaseFab& src, const Box& bx, SrcComp scomp, DestComp d } else #endif { - amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) AMREX_NOEXCEPT + amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) noexcept { r += d(i,j,k,n+dcomp.i) * s(i,j,k,n+scomp.i); }); @@ -4029,7 +4019,7 @@ BaseFab::dot (const Box& bx, DestComp dcomp, NumComps ncomp) const noexcept } else #endif { - amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) AMREX_NOEXCEPT + amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) noexcept { r += a(i,j,k,n+dcomp.i)*a(i,j,k,n+dcomp.i); }); @@ -4073,7 +4063,7 @@ BaseFab::dotmask (const BaseFab& src, const Box& bx, const BaseFab& m } else #endif { - amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) AMREX_NOEXCEPT + amrex::LoopOnCpu(bx, ncomp.n, [=,&r] (int i, int j, int k, int n) noexcept { int mi = static_cast(static_cast(m(i,j,k))); r += d(i,j,k,n+dcomp.i)*s(i,j,k,n+scomp.i)*mi; diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H index 2fba229ea29..73f68960685 100644 --- a/Src/Base/AMReX_BaseFabUtility.H +++ b/Src/Base/AMReX_BaseFabUtility.H @@ -9,11 +9,11 @@ template AMREX_GPU_HOST_DEVICE void cast 
(BaseFab& tofab, BaseFab const& fromfab, - Box const& bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) AMREX_NOEXCEPT + Box const& bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept { auto const& tdata = tofab.array(); auto const& fdata = fromfab.const_array(); - amrex::LoopConcurrent(bx, ncomp.n, [=] (int i, int j, int k, int n) AMREX_NOEXCEPT + amrex::LoopConcurrent(bx, ncomp.n, [=] (int i, int j, int k, int n) noexcept { tdata(i,j,k,n+dcomp.i) = static_cast(fdata(i,j,k,n+scomp.i)); }); diff --git a/Src/Base/AMReX_BlockMutex.H b/Src/Base/AMReX_BlockMutex.H index 8339bfce8a0..b00c55e797f 100644 --- a/Src/Base/AMReX_BlockMutex.H +++ b/Src/Base/AMReX_BlockMutex.H @@ -11,7 +11,7 @@ struct BlockMutex { union state_t { - struct { int blockid; int count; }; + struct II { int blockid; int count; } data; unsigned long long ull; }; @@ -32,6 +32,7 @@ struct BlockMutex void lock (int i) noexcept { #ifdef AMREX_USE_DPCPP // xxxxx DPCPP todo + amrex::ignore_unused(i); #else int blockid = blockIdx.z*blockDim.x*blockDim.y + blockIdx.y*blockDim.x + blockIdx.x; state_t old = m_state[i]; @@ -39,13 +40,13 @@ struct BlockMutex do { assumed = old; state_t val; - val.blockid = blockid; - if (assumed.blockid == blockid) { + val.data.blockid = blockid; + if (assumed.data.blockid == blockid) { // Already locked by another thread in this block. Need to ++count. - val.count = assumed.count + 1; + val.data.count = assumed.data.count + 1; } else { // Currently unlocked or locked by another block. Need to lock. - val.count = 1; + val.data.count = 1; assumed = FreeState(); } old.ull = atomicCAS((unsigned long long*)(m_state+i), assumed.ull, val.ull); @@ -57,19 +58,20 @@ struct BlockMutex void unlock (int i) noexcept { #ifdef AMREX_USE_DPCPP // xxxxx DPCPP todo + amrex::ignore_unused(i); #else state_t old = m_state[i]; state_t assumed; do { assumed = old; state_t val; - if (assumed.count == 1) { + if (assumed.data.count == 1) { // Need to unlock val = FreeState(); } else { // --count, but do NOT unlock val = assumed; - --val.count; + --val.data.count; } old.ull = atomicCAS((unsigned long long*)(m_state+i), assumed.ull, val.ull); } while (assumed.ull != old.ull); diff --git a/Src/Base/AMReX_BlockMutex.cpp b/Src/Base/AMReX_BlockMutex.cpp index e1bdf5d75bb..e16892fec17 100644 --- a/Src/Base/AMReX_BlockMutex.cpp +++ b/Src/Base/AMReX_BlockMutex.cpp @@ -6,6 +6,7 @@ namespace amrex { void BlockMutex::init_states (state_t* state, int N) noexcept { #ifdef AMREX_USE_DPCPP + amrex::ignore_unused(state,N); amrex::Abort("xxxxx DPCPP todo"); #else amrex::launch((N+255)/256, 256, Gpu::nullStream(), diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index 3a06cdccb6b..2927288e5e3 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -152,33 +152,33 @@ public: AMREX_GPU_HOST_DEVICE GpuArray length3d () const noexcept { #if (AMREX_SPACEDIM == 1) - return {bigend[0]-smallend[0]+1, 1, 1}; + return {{bigend[0]-smallend[0]+1, 1, 1}}; #elif (AMREX_SPACEDIM == 2) - return {bigend[0]-smallend[0]+1, bigend[1]-smallend[1]+1, 1}; + return {{bigend[0]-smallend[0]+1, bigend[1]-smallend[1]+1, 1}}; #elif (AMREX_SPACEDIM == 3) - return {bigend[0]-smallend[0]+1, bigend[1]-smallend[1]+1, bigend[2]-smallend[2]+1}; + return {{bigend[0]-smallend[0]+1, bigend[1]-smallend[1]+1, bigend[2]-smallend[2]+1}}; #endif } AMREX_GPU_HOST_DEVICE GpuArray loVect3d () const noexcept { #if (AMREX_SPACEDIM == 1) - return {smallend[0], 0, 0}; + return {{smallend[0], 0, 0}}; #elif (AMREX_SPACEDIM == 2) - return {smallend[0], smallend[1], 0}; + 
return {{smallend[0], smallend[1], 0}}; #elif (AMREX_SPACEDIM == 3) - return {smallend[0], smallend[1], smallend[2]}; + return {{smallend[0], smallend[1], smallend[2]}}; #endif } AMREX_GPU_HOST_DEVICE GpuArray hiVect3d () const noexcept { #if (AMREX_SPACEDIM == 1) - return {bigend[0], 0, 0}; + return {{bigend[0], 0, 0}}; #elif (AMREX_SPACEDIM == 2) - return {bigend[0], bigend[1], 0}; + return {{bigend[0], bigend[1], 0}}; #elif (AMREX_SPACEDIM == 3) - return {bigend[0], bigend[1], bigend[2]}; + return {{bigend[0], bigend[1], bigend[2]}}; #endif } @@ -473,6 +473,9 @@ public: AMREX_GPU_HOST_DEVICE Box& surroundingNodes (int dir) noexcept; + AMREX_GPU_HOST_DEVICE + Box& surroundingNodes (Direction d) noexcept { return surroundingNodes(static_cast(d)); } + //! Convert to CELL type in all directions. AMREX_GPU_HOST_DEVICE Box& enclosedCells () noexcept; @@ -481,6 +484,9 @@ public: AMREX_GPU_HOST_DEVICE Box& enclosedCells (int dir) noexcept; + AMREX_GPU_HOST_DEVICE + Box& enclosedCells (Direction d) noexcept { return enclosedCells(static_cast(d)); } + /** * \brief Return Box that is intersection of this Box * and argument. The Boxes MUST be of same type. @@ -498,17 +504,6 @@ public: return *this; } - //! for serialization - static size_t linearSize() noexcept - { - size_t retval = 2*IntVect::linearSize(); - return retval; - } - - //! for serialization - void linearOut(void* a_buffer ) const noexcept; - void linearIn(void* a_buffer ) noexcept; - /** * \brief Modify Box to that of the minimum Box containing both * the original Box and the argument. @@ -572,6 +567,9 @@ public: AMREX_GPU_HOST_DEVICE Box& grow (int idir, int n_cell) noexcept { smallend.shift(idir, -n_cell); bigend.shift(idir, n_cell); return *this; } + AMREX_GPU_HOST_DEVICE + Box& grow (Direction d, int n_cell) noexcept { return grow(static_cast(d), n_cell); } + /** * \brief Grow the Box on the low end by n_cell cells in direction idir. * NOTE: n_cell negative shrinks the Box by that number of cells. @@ -579,6 +577,9 @@ public: AMREX_GPU_HOST_DEVICE Box& growLo (int idir, int n_cell = 1) noexcept { smallend.shift(idir, -n_cell); return *this; } + AMREX_GPU_HOST_DEVICE + Box& growLo (Direction d, int n_cell = 1) noexcept { return growLo(static_cast(d), n_cell); } + /** * \brief Grow the Box on the high end by n_cell cells in * direction idir. NOTE: n_cell negative shrinks the Box by that @@ -587,6 +588,9 @@ public: AMREX_GPU_HOST_DEVICE Box& growHi (int idir, int n_cell = 1) noexcept { bigend.shift(idir,n_cell); return *this; } + AMREX_GPU_HOST_DEVICE + Box& growHi (Direction d, int n_cell = 1) noexcept { return growHi(static_cast(d), n_cell); } + //! Grow in the direction of the given face. 
AMREX_GPU_HOST_DEVICE Box& grow (Orientation face, int n_cell = 1) noexcept { @@ -871,7 +875,6 @@ Box::atOffset (Long offset) const noexcept static_cast(k+lo[2])) }; } -// HIP FIX HERE - Initialization List: {{ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE GpuArray @@ -899,34 +902,6 @@ Box::setRange (int dir, return *this; } -inline -void -Box::linearOut(void* a_buffer ) const noexcept -{ - unsigned char* buf = (unsigned char*) a_buffer; - const IntVect& ivlo = this->smallEnd(); - const IntVect& ivhi = this->bigEnd(); - ivlo.linearOut(buf); - buf += ivlo.linearSize(); - ivhi.linearOut(buf); -// buf += ivhi.linearSize(); -} - -inline -void -Box::linearIn(void* a_buffer ) noexcept -{ - unsigned char* buf = (unsigned char*) a_buffer; - IntVect ivlo; - IntVect ivhi; - ivlo.linearIn(buf); - buf += ivlo.linearSize(); - ivhi.linearIn(buf); -// buf += ivhi.linearSize(); - - *this = Box(ivlo, ivhi); -} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void @@ -1071,7 +1046,7 @@ public: inline BoxConverter::~BoxConverter () { } -void AllGatherBoxes (Vector& bxs); +void AllGatherBoxes (Vector& bxs, int n_extra_reserve=0); /** * \brief Grow Box in all directions by given amount. @@ -1107,6 +1082,13 @@ Box grow (const Box& b, int idir, int n_cell) noexcept return result; } +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Box grow (const Box& b, Direction d, int n_cell) noexcept +{ + return grow(b, static_cast(d), n_cell); +} + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box growLo (const Box& b, int idir, int n_cell) noexcept @@ -1116,6 +1098,13 @@ Box growLo (const Box& b, int idir, int n_cell) noexcept return result; } +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Box growLo (const Box& b, Direction d, int n_cell) noexcept +{ + return growLo(b, static_cast(d), n_cell); +} + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box growHi (const Box& b, int idir, int n_cell) noexcept @@ -1125,6 +1114,13 @@ Box growHi (const Box& b, int idir, int n_cell) noexcept return result; } +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Box growHi (const Box& b, Direction d, int n_cell) noexcept +{ + return growHi(b, static_cast(d), n_cell); +} + /** * \brief Coarsen Box by given (positive) refinement ratio. * NOTE: if type(dir) = CELL centered: lo <- lo/ratio and @@ -1226,6 +1222,13 @@ Box surroundingNodes (const Box& b, int dir) noexcept return bx; } +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Box surroundingNodes (const Box& b, Direction d) noexcept +{ + return surroundingNodes(b, static_cast(d)); +} + /** * \brief Returns a Box with NODE based coordinates in all * directions that encloses Box b. @@ -1273,6 +1276,13 @@ Box enclosedCells (const Box& b, int dir) noexcept return bx; } +AMREX_GPU_HOST_DEVICE +AMREX_FORCE_INLINE +Box enclosedCells (const Box& b, Direction d) noexcept +{ + return enclosedCells(b, static_cast(d)); +} + /** * \brief Returns a Box with CELL based coordinates in all * directions that is enclosed by b. diff --git a/Src/Base/AMReX_Box.cpp b/Src/Base/AMReX_Box.cpp index 5e6a92a65ac..b0d6423e032 100644 --- a/Src/Base/AMReX_Box.cpp +++ b/Src/Base/AMReX_Box.cpp @@ -104,19 +104,51 @@ BoxCommHelper::BoxCommHelper (const Box& bx, int* p_) } void -AllGatherBoxes (Vector& bxs) +AllGatherBoxes (Vector& bxs, int n_extra_reserve) { #ifdef BL_USE_MPI - // cell centered boxes only! 
-    const auto szof_bx = Box::linearSize();
-    const Long count = bxs.size() * static_cast<Long>(szof_bx);
-    const auto& countvec = ParallelDescriptor::Gather(count, ParallelDescriptor::IOProcessorNumber());
-
+#if 0
+    // In principle, MPI_Allgather/MPI_Allgatherv should not be slower than
+    // MPI_Gather/MPI_Gatherv followed by MPI_Bcast. But that's not true on Summit.
+    MPI_Comm comm = ParallelContext::CommunicatorSub();
+    const int count = bxs.size();
+    Vector<int> countvec(ParallelContext::NProcsSub());
+    MPI_Allgather(&count, 1, MPI_INT, countvec.data(), 1, MPI_INT, comm);
+
+    Vector<int> offset(countvec.size(),0);
+    Long count_tot = countvec[0];
+    for (int i = 1, N = offset.size(); i < N; ++i) {
+        offset[i] = offset[i-1] + countvec[i-1];
+        count_tot += countvec[i];
+    }
+
+    if (count_tot == 0) return;
+
+    if (count_tot > static_cast<Long>(std::numeric_limits<int>::max())) {
+        amrex::Abort("AllGatherBoxes: too many boxes");
+    }
+
+    Vector<Box> recv_buffer;
+    recv_buffer.reserve(count_tot+n_extra_reserve);
+    recv_buffer.resize(count_tot);
+    MPI_Allgatherv(bxs.data(), count, ParallelDescriptor::Mpi_typemap<Box>::type(),
+                   recv_buffer.data(), countvec.data(), offset.data(),
+                   ParallelDescriptor::Mpi_typemap<Box>::type(), comm);
+
+    std::swap(bxs,recv_buffer);
+#else
+    MPI_Comm comm = ParallelContext::CommunicatorSub();
+    const int root = ParallelContext::IOProcessorNumberSub();
+    const int myproc = ParallelContext::MyProcSub();
+    const int nprocs = ParallelContext::NProcsSub();
+    const int count = bxs.size();
+    Vector<int> countvec(nprocs);
+    MPI_Gather(&count, 1, MPI_INT, countvec.data(), 1, MPI_INT, root, comm);
+
     Long count_tot = 0L;
-    Vector<Long> offset(countvec.size(),0L);
-    if (ParallelDescriptor::IOProcessor())
-    {
+    Vector<int> offset(countvec.size(),0);
+    if (myproc == root) {
         count_tot = countvec[0];
         for (int i = 1, N = offset.size(); i < N; ++i) {
             offset[i] = offset[i-1] + countvec[i-1];
@@ -124,31 +156,28 @@ AllGatherBoxes (Vector<Box>& bxs)
         }
     }

-    ParallelDescriptor::Bcast(&count_tot, 1, ParallelDescriptor::IOProcessorNumber());
+    MPI_Bcast(&count_tot, 1, MPI_INT, root, comm);

     if (count_tot == 0) return;

-    Vector<char> send_buffer(count);
-    char* psend = (count > 0) ? send_buffer.data() : nullptr;
-    char* p = psend;
-    for (const auto& b : bxs) {
-        b.linearOut(p);
-        p += szof_bx;
+    if (count_tot > static_cast<Long>(std::numeric_limits<int>::max())) {
+        amrex::Abort("AllGatherBoxes: too many boxes");
     }

-    Vector<char> recv_buffer(count_tot);
-    ParallelDescriptor::Gatherv(psend, count, recv_buffer.data(), countvec, offset, ParallelDescriptor::IOProcessorNumber());
-
-    ParallelDescriptor::Bcast(recv_buffer.data(), count_tot, ParallelDescriptor::IOProcessorNumber());
+    Vector<Box> recv_buffer;
+    recv_buffer.reserve(count_tot+n_extra_reserve);
+    recv_buffer.resize(count_tot);
+    MPI_Gatherv(bxs.data(), count, ParallelDescriptor::Mpi_typemap<Box>::type(),
+                recv_buffer.data(), countvec.data(), offset.data(),
+                ParallelDescriptor::Mpi_typemap<Box>::type(), root, comm);
+    MPI_Bcast(recv_buffer.data(), count_tot, ParallelDescriptor::Mpi_typemap<Box>::type(),
+              root, comm);

-    const Long nboxes_tot = count_tot/szof_bx;
-    bxs.resize(nboxes_tot);
+    std::swap(bxs,recv_buffer);
+#endif

-    p = recv_buffer.data();
-    for (auto& b : bxs) {
-        b.linearIn(p);
-        p += szof_bx;
-    }
+#else
+    amrex::ignore_unused(bxs,n_extra_reserve);
 #endif
 }
diff --git a/Src/Base/AMReX_BoxArray.H b/Src/Base/AMReX_BoxArray.H
index 9f4d17ecbd3..7f3f8bfdd4d 100644
--- a/Src/Base/AMReX_BoxArray.H
+++ b/Src/Base/AMReX_BoxArray.H
@@ -516,6 +516,8 @@ using BndryBATransformer = BATransformer;
  */
 class MFIter;
+class AmrMesh;
+class FabArrayBase;

 class BoxArray
 {
@@ -551,6 +553,8 @@ public:
     explicit BoxArray (BoxList&& bl) noexcept;

     BoxArray (const BoxArray& rhs, const BATransformer& trans);
+
+    BoxArray (BoxList&& bl, IntVect const& max_grid_size);

    /**
     * \brief Initialize the BoxArray from a single box.
@@ -796,10 +800,16 @@ public:
    //! Make ourselves unique.
    void uniqify ();

+    friend class AmrMesh;
+    friend class FabArrayBase;
+
 private:
    //! Update BoxArray index type according the box type, and then convert boxes to cell-centered.
    void type_update ();

+    BoxList const& simplified_list () const; // For regular AMR grids only
+    BoxArray simplified () const;
+
    BARef::HashType& getHashMap () const;

    IntVect getDoiLo () const noexcept;
@@ -808,6 +818,7 @@ private:
    BATransformer m_bat;
    //! The data -- a reference-counted pointer to a Ref.
    std::shared_ptr<BARef> m_ref;
+    mutable std::shared_ptr<BoxList> m_simplified_list;
 };

 //! Write a BoxArray to an ostream in ASCII format.
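The Gather/Gatherv path in the rewritten AllGatherBoxes above first collects the per-rank box counts and then converts them into the displacement array that MPI_Gatherv expects. A standalone sketch of that counts-to-displacements step (the function name is illustrative, not part of AMReX):

#include <vector>

// Exclusive prefix sum of per-rank element counts, as required for the
// displs argument of MPI_Gatherv/MPI_Allgatherv.
std::vector<int> gatherv_displacements (const std::vector<int>& counts)
{
    std::vector<int> displs(counts.size(), 0);
    for (std::size_t i = 1; i < counts.size(); ++i) {
        displs[i] = displs[i-1] + counts[i-1];
    }
    return displs;
}

With counts {3, 1, 4} this yields displacements {0, 3, 4} and a total of 8 boxes, matching the count_tot accumulation in the code above.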
diff --git a/Src/Base/AMReX_BoxArray.cpp b/Src/Base/AMReX_BoxArray.cpp index 1ec8ed8e4f0..cf6ebb79a1c 100644 --- a/Src/Base/AMReX_BoxArray.cpp +++ b/Src/Base/AMReX_BoxArray.cpp @@ -10,9 +10,7 @@ #include #endif -#ifdef _OPENMP -#include -#endif +#include namespace amrex { @@ -280,7 +278,8 @@ BoxArray::BoxArray () BoxArray::BoxArray (const Box& bx) : m_bat(bx.ixType()), - m_ref(std::make_shared(amrex::enclosedCells(bx))) + m_ref(std::make_shared(amrex::enclosedCells(bx))), + m_simplified_list(std::make_shared(bx)) {} BoxArray::BoxArray (const BoxList& bl) @@ -327,15 +326,30 @@ BoxArray::BoxArray (const BoxArray& rhs, const BATransformer& trans) BoxArray::BoxArray (const BoxArray& rhs) : m_bat(rhs.m_bat), - m_ref(rhs.m_ref) + m_ref(rhs.m_ref), + m_simplified_list(rhs.m_simplified_list) {} +BoxArray::BoxArray (BoxList&& bl, IntVect const& max_grid_size) + : + m_bat(), + m_ref(std::make_shared()), + m_simplified_list(std::make_shared(std::move(bl))) +{ + BoxList tmpbl = *m_simplified_list; + tmpbl.maxSize(max_grid_size); + m_bat = BATransformer(tmpbl.ixType()); + m_ref->define(std::move(tmpbl)); + type_update(); +} + void BoxArray::define (const Box& bx) { clear(); m_bat = BATransformer(bx.ixType()); m_ref->define(amrex::enclosedCells(bx)); + m_simplified_list = std::make_shared(bx); } void @@ -361,6 +375,7 @@ BoxArray::clear () { m_bat = BATransformer(); m_ref.reset(new BARef()); + m_simplified_list.reset(); } void @@ -544,7 +559,11 @@ BoxArray::maxSize (const IntVect& block_size) blst.maxSize(block_size); const int N = blst.size(); if (size() != N) { // If size doesn't change, do nothing. + BoxList bak = (m_simplified_list) ? *m_simplified_list : BoxList(); define(std::move(blst)); + if (bak.isNotEmpty()) { + m_simplified_list = std::make_shared(std::move(bak)); + } } return *this; } @@ -1023,10 +1042,8 @@ BoxArray::minimalBox () const #pragma omp parallel #endif { -#ifndef _OPENMP - int tid = 0; -#else - int tid = omp_get_thread_num(); + int tid = OpenMP::get_thread_num(); +#ifdef _OPENMP #pragma omp for #endif for (int i = 0; i < N; ++i) { @@ -1075,10 +1092,8 @@ BoxArray::minimalBox (Long& npts_avg_box) const #pragma omp parallel reduction(+:npts_tot) #endif { -#ifndef _OPENMP - int tid = 0; -#else - int tid = omp_get_thread_num(); + int tid = OpenMP::get_thread_num(); +#ifdef _OPENMP #pragma omp for #endif for (int i = 0; i < N; ++i) { @@ -1260,99 +1275,87 @@ BoxArray::complementIn (BoxList& bl, const Box& bx) const bl.set(bx.ixType()); bl.push_back(bx); - if (!empty()) - { - BARef::HashType& BoxHashMap = getHashMap(); + if (empty()) return; - BL_ASSERT(bx.ixType() == ixType()); + BARef::HashType& BoxHashMap = getHashMap(); - Box gbx = bx; + BL_ASSERT(bx.ixType() == ixType()); - IntVect glo = gbx.smallEnd(); - IntVect ghi = gbx.bigEnd(); - const IntVect& doilo = getDoiLo(); - const IntVect& doihi = getDoiHi(); + Box gbx = bx; - gbx.setSmall(glo - doihi).setBig(ghi + doilo); - gbx.refine(crseRatio()).coarsen(m_ref->crsn); - - const IntVect& sm = amrex::max(gbx.smallEnd()-1, m_ref->bbox.smallEnd()); - const IntVect& bg = amrex::min(gbx.bigEnd(), m_ref->bbox.bigEnd()); + IntVect glo = gbx.smallEnd(); + IntVect ghi = gbx.bigEnd(); + const IntVect& doilo = getDoiLo(); + const IntVect& doihi = getDoiHi(); - Box cbx(sm,bg); - cbx.normalize(); + gbx.setSmall(glo - doihi).setBig(ghi + doilo); + gbx.refine(crseRatio()).coarsen(m_ref->crsn); - if (!cbx.intersects(m_ref->bbox)) return; + const IntVect& sm = amrex::max(gbx.smallEnd()-1, m_ref->bbox.smallEnd()); + const IntVect& bg = 
amrex::min(gbx.bigEnd(), m_ref->bbox.bigEnd()); - auto TheEnd = BoxHashMap.cend(); + Box cbx(sm,bg); + cbx.normalize(); - BoxList newbl(bl.ixType()); - newbl.reserve(bl.capacity()); - BoxList newdiff(bl.ixType()); + if (!cbx.intersects(m_ref->bbox)) return; - auto& abox = m_ref->m_abox; + auto TheEnd = BoxHashMap.cend(); - for (IntVect iv = cbx.smallEnd(), End = cbx.bigEnd(); - iv <= End && bl.isNotEmpty(); - cbx.next(iv)) + Vector intersect_boxes; + auto& abox = m_ref->m_abox; + if (m_bat.is_null()) { + AMREX_LOOP_3D(cbx, i, j, k, { - auto it = BoxHashMap.find(iv); - - if (it != TheEnd) - { - if (m_bat.is_null()) { - for (const int index : it->second) - { - const Box& ibox = abox[index]; - const Box& isect = bx & ibox; - - if (isect.ok()) - { - newbl.clear(); - for (const Box& b : bl) { - amrex::boxDiff(newdiff, b, isect); - newbl.join(newdiff); - } - bl.swap(newbl); - } + auto it = BoxHashMap.find(IntVect(AMREX_D_DECL(i,j,k))); + if (it != TheEnd) { + for (const int index : it->second) { + const Box& ibox = abox[index]; + if (bx.intersects(ibox)) { + intersect_boxes.push_back(ibox); } - } else if (m_bat.is_simple()) { - IndexType t = ixType(); - IntVect cr = crseRatio(); - for (const int index : it->second) - { - const Box& ibox = amrex::convert(amrex::coarsen(abox[index],cr),t); - const Box& isect = bx & ibox; - - if (isect.ok()) - { - newbl.clear(); - for (const Box& b : bl) { - amrex::boxDiff(newdiff, b, isect); - newbl.join(newdiff); - } - bl.swap(newbl); - } + } + } + }); + } else if (m_bat.is_simple()) { + IndexType t = ixType(); + IntVect cr = crseRatio(); + AMREX_LOOP_3D(cbx, i, j, k, + { + auto it = BoxHashMap.find(IntVect(AMREX_D_DECL(i,j,k))); + if (it != TheEnd) { + for (const int index : it->second) { + const Box& ibox = amrex::convert(amrex::coarsen(abox[index],cr),t); + if (bx.intersects(ibox)) { + intersect_boxes.push_back(ibox); } - } else { - for (const int index : it->second) - { - const Box& ibox = m_bat.m_op.m_bndryReg(abox[index]); - const Box& isect = bx & ibox; - - if (isect.ok()) - { - newbl.clear(); - for (const Box& b : bl) { - amrex::boxDiff(newdiff, b, isect); - newbl.join(newdiff); - } - bl.swap(newbl); - } + } + } + }); + } else { + AMREX_LOOP_3D(cbx, i, j, k, + { + auto it = BoxHashMap.find(IntVect(AMREX_D_DECL(i,j,k))); + if (it != TheEnd) { + for (const int index : it->second) { + const Box& ibox = m_bat.m_op.m_bndryReg(abox[index]); + if (bx.intersects(ibox)) { + intersect_boxes.push_back(ibox); } } } + }); + } + + BoxList newbl(bl.ixType()); + BoxList newdiff(bl.ixType()); + for (auto const& ibox : intersect_boxes) { + newbl.clear(); + for (Box const& b : bl) { + amrex::boxDiff(newdiff, b, ibox); + newbl.join(newdiff); } + bl.swap(newbl); + if (bl.isEmpty()) { return; } } } @@ -1553,6 +1556,24 @@ BoxArray::uniqify () } m_bat.set_coarsen_ratio(IntVect::TheUnitVector()); } + m_simplified_list.reset(); +} + +BoxList const& +BoxArray::simplified_list () const +{ + if (!m_simplified_list) { + BoxList bl = boxList(); + bl.ordered_simplify(); + m_simplified_list = std::make_shared(std::move(bl)); + } + return *m_simplified_list; +} + +BoxArray +BoxArray::simplified () const +{ + return BoxArray(simplified_list()).convert(ixType()); } std::ostream& diff --git a/Src/Base/AMReX_BoxDomain.cpp b/Src/Base/AMReX_BoxDomain.cpp index dc3a07a2f62..8c4a7c4a7d3 100644 --- a/Src/Base/AMReX_BoxDomain.cpp +++ b/Src/Base/AMReX_BoxDomain.cpp @@ -63,6 +63,7 @@ BoxDomain& BoxDomain::complementIn (const Box& b, const BoxDomain& bl) { + BL_PROFILE("BoxDomain::complementIn()"); 
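The BoxArray::complementIn rewrite above first collects every box that intersects bx (via the hash map) and then subtracts them one at a time from the running complement with amrex::boxDiff, bailing out as soon as the complement is empty. A minimal sketch of that subtraction loop, assuming the intersecting boxes have already been gathered (the helper function is illustrative, not AMReX API):

#include <AMReX_BoxList.H>
#include <vector>

amrex::BoxList complement_of (amrex::Box const& region,
                              std::vector<amrex::Box> const& intersect_boxes)
{
    amrex::BoxList bl(region.ixType());
    bl.push_back(region);
    amrex::BoxList newbl(region.ixType());
    amrex::BoxList newdiff(region.ixType());
    for (auto const& ibox : intersect_boxes) {
        newbl.clear();
        for (amrex::Box const& b : bl) {
            amrex::boxDiff(newdiff, b, ibox); // pieces of b not covered by ibox
            newbl.join(newdiff);
        }
        bl.swap(newbl);
        if (bl.isEmpty()) { break; } // region fully covered
    }
    return bl;
}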
BoxList::complementIn(b,bl); BL_ASSERT(ok()); return *this; diff --git a/Src/Base/AMReX_BoxList.H b/Src/Base/AMReX_BoxList.H index aa9612f3fdf..3513b5cae63 100644 --- a/Src/Base/AMReX_BoxList.H +++ b/Src/Base/AMReX_BoxList.H @@ -139,11 +139,13 @@ public: //! Remove empty Boxes from this BoxList. BoxList& removeEmpty(); - BoxList& complementIn (const Box& b, - const BoxList& bl); - BoxList& complementIn (const Box& b, - BoxList&& bl); + BoxList& complementIn (const Box& b, const BoxList& bl); + BoxList& complementIn (const Box& b, BoxList&& bl); BoxList& complementIn (const Box& b, const BoxArray& ba); + BoxList& parallelComplementIn (const Box& b, const BoxList& bl); + BoxList& parallelComplementIn (const Box& b, BoxList&& bl); + BoxList& parallelComplementIn (const Box& b, const BoxArray& ba); + //! Refine each Box in the BoxList by the ratio. BoxList& refine (int ratio); //! Refine each Box in the BoxList by the ratio. @@ -173,6 +175,8 @@ public: * is O(N-squared) while the other algorithm is O(N). */ int simplify (bool best = false); + //! Assuming the boxes are nicely ordered + int ordered_simplify (); //! Forces each Box in the BoxList to have sides of length <= chunk. BoxList& maxSize (int chunk); //! Forces each Box in the BoxList to have dimth side of length <= chunk[dim]. @@ -210,9 +214,11 @@ public: std::swap(btype, rhs.btype); } + void Bcast (); + private: //! Core simplify routine. - int simplify_doit (bool best); + int simplify_doit (int depth); //! The list of Boxes. Vector m_lbox; diff --git a/Src/Base/AMReX_BoxList.cpp b/Src/Base/AMReX_BoxList.cpp index 71b9da577c8..c70a0cfa8df 100644 --- a/Src/Base/AMReX_BoxList.cpp +++ b/Src/Base/AMReX_BoxList.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef _OPENMP #include @@ -69,7 +70,7 @@ BoxList::clear () void BoxList::join (const BoxList& blist) { - BL_ASSERT(ixType() == blist.ixType()); + BL_ASSERT(blist.size() == 0 || ixType() == blist.ixType()); m_lbox.insert(std::end(m_lbox), std::begin(blist), std::end(blist)); } @@ -83,7 +84,7 @@ BoxList::join (const Vector& barr) void BoxList::catenate (BoxList& blist) { - BL_ASSERT(ixType() == blist.ixType()); + BL_ASSERT(blist.size() == 0 || ixType() == blist.ixType()); m_lbox.insert(std::end(m_lbox), std::begin(blist), std::end(blist)); blist.m_lbox.clear(); } @@ -98,8 +99,7 @@ BoxList::removeEmpty() } BoxList -intersect (const BoxList& bl, - const Box& b) +intersect (const BoxList& bl, const Box& b) { BL_ASSERT(bl.ixType() == b.ixType()); BoxList newbl(bl); @@ -108,8 +108,7 @@ intersect (const BoxList& bl, } BoxList -refine (const BoxList& bl, - int ratio) +refine (const BoxList& bl, int ratio) { BoxList nbl(bl); nbl.refine(ratio); @@ -117,8 +116,7 @@ refine (const BoxList& bl, } BoxList -coarsen (const BoxList& bl, - int ratio) +coarsen (const BoxList& bl, int ratio) { BoxList nbl(bl); nbl.coarsen(ratio); @@ -126,8 +124,7 @@ coarsen (const BoxList& bl, } BoxList -accrete (const BoxList& bl, - int sz) +accrete (const BoxList& bl, int sz) { BoxList nbl(bl); nbl.accrete(sz); @@ -303,8 +300,7 @@ BoxList::intersect (const BoxList& bl) } BoxList -complementIn (const Box& b, - const BoxList& bl) +complementIn (const Box& b, const BoxList& bl) { BL_ASSERT(bl.ixType() == b.ixType()); BoxList newb(b.ixType()); @@ -313,16 +309,14 @@ complementIn (const Box& b, } BoxList& -BoxList::complementIn (const Box& b, - const BoxList& bl) +BoxList::complementIn (const Box& b, const BoxList& bl) { BoxArray ba(bl); return complementIn(b, ba); } BoxList& -BoxList::complementIn (const Box& b, - 
BoxList&& bl) +BoxList::complementIn (const Box& b, BoxList&& bl) { BoxArray ba(std::move(bl)); return complementIn(b, ba); @@ -406,6 +400,112 @@ BoxList::complementIn (const Box& b, const BoxArray& ba) return *this; } +BoxList& +BoxList::parallelComplementIn (const Box& b, const BoxList& bl) +{ + return parallelComplementIn(b, BoxArray(bl)); +} + +BoxList& +BoxList::parallelComplementIn (const Box& b, BoxList&& bl) +{ + return parallelComplementIn(b, BoxArray(std::move(bl))); +} + +BoxList& +BoxList::parallelComplementIn (const Box& b, BoxArray const& ba) +{ + BL_PROFILE("BoxList::parallelComplementIn()"); +#ifndef AMREX_USE_MPI + return complementIn(b,ba); +#else + if (ba.size() <= 8) + { + return complementIn(b,ba); + } + else + { + BL_PROFILE_VAR("BoxList::pci", boxlistpci); + + Long npts_avgbox; + Box mbox = ba.minimalBox(npts_avgbox); + *this = amrex::boxDiff(b, mbox); + auto mytyp = ixType(); + + BoxList bl_mesh(mbox & b); + +#if (AMREX_SPACEDIM == 1) + Real s_avgbox = npts_avgbox; +#elif (AMREX_SPACEDIM == 2) + Real s_avgbox = std::sqrt(npts_avgbox); +#elif (AMREX_SPACEDIM == 3) + Real s_avgbox = std::cbrt(npts_avgbox); +#endif + + const int block_size = 4 * std::max(1,static_cast(std::ceil(s_avgbox/4.))*4); + bl_mesh.maxSize(block_size); + const int N = bl_mesh.size(); + + const int nprocs = ParallelContext::NProcsSub(); + const int myproc = ParallelContext::MyProcSub(); + const int navg = N / nprocs; + const int nextra = N - navg*nprocs; + const int ilo = (myproc < nextra) ? myproc*(navg+1) : myproc*navg+nextra; + const int ihi = (myproc < nextra) ? ilo+navg+1-1 : ilo+navg-1; + + Vector local_boxes; + +#ifdef _OPENMP + bool start_omp_parallel = !omp_in_parallel(); + const int nthreads = omp_get_max_threads(); +#else + bool start_omp_parallel = false; +#endif + + if (start_omp_parallel) + { +#ifdef _OPENMP + Vector bl_priv(nthreads, BoxList(mytyp)); + int ntot = 0; +#pragma omp parallel reduction(+:ntot) + { + BoxList bl_tmp(mytyp); + auto& vbox = bl_priv[omp_get_thread_num()].m_lbox; +#pragma omp for + for (int i = ilo; i <= ihi; ++i) + { + ba.complementIn(bl_tmp, bl_mesh.m_lbox[i]); + vbox.insert(std::end(vbox), std::begin(bl_tmp), std::end(bl_tmp)); + } + ntot += bl_tmp.size(); + } + local_boxes.reserve(ntot); + for (auto& bl : bl_priv) { + local_boxes.insert(std::end(local_boxes), std::begin(bl), std::end(bl)); + } +#else + amrex::Abort("BoxList::complementIn: how did this happen"); +#endif + } + else + { + BoxList bl_tmp(mytyp); + for (int i = ilo; i <= ihi; ++i) + { + ba.complementIn(bl_tmp, bl_mesh.m_lbox[i]); + local_boxes.insert(std::end(local_boxes), std::begin(bl_tmp), std::end(bl_tmp)); + } + } + + amrex::AllGatherBoxes(local_boxes, this->size()); + local_boxes.insert(std::end(local_boxes), std::begin(m_lbox), std::end(m_lbox)); + std::swap(m_lbox, local_boxes); + + return *this; + } +#endif +} + BoxList& BoxList::refine (int ratio) { @@ -467,8 +567,7 @@ BoxList::accrete (const IntVect& sz) } BoxList& -BoxList::shift (int dir, - int nzones) +BoxList::shift (int dir, int nzones) { for (auto& bx : m_lbox) { @@ -478,8 +577,7 @@ BoxList::shift (int dir, } BoxList& -BoxList::shiftHalf (int dir, - int num_halfs) +BoxList::shiftHalf (int dir, int num_halfs) { for (auto& bx : m_lbox) { @@ -503,8 +601,7 @@ BoxList::shiftHalf (const IntVect& iv) // BoxList -boxDiff (const Box& b1in, - const Box& b2) +boxDiff (const Box& b1in, const Box& b2) { BL_ASSERT(b1in.sameType(b2)); BoxList bl_diff(b1in.ixType()); @@ -564,11 +661,28 @@ BoxList::simplify (bool best) 
std::sort(m_lbox.begin(), m_lbox.end(), [](const Box& l, const Box& r) { return l.smallEnd() < r.smallEnd(); }); - return simplify_doit(best); + // + // If we're not looking for the "best" we can do in one pass, we + // limit how far afield we look for abutting boxes. This greatly + // speeds up this routine for large numbers of boxes. It does not + // do quite as good a job though as full brute force. + // + int depth = best ? size() : 100; + return simplify_doit(depth); } int -BoxList::simplify_doit (bool best) +BoxList::ordered_simplify () +{ + int count; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + count = simplify_doit(1); + } + return count; +} + +int +BoxList::simplify_doit (int depth) { // // Try to merge adjacent boxes. @@ -579,16 +693,9 @@ BoxList::simplify_doit (bool best) { const int* alo = bla->loVect(); const int* ahi = bla->hiVect(); - // - // If we're not looking for the "best" we can do in one pass, we - // limit how far afield we look for abutting boxes. This greatly - // speeds up this routine for large numbers of boxes. It does not - // do quite as good a job though as full brute force. - // - const int MaxCnt = (best ? size() : 100); iterator blb = bla + 1; - for (int cnt = 0; blb != End && cnt < MaxCnt; ++cnt, ++blb) + for (int cnt = 0; blb != End && cnt < depth; ++cnt, ++blb) { const int* blo = blb->loVect(); const int* bhi = blb->hiVect(); @@ -663,55 +770,55 @@ BoxList& BoxList::maxSize (const IntVect& chunk) { Vector new_boxes; - - for (int i = 0; i < AMREX_SPACEDIM; ++i) - { - new_boxes.clear(); - for (auto& bx : m_lbox) - { - const IntVect& boxlen = bx.size(); - const int* len = boxlen.getVect(); - - if (len[i] > chunk[i]) - { - // - // Reduce by powers of 2. - // - int ratio = 1; - int bs = chunk[i]; - int nlen = len[i]; - while ((bs%2 == 0) && (nlen%2 == 0)) - { - ratio *= 2; + for (auto const& bx : m_lbox) { + const IntVect boxlen = amrex::enclosedCells(bx).size(); + const IntVect boxlo = bx.smallEnd(); + IntVect ratio{1}, numblk{1}, extra{0}; + IntVect sz = boxlen; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (boxlen[idim] > chunk[idim]) { + int bs = chunk[idim]; + int nlen = boxlen[idim]; + while ((bs%2 == 0) && (nlen%2 == 0)) { + ratio[idim] *= 2; bs /= 2; nlen /= 2; } - // - // Determine number and size of (coarsened) cuts. - // - const int numblk = nlen/bs + (nlen%bs ? 1 : 0); - const int sz = nlen/numblk; - const int extra = nlen%numblk; - // - // Number of cuts = number of blocks - 1. - // - for (int k = 0; k < numblk-1; k++) - { - // - // Compute size of this chunk, expand by power of 2. - // - const int ksize = (k < extra ? sz+1 : sz) * ratio; - // - // Chop from high end. - // - const int pos = bx.bigEnd(i) - ksize + 1; - - new_boxes.push_back(bx.chop(i,pos)); - } + numblk[idim] = (nlen+bs-1)/bs; + sz[idim] = nlen/numblk[idim]; + extra[idim] = nlen - sz[idim]*numblk[idim]; } } - join(new_boxes); + if (numblk == 1) { + new_boxes.push_back(bx); + } else { +#if (AMREX_SPACEDIM == 3) + for (int k = 0; k < numblk[2]; ++k) { + int klo = (k < extra[2]) ? k*(sz[2]+1)*ratio[2] : (k*sz[2]+extra[2])*ratio[2]; + int khi = (k < extra[2]) ? klo+(sz[2]+1)*ratio[2]-1 : klo+sz[2]*ratio[2]-1; + klo += boxlo[2]; + khi += boxlo[2]; +#endif +#if (AMREX_SPACEDIM >= 2) + for (int j = 0; j < numblk[1]; ++j) { + int jlo = (j < extra[1]) ? j*(sz[1]+1)*ratio[1] : (j*sz[1]+extra[1])*ratio[1]; + int jhi = (j < extra[1]) ? 
jlo+(sz[1]+1)*ratio[1]-1 : jlo+sz[1]*ratio[1]-1; + jlo += boxlo[1]; + jhi += boxlo[1]; +#endif + for (int i = 0; i < numblk[0]; ++i) { + int ilo = (i < extra[0]) ? i*(sz[0]+1)*ratio[0] : (i*sz[0]+extra[0])*ratio[0]; + int ihi = (i < extra[0]) ? ilo+(sz[0]+1)*ratio[0]-1 : ilo+sz[0]*ratio[0]-1; + ilo += boxlo[0]; + ihi += boxlo[0]; + new_boxes.push_back(Box(IntVect(AMREX_D_DECL(ilo,jlo,klo)), + IntVect(AMREX_D_DECL(ihi,jhi,khi))). + convert(ixType())); + AMREX_D_TERM(},},}) + } } + std::swap(new_boxes, m_lbox); + return *this; } @@ -773,8 +880,7 @@ BoxList::convert (IndexType typ) noexcept } std::ostream& -operator<< (std::ostream& os, - const BoxList& blist) +operator<< (std::ostream& os, const BoxList& blist) { BoxList::const_iterator bli = blist.begin(), End = blist.end(); os << "(BoxList " << blist.size() << ' ' << blist.ixType() << '\n'; @@ -802,4 +908,16 @@ BoxList::operator== (const BoxList& rhs) const return true; } +void +BoxList::Bcast () +{ + int nboxes = this->size(); + const int IOProcNumber = ParallelDescriptor::IOProcessorNumber(); + ParallelDescriptor::Bcast(&nboxes, 1, IOProcNumber); + if (ParallelDescriptor::MyProc() != IOProcNumber) { + m_lbox.resize(nboxes); + } + ParallelDescriptor::Bcast(m_lbox.data(), nboxes, IOProcNumber); +} + } diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H index 211cb97d6ab..05bbe19b488 100644 --- a/Src/Base/AMReX_CArena.H +++ b/Src/Base/AMReX_CArena.H @@ -59,7 +59,7 @@ public: void PrintUsage (std::string const& name) const; //! The default memory hunk size to grab from the heap. - enum { DefaultHunkSize = 1024*1024*8 }; + constexpr static std::size_t DefaultHunkSize = 1024*1024*8; protected: //! The nodes in our free list and block list. diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index ce3e1bca307..f6bde9c0a62 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -121,8 +121,10 @@ CArena::free (void* vp) // `vp' had better be in the busy list. // auto busy_it = m_busylist.find(Node(vp,0,0)); - - BL_ASSERT(!(busy_it == m_busylist.end())); + if (busy_it == m_busylist.end()) { + amrex::Abort("CArena::free: unknown pointer"); + return; + } BL_ASSERT(m_freelist.find(*busy_it) == m_freelist.end()); m_actually_used -= busy_it->size(); diff --git a/Src/Base/AMReX_CoordSys.H b/Src/Base/AMReX_CoordSys.H index e62262e9a00..647600de9d1 100644 --- a/Src/Base/AMReX_CoordSys.H +++ b/Src/Base/AMReX_CoordSys.H @@ -82,7 +82,7 @@ public: GpuArray CellSizeArray () const noexcept { BL_ASSERT(ok); - return { AMREX_D_DECL(dx[0],dx[1],dx[2]) }; + return {{ AMREX_D_DECL(dx[0],dx[1],dx[2]) }}; } //! Returns the inverse cellsize for each coordinate direction. @@ -93,7 +93,7 @@ public: GpuArray InvCellSizeArray () const noexcept { BL_ASSERT(ok); - return { AMREX_D_DECL(inv_dx[0],inv_dx[1],inv_dx[2]) }; + return {{ AMREX_D_DECL(inv_dx[0],inv_dx[1],inv_dx[2]) }}; } //! Returns location of cell center in specified direction. 
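To make the arithmetic in the BoxList::maxSize rewrite above concrete: per dimension it factors out the largest power of two common to the chunk size and the box length (ratio), splits the reduced length into numblk near-equal pieces, and gives the first `extra` pieces one additional coarsened cell. A small standalone sketch with the same formulas (hypothetical helper, not AMReX API):

#include <cstdio>

void chunk_lengths (int nlen, int chunk)
{
    int ratio = 1, bs = chunk;
    while ((bs % 2 == 0) && (nlen % 2 == 0)) {
        ratio *= 2; bs /= 2; nlen /= 2;
    }
    int numblk = (nlen + bs - 1) / bs;  // ceil(nlen/bs)
    int sz     = nlen / numblk;
    int extra  = nlen - sz*numblk;
    for (int k = 0; k < numblk; ++k) {
        // the first `extra` chunks are one coarsened cell longer
        std::printf("chunk %d: %d cells\n", k, ((k < extra) ? sz+1 : sz) * ratio);
    }
}

For nlen = 96 and chunk = 32 this gives ratio = 32, numblk = 3, and three chunks of 32 cells, which is what the nested loops in maxSize produce.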
diff --git a/Src/Base/AMReX_CoordSys.cpp b/Src/Base/AMReX_CoordSys.cpp
index fd1529942f1..7d748960560 100644
--- a/Src/Base/AMReX_CoordSys.cpp
+++ b/Src/Base/AMReX_CoordSys.cpp
@@ -206,7 +206,7 @@ CoordSys::SetVolume (FArrayBox& a_volfab,
     AMREX_ASSERT(region.cellCentered());

     auto vol = a_volfab.array();
-    GpuArray<Real,AMREX_SPACEDIM> a_dx{AMREX_D_DECL(dx[0], dx[1], dx[2])};
+    GpuArray<Real,AMREX_SPACEDIM> a_dx{{AMREX_D_DECL(dx[0], dx[1], dx[2])}};

 #if (AMREX_SPACEDIM == 3)
     AMREX_ASSERT(IsCartesian());
@@ -216,7 +216,7 @@ CoordSys::SetVolume (FArrayBox& a_volfab,
         vol(i,j,k) = dv;
     });
 #else
-    GpuArray<Real,AMREX_SPACEDIM> a_offset{AMREX_D_DECL(offset[0],offset[1],offset[2])};
+    GpuArray<Real,AMREX_SPACEDIM> a_offset{{AMREX_D_DECL(offset[0],offset[1],offset[2])}};
     int coord = (int) c_sys;
     AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( region, tbx,
     {
@@ -239,6 +239,8 @@ CoordSys::SetDLogA (FArrayBox& a_dlogafab,
                     const Box& region,
                     int dir) const
 {
+    amrex::ignore_unused(dir);
+
     AMREX_ASSERT(ok);
     AMREX_ASSERT(region.cellCentered());

@@ -491,6 +493,7 @@ CoordSys::Volume (const Real xlo[AMREX_SPACEDIM],
 Real
 CoordSys::AreaLo (const IntVect& point, int dir) const noexcept
 {
+    amrex::ignore_unused(point,dir);
 #if (AMREX_SPACEDIM==2)
     Real xlo[AMREX_SPACEDIM];
     switch (c_sys)
@@ -501,6 +504,7 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept
         case 0: return dx[1];
         case 1: return dx[0];
         }
+        return 0.; // to silence compiler warning
     case RZ:
         LoNode(point,xlo);
         switch (dir)
@@ -508,6 +512,7 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept
         {
         case 0: return TWOPI*dx[1]*xlo[0];
         case 1: return ((xlo[0]+dx[0])*(xlo[0]+dx[0])-xlo[0]*xlo[0])*(0.5*TWOPI);
         }
+        return 0.; // to silence compiler warning
     default:
         AMREX_ASSERT(0);
     }
@@ -526,6 +531,7 @@ CoordSys::AreaLo (const IntVect& point, int dir) const noexcept
 Real
 CoordSys::AreaHi (const IntVect& point, int dir) const noexcept
 {
+    amrex::ignore_unused(point,dir);
 #if (AMREX_SPACEDIM==2)
     Real xhi[AMREX_SPACEDIM];
     switch (c_sys)
@@ -536,6 +542,7 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept
         case 0: return dx[1];
         case 1: return dx[0];
         }
+        return 0.; // to silence compiler warning
     case RZ:
         HiNode(point,xhi);
         switch (dir)
@@ -543,6 +550,7 @@ CoordSys::AreaHi (const IntVect& point, int dir) const noexcept
         {
         case 0: return TWOPI*dx[1]*xhi[0];
         case 1: return (xhi[0]*xhi[0]-(xhi[0]-dx[0])*(xhi[0]-dx[0]))*(TWOPI*0.5);
         }
+        return 0.; // to silence compiler warning
     default:
         AMREX_ASSERT(0);
     }
diff --git a/Src/Base/AMReX_Dim3.H b/Src/Base/AMReX_Dim3.H
index 3a79ca3a59c..f8205b14189 100644
--- a/Src/Base/AMReX_Dim3.H
+++ b/Src/Base/AMReX_Dim3.H
@@ -7,31 +7,8 @@

 namespace amrex {

-#ifdef AMREX_USE_HIP
-struct Dim3 { int x; int y; int z;
-    AMREX_GPU_HOST_DEVICE
-    constexpr Dim3 () : x(0), y(0), z(0) {}
-    AMREX_GPU_HOST_DEVICE
-    constexpr Dim3 (int x_, int y_, int z_) // xxxxx HIP: todo
-        : x(x_), y(y_), z(z_) {}
-    AMREX_GPU_HOST_DEVICE
-    constexpr Dim3 (Dim3 const& rhs)
-        : x(rhs.x), y(rhs.y), z(rhs.z) {}
-};
-struct XDim3 { Real x; Real y; Real z;
-    AMREX_GPU_HOST_DEVICE
-    constexpr XDim3 () : x(0._rt), y(0._rt), z(0._rt) {}
-    AMREX_GPU_HOST_DEVICE
-    constexpr XDim3 (Real x_, Real y_, Real z_) // xxxxx HIP: todo
-        : x(x_), y(y_), z(z_) {}
-    AMREX_GPU_HOST_DEVICE
-    constexpr XDim3 (XDim3 const& rhs)
-        : x(rhs.x), y(rhs.y), z(rhs.z) {}
-};
-#else
 struct Dim3 { int x; int y; int z; };
 struct XDim3 { Real x; Real y; Real z; };
-#endif

 inline std::ostream& operator<< (std::ostream& os, const Dim3& d) {
     os << '(' << d.x << ',' << d.y << ',' << d.z << ')';
diff --git a/Src/Base/AMReX_DistributionMapping.H
b/Src/Base/AMReX_DistributionMapping.H index e0982d7b303..9b4d2722353 100644 --- a/Src/Base/AMReX_DistributionMapping.H +++ b/Src/Base/AMReX_DistributionMapping.H @@ -236,7 +236,9 @@ class DistributionMapping * if use_box_vol is true, weight boxes by their volume in Distribute * otherwise, all boxes will be treated with equal weight */ - static std::vector > makeSFC (const BoxArray& ba, bool use_box_vol=true); + static std::vector > makeSFC (const BoxArray& ba, + bool use_box_vol=true, + const int nprocs=ParallelContext::NProcsSub() ); /** \brief Computes the average cost per MPI rank given a distribution mapping * global cost vector. diff --git a/Src/Base/AMReX_DistributionMapping.cpp b/Src/Base/AMReX_DistributionMapping.cpp index 7447a750039..b268226511e 100644 --- a/Src/Base/AMReX_DistributionMapping.cpp +++ b/Src/Base/AMReX_DistributionMapping.cpp @@ -294,6 +294,7 @@ DistributionMapping::LeastUsedTeams (Vector & rteam, rteam.push_back(0); rworker.clear(); rworker.push_back(Vector(1,0)); + amrex::ignore_unused(nteams,nworkers); #endif } @@ -856,7 +857,8 @@ DistributionMapping::KnapSackProcessorMap (const BoxArray& boxes, int nprocs) { BL_ASSERT(boxes.size() > 0); - BL_ASSERT(m_ref->m_pmap.size() == boxes.size()); + + m_ref->m_pmap.resize(boxes.size()); if (boxes.size() <= nprocs || nprocs < 2) { @@ -882,53 +884,161 @@ namespace class Compare { public: + AMREX_FORCE_INLINE bool operator () (const SFCToken& lhs, const SFCToken& rhs) const; }; - - SFCToken (int box, const IntVect& idx, Real vol) - : - m_box(box), m_idx(idx), m_vol(vol) {} - - int m_box; - IntVect m_idx; - Real m_vol; - - static int MaxPower; + int m_box; + Array m_morton; }; } -int SFCToken::MaxPower = 64; - +AMREX_FORCE_INLINE bool SFCToken::Compare::operator () (const SFCToken& lhs, const SFCToken& rhs) const { - for (int i = SFCToken::MaxPower - 1; i >= 0; --i) +#if (AMREX_SPACEDIM == 1) + return lhs.m_morton[0] < rhs.m_morton[0]; +#elif (AMREX_SPACEDIM == 2) + return (lhs.m_morton[1] < rhs.m_morton[1]) || + ((lhs.m_morton[1] == rhs.m_morton[1]) && + (lhs.m_morton[0] < rhs.m_morton[0])); +#else + return (lhs.m_morton[2] < rhs.m_morton[2]) || + ((lhs.m_morton[2] == rhs.m_morton[2]) && + ((lhs.m_morton[1] < rhs.m_morton[1]) || + ((lhs.m_morton[1] == rhs.m_morton[1]) && + (lhs.m_morton[0] < rhs.m_morton[0])))); +#endif +} + +namespace { +#if (AMREX_SPACEDIM == 3) + AMREX_FORCE_INLINE + uint32_t make_space (uint32_t x) + { + // x : 0000,0000,0000,0000,0000,00a9,8765,4321 + x = (x | (x << 16)) & 0x030000FF; + // x << 16 : 0000,00a9,8765,4321,0000,0000,0000,0000 + // x | (x << 16): 0000,00a9,8765,4321,0000,00a9,8765,4321 + // 0x030000FF : 0000,0011,0000,0000,0000,0000,1111,1111 + // x : 0000,00a9,0000,0000,0000,0000,8765,4321 + x = (x | (x << 8)) & 0x0300F00F; + // x << 8 : 0000,0000,0000,0000,8765,4321,0000,0000 + // x | (x << 8) : 0000,00a9,0000,0000,8765,4321,8765,4321 + // 0x0300F00F : 0000,0011,0000,0000,1111,0000,0000,1111 + // x : 0000,00a9,0000,0000,8765,0000,0000,4321 + x = (x | (x << 4)) & 0x030C30C3; + // x << 4 : 00a9,0000,0000,8765,0000,0000,4321,0000 + // x | (x << 4) : 00a9,00a9,0000,8765,8765,0000,4321,4321 + // 0x030C30C3 : 0000,0011,0000,1100,0011,0000,1100,0011 + // x : 0000,00a9,0000,8700,0065,0000,4300,0021 + x = (x | (x << 2)) & 0x09249249; + // x << 2 : 0000,a900,0087,0000,6500,0043,0000,2100 + // x | (x << 2) : 0000,a9a9,0087,8700,6565,0043,4300,2121 + // 0x09249249 : 0000,1001,0010,0100,1001,0010,0100,1001 + // x : 0000,a009,0080,0700,6005,0040,0300,2001 + return x; + } +#elif (AMREX_SPACEDIM == 2) + 
AMREX_FORCE_INLINE
+    uint32_t make_space (uint32_t x)
+    {
-        const int N = (1<<i);
-        for (int j = AMREX_SPACEDIM-1; j >= 0; --j)
-        {
-            const int il = lhs.m_idx[j]/N;
-            const int ir = rhs.m_idx[j]/N;
+        // spread the low 16 bits of x so one zero bit separates
+        // consecutive input bits, making room to interleave y
+        x = (x | (x << 8)) & 0x00FF00FF;
+        x = (x | (x << 4)) & 0x0F0F0F0F;
+        x = (x | (x << 2)) & 0x33333333;
+        x = (x | (x << 1)) & 0x55555555;
+        return x;
+    }
+#endif
+
+    AMREX_FORCE_INLINE
+    SFCToken makeSFCToken (int box_index, IntVect const& iv)
+    {
+        SFCToken token;
+        token.m_box = box_index;
+
+#if (AMREX_SPACEDIM == 3)
+
+        constexpr int imin = -(1 << 29);
+        AMREX_ASSERT_WITH_MESSAGE(AMREX_D_TERM(iv[0] >= imin && iv[0] < -imin,
+                                            && iv[1] >= imin && iv[1] < -imin,
+                                            && iv[2] >= imin && iv[2] < -imin),
+                                  "SFCToken: index out of range");
+        uint32_t x = iv[0] - imin;
+        uint32_t y = iv[1] - imin;
+        uint32_t z = iv[2] - imin;
+        // extract lowest 10 bits and make space for interleaving
+        token.m_morton[0] = make_space(x & 0x3FF)
+                         | (make_space(y & 0x3FF) << 1)
+                         | (make_space(z & 0x3FF) << 2);
+        x = x >> 10;
+        y = y >> 10;
+        z = z >> 10;
+        token.m_morton[1] = make_space(x & 0x3FF)
+                         | (make_space(y & 0x3FF) << 1)
+                         | (make_space(z & 0x3FF) << 2);
+        x = x >> 10;
+        y = y >> 10;
+        z = z >> 10;
+        token.m_morton[2] = make_space(x & 0x3FF)
+                         | (make_space(y & 0x3FF) << 1)
+                         | (make_space(z & 0x3FF) << 2);
+
+#elif (AMREX_SPACEDIM == 2)
+
+        constexpr uint32_t offset = 1u << 31;
+        static_assert(static_cast<uint32_t>(std::numeric_limits<int>::max())+1 == offset,
+                      "INT_MAX != (1<<31)-1");
+        uint32_t x = (iv[0] >= 0) ? static_cast<uint32_t>(iv[0]) + offset
+            : static_cast<uint32_t>(iv[0]-std::numeric_limits<int>::lowest());
+        uint32_t y = (iv[1] >= 0) ? static_cast<uint32_t>(iv[1]) + offset
+            : static_cast<uint32_t>(iv[1]-std::numeric_limits<int>::lowest());
+        // extract lowest 16 bits and make space for interleaving
+        token.m_morton[0] = make_space(x & 0xFFFF)
+                         | (make_space(y & 0xFFFF) << 1);
+        x = x >> 16;
+        y = y >> 16;
+        token.m_morton[1] = make_space(x) | (make_space(y) << 1);
+
+#elif (AMREX_SPACEDIM == 1)
+
+        constexpr uint32_t offset = 1u << 31;
+        static_assert(static_cast<uint32_t>(std::numeric_limits<int>::max())+1 == offset,
+                      "INT_MAX != (1<<31)-1");
+        token.m_morton[0] = (iv[0] >= 0) ?
static_cast(iv[0]) + offset + : static_cast(iv[0]-std::numeric_limits::lowest()); - if (il < ir) - { - return true; - } - else if (il > ir) - { - return false; - } - } +#else + static_assert(false,"AMREX_SPACEDIM != 1, 2 or 3"); +#endif + + return token; } - return false; } static void Distribute (const std::vector& tokens, + const std::vector& wgts, int nprocs, Real volpercpu, std::vector< std::vector >& v) @@ -944,8 +1054,7 @@ Distribute (const std::vector& tokens, for (const auto &t : tokens) { Print() << " " << idx++ << ": " << t.m_box << ": " - << t.m_idx << ": " - << t.m_vol << std::endl; + << t.m_morton << std::endl; } } @@ -963,7 +1072,7 @@ Distribute (const std::vector& tokens, K < TSZ && (i == (nprocs-1) || (vol < volpercpu)); ++K) { - vol += tokens[K].m_vol; + vol += wgts[tokens[K].m_box]; ++cnt; v[i].push_back(tokens[K].m_box); @@ -977,7 +1086,7 @@ Distribute (const std::vector& tokens, { --K; v[i].pop_back(); - totalvol -= tokens[K].m_vol; + totalvol -= wgts[tokens[K].m_box]; } } @@ -993,9 +1102,8 @@ Distribute (const std::vector& tokens, BL_ASSERT(box == t.m_box); Print() << " " << idx << ": " << t.m_box << ": " - << t.m_idx << ": " - << t.m_vol << std::endl; - rank_vol += t.m_vol; + << t.m_morton << std::endl; + rank_vol += wgts[t.m_box]; idx++; } Print() << " Total Rank Vol: " << rank_vol << std::endl; @@ -1047,34 +1155,15 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, << nprocs << ", " << nteams << ", " << nworkers << ")\n"; } - std::vector tokens; - const int N = boxes.size(); - + std::vector tokens; tokens.reserve(N); - - int maxijk = 0; - for (int i = 0; i < N; ++i) { - const Box& bx = boxes[i]; - tokens.push_back(SFCToken(i,bx.smallEnd(),wgts[i])); - - const SFCToken& token = tokens.back(); - - AMREX_D_TERM(maxijk = std::max(maxijk, token.m_idx[0]);, - maxijk = std::max(maxijk, token.m_idx[1]);, - maxijk = std::max(maxijk, token.m_idx[2]);); + const Box& bx = boxes[i]; + tokens.push_back(makeSFCToken(i, bx.smallEnd())); } // - // Set SFCToken::MaxPower for BoxArray. - // - int m = 0; - for ( ; (1 << m) <= maxijk; ++m) { - ; // do nothing - } - SFCToken::MaxPower = m; - // // Put'm in Morton space filling curve order. // std::sort(tokens.begin(), tokens.end(), SFCToken::Compare()); @@ -1082,14 +1171,14 @@ DistributionMapping::SFCProcessorMapDoIt (const BoxArray& boxes, // Split'm up as equitably as possible per team. // Real volperteam = 0; - for (const SFCToken& tok : tokens) { - volperteam += tok.m_vol; + for (Long wt : wgts) { + volperteam += wt; } volperteam /= nteams; std::vector< std::vector > vec(nteams); - Distribute(tokens,nteams,volperteam,vec); + Distribute(tokens,wgts,nteams,volperteam,vec); // vec has a size of nteams and vec[] holds a vector of box ids. @@ -1311,33 +1400,14 @@ DistributionMapping::RRSFCDoIt (const BoxArray& boxes, amrex::Abort("Team support is not implemented yet in RRSFC"); #endif - std::vector tokens; - const int nboxes = boxes.size(); - + std::vector tokens; tokens.reserve(nboxes); - - int maxijk = 0; - for (int i = 0; i < nboxes; ++i) { - const Box& bx = boxes[i]; - tokens.push_back(SFCToken(i,bx.smallEnd(),0.0)); - - const SFCToken& token = tokens.back(); - - AMREX_D_TERM(maxijk = std::max(maxijk, token.m_idx[0]);, - maxijk = std::max(maxijk, token.m_idx[1]);, - maxijk = std::max(maxijk, token.m_idx[2]);); - } - // - // Set SFCToken::MaxPower for BoxArray. 
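The make_space/makeSFCToken code above builds a Morton (Z-order) key by spreading each coordinate's bits apart and interleaving them, and SFCToken::Compare then orders tokens by comparing the m_morton words from most to least significant. A tiny self-contained demonstration of the same 3D bit trick (a standalone sketch, not AMReX code):

#include <cstdint>
#include <cstdio>

// Same masks as the 3D make_space above: spread the low 10 bits of x
// so that two zero bits separate consecutive input bits.
std::uint32_t make_space (std::uint32_t x)
{
    x = (x | (x << 16)) & 0x030000FF;
    x = (x | (x <<  8)) & 0x0300F00F;
    x = (x | (x <<  4)) & 0x030C30C3;
    x = (x | (x <<  2)) & 0x09249249;
    return x;
}

int main ()
{
    // Interleave (x,y,z) = (3,1,2): key bits are ... z1 y1 x1 z0 y0 x0
    std::uint32_t key = make_space(3u) | (make_space(1u) << 1) | (make_space(2u) << 2);
    std::printf("morton(3,1,2) = %u\n", key); // prints 43 = 0b101011
    return 0;
}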
- // - int m = 0; - for ( ; (1 << m) <= maxijk; ++m) { - ; // do nothing + const Box& bx = boxes[i]; + tokens.push_back(makeSFCToken(i, bx.smallEnd())); } - SFCToken::MaxPower = m; // // Put'm in Morton space filling curve order. // @@ -1471,6 +1541,8 @@ DistributionMapping::makeKnapSack (const LayoutData& rcost_local, r = DistributionMapping(pmap); } } +#else + amrex::ignore_unused(broadcastToAll); #endif return r; @@ -1522,42 +1594,45 @@ DistributionMapping::ComputeDistributionMappingEfficiency (const DistributionMap *efficiency = (std::accumulate(rankToCost.begin(), rankToCost.end(), 0.0) / (nprocs*maxCost)); } - -DistributionMapping -DistributionMapping::makeKnapSack (const MultiFab& weight, int nmax) -{ - BL_PROFILE("makeKnapSack"); - - DistributionMapping r; - Vector cost(weight.size()); -#ifdef BL_USE_MPI - { - Vector rcost(cost.size(), 0.0); +namespace { +Vector +gather_weights (const MultiFab& weight) +{ +#ifdef AMREX_USE_MPI + LayoutData costld(weight.boxArray(),weight.DistributionMap()); #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for (MFIter mfi(weight); mfi.isValid(); ++mfi) { - int i = mfi.index(); - rcost[i] = weight[mfi].sum(mfi.validbox(),0); - } - - ParallelAllReduce::Sum(&rcost[0], rcost.size(), ParallelContext::CommunicatorSub()); - - Real wmax = *std::max_element(rcost.begin(), rcost.end()); - Real scale = (wmax == 0) ? 1.e9 : 1.e9/wmax; - - for (int i = 0; i < rcost.size(); ++i) { - cost[i] = Long(rcost[i]*scale) + 1L; - } + for (MFIter mfi(weight); mfi.isValid(); ++mfi) { + costld[mfi] = weight[mfi].sum(mfi.validbox(),0); } + Vector rcost(weight.size()); + ParallelDescriptor::GatherLayoutDataToVector(costld, rcost, + ParallelContext::IOProcessorNumberSub()); + ParallelDescriptor::Bcast(rcost.data(), rcost.size(), ParallelContext::IOProcessorNumberSub()); + Real wmax = *std::max_element(rcost.begin(), rcost.end()); + Real scale = (wmax == 0) ? 1.e9 : 1.e9/wmax; + Vector lcost(rcost.size()); + for (int i = 0; i < rcost.size(); ++i) { + lcost[i] = static_cast(rcost[i]*scale) + 1L; + } + return lcost; +#else + return Vector(weight.size(), 1L); #endif +} +} +DistributionMapping +DistributionMapping::makeKnapSack (const MultiFab& weight, int nmax) +{ + BL_PROFILE("makeKnapSack"); + Vector cost = gather_weights(weight); int nprocs = ParallelContext::NProcsSub(); Real eff; - + DistributionMapping r; r.KnapSackProcessorMap(cost, nprocs, &eff, true, nmax); - return r; } @@ -1565,71 +1640,21 @@ DistributionMapping DistributionMapping::makeKnapSack (const MultiFab& weight, Real& eff, int nmax) { BL_PROFILE("makeKnapSack"); - - DistributionMapping r; - - Vector cost(weight.size()); -#ifdef BL_USE_MPI - { - Vector rcost(cost.size(), 0.0); -#ifdef _OPENMP -#pragma omp parallel -#endif - for (MFIter mfi(weight); mfi.isValid(); ++mfi) { - int i = mfi.index(); - rcost[i] = weight[mfi].sum(mfi.validbox(),0); - } - - ParallelAllReduce::Sum(&rcost[0], rcost.size(), ParallelContext::CommunicatorSub()); - - Real wmax = *std::max_element(rcost.begin(), rcost.end()); - Real scale = (wmax == 0) ? 
1.e9 : 1.e9/wmax; - - for (int i = 0; i < rcost.size(); ++i) { - cost[i] = Long(rcost[i]*scale) + 1L; - } - } -#endif - + Vector cost = gather_weights(weight); int nprocs = ParallelContext::NProcsSub(); - + DistributionMapping r; r.KnapSackProcessorMap(cost, nprocs, &eff, true, nmax); - return r; } DistributionMapping DistributionMapping::makeRoundRobin (const MultiFab& weight) { - DistributionMapping r; - - Vector cost(weight.size()); -#ifdef BL_USE_MPI - { - Vector rcost(cost.size(), 0.0); -#ifdef _OPENMP -#pragma omp parallel -#endif - for (MFIter mfi(weight); mfi.isValid(); ++mfi) { - int i = mfi.index(); - rcost[i] = weight[mfi].sum(mfi.validbox(),0); - } - - ParallelAllReduce::Sum(&rcost[0], rcost.size(), ParallelContext::CommunicatorSub()); - - Real wmax = *std::max_element(rcost.begin(), rcost.end()); - Real scale = (wmax == 0) ? 1.e9 : 1.e9/wmax; - - for (int i = 0; i < rcost.size(); ++i) { - cost[i] = Long(rcost[i]*scale) + 1L; - } - } -#endif - + BL_PROFILE("makeRoundRobin"); + Vector cost = gather_weights(weight); int nprocs = ParallelContext::NProcsSub(); - + DistributionMapping r; r.RoundRobinProcessorMap(cost, nprocs); - return r; } @@ -1637,36 +1662,10 @@ DistributionMapping DistributionMapping::makeSFC (const MultiFab& weight, bool sort) { BL_PROFILE("makeSFC"); - - DistributionMapping r; - - Vector cost(weight.size()); -#ifdef BL_USE_MPI - { - Vector rcost(cost.size(), 0.0); -#ifdef _OPENMP -#pragma omp parallel -#endif - for (MFIter mfi(weight); mfi.isValid(); ++mfi) { - int i = mfi.index(); - rcost[i] = weight[mfi].sum(mfi.validbox(),0); - } - - ParallelAllReduce::Sum(&rcost[0], rcost.size(), ParallelContext::CommunicatorSub()); - - Real wmax = *std::max_element(rcost.begin(), rcost.end()); - Real scale = (wmax == 0) ? 1.e9 : 1.e9/wmax; - - for (int i = 0; i < rcost.size(); ++i) { - cost[i] = Long(rcost[i]*scale) + 1L; - } - } -#endif - + Vector cost = gather_weights(weight); int nprocs = ParallelContext::NProcsSub(); - + DistributionMapping r; r.SFCProcessorMap(weight.boxArray(), cost, nprocs, sort); - return r; } @@ -1674,36 +1673,10 @@ DistributionMapping DistributionMapping::makeSFC (const MultiFab& weight, Real& eff, bool sort) { BL_PROFILE("makeSFC"); - - DistributionMapping r; - - Vector cost(weight.size()); -#ifdef BL_USE_MPI - { - Vector rcost(cost.size(), 0.0); -#ifdef _OPENMP -#pragma omp parallel -#endif - for (MFIter mfi(weight); mfi.isValid(); ++mfi) { - int i = mfi.index(); - rcost[i] = weight[mfi].sum(mfi.validbox(),0); - } - - ParallelAllReduce::Sum(&rcost[0], rcost.size(), ParallelContext::CommunicatorSub()); - - Real wmax = *std::max_element(rcost.begin(), rcost.end()); - Real scale = (wmax == 0) ? 
1.e9 : 1.e9/wmax; - - for (int i = 0; i < rcost.size(); ++i) { - cost[i] = Long(rcost[i]*scale) + 1L; - } - } -#endif - + Vector cost = gather_weights(weight); int nprocs = ParallelContext::NProcsSub(); - + DistributionMapping r; r.SFCProcessorMap(weight.boxArray(), cost, nprocs, eff, sort); - return r; } @@ -1812,57 +1785,43 @@ DistributionMapping::makeSFC (const LayoutData& rcost_local, r = DistributionMapping(pmap); } } +#else + amrex::ignore_unused(broadcastToAll); #endif return r; } std::vector > -DistributionMapping::makeSFC (const BoxArray& ba, bool use_box_vol) +DistributionMapping::makeSFC (const BoxArray& ba, bool use_box_vol, const int nprocs) { BL_PROFILE("makeSFC"); - std::vector tokens; - const int N = ba.size(); - + std::vector tokens; + std::vector wgts; tokens.reserve(N); - - int maxijk = 0; - - Real vol_sum = 0; + wgts.reserve(N); + Long vol_sum = 0; for (int i = 0; i < N; ++i) { - const Box& bx = ba[i]; - const auto & bx_vol = (use_box_vol ? bx.volume() : 1); - tokens.push_back(SFCToken(i,bx.smallEnd(),bx_vol)); - vol_sum += bx_vol; - - const SFCToken& token = tokens.back(); - - AMREX_D_TERM(maxijk = std::max(maxijk, token.m_idx[0]);, - maxijk = std::max(maxijk, token.m_idx[1]);, - maxijk = std::max(maxijk, token.m_idx[2]);); + const Box& bx = ba[i]; + tokens.push_back(makeSFCToken(i, bx.smallEnd())); + const Long v = use_box_vol ? bx.volume() : Long(1); + vol_sum += v; + wgts.push_back(v); } // - // Set SFCToken::MaxPower for BoxArray. - // - int m = 0; - for ( ; (1 << m) <= maxijk; ++m) { - ; // do nothing - } - SFCToken::MaxPower = m; - // // Put'm in Morton space filling curve order. // std::sort(tokens.begin(), tokens.end(), SFCToken::Compare()); - const int nprocs = ParallelContext::NProcsSub(); Real volper; volper = vol_sum / nprocs; std::vector< std::vector > r(nprocs); - Distribute(tokens, nprocs, volper, r); + + Distribute(tokens, wgts, nprocs, volper, r); return r; } diff --git a/Src/Base/AMReX_EArena.H b/Src/Base/AMReX_EArena.H index 315de02f87d..1321a98e063 100644 --- a/Src/Base/AMReX_EArena.H +++ b/Src/Base/AMReX_EArena.H @@ -41,7 +41,7 @@ public: std::size_t free_space_available () const noexcept; //! The default memory hunk size to grab from the heap. 
- enum { DefaultHunkSize = 1024*1024*8 }; + constexpr static std::size_t DefaultHunkSize = 1024*1024*8; protected: diff --git a/Src/Base/AMReX_Extension.H b/Src/Base/AMReX_Extension.H index dd7efaff2ca..b8bcf6e3c69 100644 --- a/Src/Base/AMReX_Extension.H +++ b/Src/Base/AMReX_Extension.H @@ -3,15 +3,6 @@ #if !defined(BL_LANG_FORT) -// HIP FIX HERE - noexcept - -#ifdef AMREX_HIP_PLATFORM_HCC -#define AMREX_NOEXCEPT -#else -#define AMREX_NOEXCEPT noexcept -#endif - - // restrict #ifdef __cplusplus @@ -126,15 +117,44 @@ #define AMREX_INLINE inline #endif +// no inline +#if defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIP__) || defined(__SYCL_COMPILER_VERSION) +#define AMREX_NO_INLINE __attribute__((noinline)) +#else +#define AMREX_NO_INLINE +#endif // __attribute__((weak)) -#ifdef AMREX_TYPECHECK +#if defined(AMREX_TYPECHECK) #define AMREX_ATTRIBUTE_WEAK +#elif defined(_WIN32) +#define AMREX_ATTRIBUTE_WEAK #else #define AMREX_ATTRIBUTE_WEAK __attribute__((weak)) #endif +#if defined(__cplusplus) && defined(_WIN32) +#include +#endif + +#if (__cplusplus >= 201703L) +#define AMREX_FALLTHROUGH [[fallthrough]] +#elif defined(__clang__) +#define AMREX_FALLTHROUGH [[clang::fallthrough]] +#elif defined(__GNUC__) && (__GNUC__ >= 7) && !defined(__INTEL_COMPILER) +#define AMREX_FALLTHROUGH [[gnu::fallthrough]] +#else +#define AMREX_FALLTHROUGH ((void)0) +#endif + #endif /* !BL_LANG_FORT */ #endif + +/* + * DPCPP version strings + * beta08 #define __SYCL_COMPILER_VERSION 20200715 + * beta09 #define __SYCL_COMPILER_VERSION 20200827 + * beta10 #define __SYCL_COMPILER_VERSION 20201005 +*/ diff --git a/Src/Base/AMReX_FACopyDescriptor.H b/Src/Base/AMReX_FACopyDescriptor.H index 1dce8057044..e03697ed251 100644 --- a/Src/Base/AMReX_FACopyDescriptor.H +++ b/Src/Base/AMReX_FACopyDescriptor.H @@ -563,13 +563,13 @@ FabArrayCopyDescriptor::CollectData () // for meta-data Vector md_sender, md_offset, md_icnts, md_bcnts; - int* md_recv_data; + int* md_recv_data = nullptr; Vector md_send_data; Vector md_recv_reqs, md_send_reqs; // for data Vector data_sender, data_offset; - value_type* recv_data; + value_type* recv_data = nullptr; Vector send_data; Vector data_recv_reqs, data_send_reqs; diff --git a/Src/Base/AMReX_FArrayBox.cpp b/Src/Base/AMReX_FArrayBox.cpp index 9323c82710e..1ed44b082b0 100644 --- a/Src/Base/AMReX_FArrayBox.cpp +++ b/Src/Base/AMReX_FArrayBox.cpp @@ -527,7 +527,7 @@ FABio::read_header (std::istream& is, FABio* FABio::read_header (std::istream& is, FArrayBox& f, - int compIndex, + int /*compIndex*/, int& nCompAvailable) { // BL_PROFILE("FArrayBox::read_header_is_i"); @@ -718,9 +718,9 @@ FABio_ascii::skip (std::istream& is, } void -FABio_ascii::skip (std::istream& is, - FArrayBox& f, - int nCompToSkip) const +FABio_ascii::skip (std::istream& /*is*/, + FArrayBox& /*f*/, + int /*nCompToSkip*/) const { amrex::Error("FABio_ascii::skip(..., int nCompToSkip) not implemented"); } @@ -748,7 +748,7 @@ FABio_8bit::write (std::ostream& os, { BL_ASSERT(comp >= 0 && num_comp >= 1 && (comp+num_comp) <= f.nComp()); - const Real eps = 1.0e-8_rt; // FIXME - whats a better value? + const Real eps = 1.0e-8; // FIXME - what's a better value?
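The AMREX_FALLTHROUGH ladder added to AMReX_Extension.H above selects [[fallthrough]] under C++17, the clang/gnu attribute spellings on older compilers, and a no-op otherwise. A small usage sketch, with FALLTHROUGH as a stand-in for the real macro:

    // Deliberate case fallthrough, annotated so -Wimplicit-fallthrough
    // stays quiet where the attribute is supported.
    #if (__cplusplus >= 201703L)
    #define FALLTHROUGH [[fallthrough]]
    #else
    #define FALLTHROUGH ((void)0)
    #endif

    int stencil_width (int order)   // illustrative function, not AMReX API
    {
        switch (order) {
        case 4:
            FALLTHROUGH;            // fourth order shares the wide stencil
        case 3:
            return 2;
        default:
            return 1;
        }
    }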
const Long siz = f.box().numPts(); unsigned char *c = new unsigned char[siz]; diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 6c294fbf68a..69a40df0a67 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -54,19 +54,26 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } nwarps.push_back(ntotwarps); - std::size_t nbytes = ntags*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, tags.data(), nbytes); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; - nbytes = (ntags+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - constexpr int nthreads = 128; + std::memcpy(h_buffer, tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); + + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -112,14 +119,14 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; int g_wid = g_tid / Gpu::Device::warp_size; if (g_wid >= ntotwarps) return; - int tag_id; + int tag_id = -10000; { int lo = 0; int hi = ntags; @@ -159,8 +166,8 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } #endif @@ -196,7 +203,7 @@ struct CellAtomicAdd AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator() (U* d, U s) const noexcept { - Gpu::Atomic::Add(d,s); + Gpu::Atomic::AddNoRet(d,s); } }; @@ -221,19 +228,27 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; + + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); + + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - 
Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); - constexpr int nthreads = 128; + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -280,14 +295,14 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; int g_wid = g_tid / Gpu::Device::warp_size; if (g_wid >= ntotwarps) return; - int tag_id; + int tag_id = -10000; { int lo = 0; int hi = N_locs; @@ -328,8 +343,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template @@ -353,30 +368,38 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t offset_masks = Arena::align(offset_nwarps+sizeof_nwarps); + std::size_t sizeof_masks = masks.size()*sizeof(Array4); + std::size_t total_buf_size = offset_masks + sizeof_masks; + + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + std::memcpy(h_buffer+offset_masks, masks.data(), sizeof_masks); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - nbytes = masks.size()*sizeof(Array4); - auto d_masks = static_cast*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_masks, masks.data(), nbytes); + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + auto d_masks = reinterpret_cast*>(d_buffer+offset_masks); - constexpr int nthreads = 128; + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept { int g_tid = item.get_global_id(0); int g_wid = g_tid / Gpu::Device::warp_size; if (g_wid >= ntotwarps) return; - int tag_id; + int tag_id = -1; { int lo = 0; int hi = N_locs; @@ -414,10 +437,18 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in int to_try = 1; while (true) { int msk = (m && to_try) ? 
Gpu::Atomic::CAS(m, 0, mypriority) : 0; +#if (__SYCL_COMPILER_VERSION <= 20200827) if (sycl::intel::all_of(item.get_sub_group(), msk == 0)) { // 0 means lock acquired +#else + if (sycl::ONEAPI::all_of(item.get_sub_group(), msk == 0)) { // 0 means lock acquired +#endif break; // all threads have acquired. } else { +#if (__SYCL_COMPILER_VERSION <= 20200827) if (sycl::intel::any_of(item.get_sub_group(), msk > mypriority)) { +#else + if (sycl::ONEAPI::any_of(item.get_sub_group(), msk > mypriority)) { +#endif if (m) *m = 0; // yield item.mem_fence(); // xxxxx DPCPP todo: This is block level, but needs to be device level fence, which is currently a PR in intel/llvm to_try = 1; @@ -437,7 +468,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in if (m) *m = 0; }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -515,9 +546,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_masks); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template ::value,int> = 0> @@ -661,16 +691,18 @@ FabArray::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp) } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo{}.setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (is_thread_safe) { @@ -923,7 +955,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n // Is the conditional ever expected false? 
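The rewrites in AMReX_FBI.H above replace two or three separate device allocations and synchronous htod_memcpy calls with one pinned host buffer and one device buffer: each array is memcpy'd into the host buffer at an aligned offset and the whole thing moves with a single htod_memcpy_async. A host-only sketch of the layout arithmetic, where align_up stands in for Arena::align and malloc for The_Pinned_Arena()->alloc:

    #include <cstdlib>
    #include <cstring>
    #include <vector>

    // Round n up to a multiple of a (Arena::align is assumed to do the same
    // with the arena's fixed alignment; 128 bytes here is illustrative).
    constexpr std::size_t align_up (std::size_t n, std::size_t a = 128)
    {
        return ((n + a - 1) / a) * a;
    }

    struct TagType { int dst_index, src_index; }; // placeholder tag payload

    char* pack_tags (std::vector<TagType> const& tags,
                     std::vector<int> const& nwarps)
    {
        std::size_t sizeof_tags   = tags.size()*sizeof(TagType);
        std::size_t offset_nwarps = align_up(sizeof_tags);
        std::size_t total         = offset_nwarps + nwarps.size()*sizeof(int);
        char* h_buffer = static_cast<char*>(std::malloc(total));
        std::memcpy(h_buffer,               tags.data(),   sizeof_tags);
        std::memcpy(h_buffer+offset_nwarps, nwarps.data(), nwarps.size()*sizeof(int));
        // One Gpu::htod_memcpy_async(d_buffer, h_buffer, total) then replaces
        // the per-array synchronous copies; the kernel reads the tags at
        // offset 0 and the nwarps prefix sums at offset_nwarps.
        return h_buffer;
    }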
int launches = 0; for (int send = 0; send < N_snds; ++send) { - if (send_data[send] != nullptr) { + if (send_size[send] > 0) { launches += send_cctc[send]->size(); } } @@ -940,8 +972,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n std::size_t(sizeof(CopyMemory)*launches) ); const int j = sit(); - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) @@ -971,9 +1002,9 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n for (int send = 0; send < N_snds; ++send) { const int j = send; - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -985,6 +1016,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type)); } + amrex::ignore_unused(send_size); BL_ASSERT(dptr <= send_data[j] + send_size[j]); } } @@ -999,7 +1031,7 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int Vector const& recv_data, Vector const& recv_size, Vector const& recv_cctc, - bool is_thread_safe) + bool /*is_thread_safe*/) { const int N_rcvs = recv_cctc.size(); if (N_rcvs == 0) return; @@ -1008,9 +1040,9 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int LayoutData > recv_copy_tags(boxArray(),DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1018,6 +1050,7 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int dptr += tag.dbox.numPts() * ncomp * sizeof(value_type); launches++; } + amrex::ignore_unused(recv_size); BL_ASSERT(dptr <= recv_data[k] + recv_size[k]); } } @@ -1080,23 +1113,35 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int template void FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { + amrex::ignore_unused(send_size); + const int N_snds = send_data.size(); if (N_snds == 0) return; + char* pbuffer = send_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in send_data is pinned. 
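Inside the kernels above, each warp finds its work item with a binary search over nwarps, an exclusive prefix sum of warps-per-tag whose last entry is the total warp count. A standalone reconstruction of that lookup:

    // Returns the tag whose warp range contains g_wid, assuming
    // nwarps[0] == 0, nwarps[ntags] == total warps, and g_wid < nwarps[ntags].
    int find_tag (int g_wid, int const* nwarps, int ntags)
    {
        int lo = 0;
        int hi = ntags;
        while (lo+1 != hi) {   // invariant: nwarps[lo] <= g_wid < nwarps[hi]
            int mid = (lo+hi)/2;
            if (g_wid >= nwarps[mid]) { lo = mid; } else { hi = mid; }
        }
        return lo;
    }
    // Example: with nwarps = {0,2,7,9} and ntags = 3, find_tag(5, ...) == 1,
    // since warp 5 falls in tag 1's range [2,7).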
+ szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + } +#endif + typedef Array4CopyTag TagType; Vector snd_copy_tags; - // FIX HIP HERE -- Dim3 - Dim3 zero; - zero.x = 0; zero.y = 0; zero.z = 0; for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + std::size_t offset = send_data[j]-send_data[0]; + char* dptr = pbuffer + offset; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1104,16 +1149,24 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc amrex::makeArray4((value_type*)(dptr), tag.sbox, ncomp), src.array(tag.srcIndex), tag.sbox, - zero + Dim3{0,0,0} }); dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type)); } - BL_ASSERT(dptr <= send_data[j] + send_size[j]); + BL_ASSERT(dptr <= pbuffer + offset + send_size[j]); } } detail::fab_to_fab(snd_copy_tags, scomp, 0, ncomp, detail::CellStore()); + + // There is Gpu::synchronize in fab_to_fab. + + if (pbuffer != send_data[0]) { + Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]); + Gpu::synchronize(); + The_Arena()->free(pbuffer); + } } template @@ -1124,9 +1177,25 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, Vector const& recv_cctc, CpOp op, bool is_thread_safe) { + amrex::ignore_unused(recv_size); + const int N_rcvs = recv_cctc.size(); if (N_rcvs == 0) return; + char* pbuffer = recv_data[0]; +#if 0 + std::size_t szbuffer = 0; + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in recv_data is pinned. + szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); + Gpu::synchronize(); + } +#endif + typedef Array4CopyTag TagType; Vector recv_copy_tags; @@ -1143,9 +1212,10 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + std::size_t offset = recv_data[k]-recv_data[0]; + const char* dptr = pbuffer + offset; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1165,21 +1235,23 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, masks.push_back(maskfabs[li].array()); } } - BL_ASSERT(dptr <= recv_data[k] + recv_size[k]); + BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]); } } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo().setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (op == FabArrayBase::COPY) @@ -1200,6 +1272,12 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, detail::fab_to_fab_atomic_add(recv_copy_tags, 0, dcomp, ncomp, masks); } } + + // There is Gpu::synchronize in fab_to_fab. 
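pack_send_buffer_gpu and unpack_recv_buffer_gpu above keep an optional staging path (under #if 0, noted as slower in the Summit linear-solver test): when MPI is not GPU-aware, pack into one device buffer spanning the pinned send buffers, then copy back. Per-message pointers are recovered as offsets from the first buffer, which works because the whole send region is one contiguous allocation. A sketch of the offset arithmetic, with illustrative names and assuming a non-empty send_data (the real code returns early otherwise):

    #include <cstddef>
    #include <vector>

    struct Staging {
        char* pbuffer;        // where packing kernels write
        std::size_t szbuffer; // 0 when writing pinned memory directly
    };

    Staging make_staging (std::vector<char*> const& send_data,
                          std::vector<std::size_t> const& send_size,
                          bool gpu_aware_mpi)
    {
        Staging s{send_data[0], 0};
        if (!gpu_aware_mpi) {
            std::size_t n = send_data.size();
            // All buffers come from one allocation, so last start + last size
            // bounds the whole region.
            s.szbuffer = (send_data[n-1]-send_data[0]) + send_size[n-1];
            // s.pbuffer = device_alloc(s.szbuffer);  // The_Arena() in AMReX
        }
        return s;
    }

    // Message j is then packed at s.pbuffer + (send_data[j]-send_data[0]),
    // and, if staged, the region is copied back to send_data[0] afterwards.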
+ + if (pbuffer != recv_data[0]) { + The_Arena()->free(pbuffer); + } } #endif /* AMREX_USE_GPU */ @@ -1207,10 +1285,12 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, template void FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { + amrex::ignore_unused(send_size); + const int N_snds = send_data.size(); if (N_snds == 0) return; @@ -1219,9 +1299,9 @@ FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int nc #endif for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1248,6 +1328,8 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, Vector const& recv_cctc, CpOp op, bool is_thread_safe) { + amrex::ignore_unused(recv_size); + const int N_rcvs = recv_cctc.size(); if (N_rcvs == 0) return; @@ -1258,9 +1340,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, #endif for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1286,9 +1368,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, recv_copy_tags.define(dst.boxArray(),dst.DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { diff --git a/Src/Base/AMReX_FPC.cpp b/Src/Base/AMReX_FPC.cpp index 514120f29b7..97b6ee4637f 100644 --- a/Src/Base/AMReX_FPC.cpp +++ b/Src/Base/AMReX_FPC.cpp @@ -23,7 +23,8 @@ defined(__amd64__) || \ defined(__LITTLE_ENDIAN__) || \ defined(__powerpc__) || \ - defined(powerpc) + defined(powerpc) || \ + defined(_WIN32) #define AMREX_LITTLE_ENDIAN #endif diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 7ce47fed648..db0b2073d2f 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -47,8 +47,8 @@ namespace amrex { -template ::value>::type > -Long nBytesOwned (T const& t) noexcept { return 0; } +template ::value,int>::type = 0> +Long nBytesOwned (T const&) noexcept { return 0; } template Long nBytesOwned (BaseFab const& fab) noexcept { return fab.nBytesOwned(); } @@ -136,26 +136,12 @@ struct MFInfo { } }; - template - class MFGraph; -#ifdef USE_PERILLA - class Perilla; -#endif - template class FabArray : public FabArrayBase { public: - friend class Action; - friend class AmrTask; - template - friend class MFGraph; -#ifdef USE_PERILLA - friend class Perilla; -#endif - struct FABType { typedef FAB value_type; @@ -270,7 +256,8 @@ public: /** * \brief Return true if the FabArray is well-defined. That is, - * if FABs are allocated for each Box in the BoxArray and the + * the FabArray has a BoxArray and DistributionMapping, the + * FABs are allocated for each Box in the BoxArray and the * sizes of the FABs and the number of components are consistent * with the definition of the FabArray. 
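The nBytesOwned change above and the long run of template churn that follows both switch from SFINAE via a defaulted type parameter (an unnamed typename std::enable_if<...>::type) to SFINAE via a defaulted non-type parameter (typename std::enable_if<...,int>::type = 0). The value form lets mutually exclusive overloads coexist without colliding as redeclarations, and sidesteps the nvcc quirks the old "// FOO fools nvcc" comments worked around. A minimal illustration with an assumed is_integral split:

    #include <string>
    #include <type_traits>

    // Two overloads that differ only in the enable_if condition: legal with
    // the value-parameter form, a redefinition error with the type-parameter
    // form (default arguments are not part of a template's signature).
    template <class T,
              typename std::enable_if<std::is_integral<T>::value,int>::type = 0>
    long bytes_owned (T const&) { return 0; }          // scalars own no storage

    template <class T,
              typename std::enable_if<!std::is_integral<T>::value,int>::type = 0>
    long bytes_owned (T const& t) { return static_cast<long>(t.size()); }

    // bytes_owned(3) == 0; bytes_owned(std::string("ab")) == 2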
*/ @@ -310,46 +297,46 @@ public: FAB * fabPtr (int K) noexcept; // Here K is global index FAB const* fabPtr (int K) const noexcept; - template ::value>::type > + template ::value,int>::type = 0> void prefetchToHost (const MFIter& mfi) const noexcept; - template ::value>::type > + template ::value,int>::type = 0> void prefetchToDevice (const MFIter& mfi) const noexcept; - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> array (const MFIter& mfi) const noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type> array (const MFIter& mfi) noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> array (int K) const noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type> array (int K) noexcept; - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> const_array (const MFIter& mfi) const noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> const_array (int K) const noexcept; - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> array (const MFIter& mfi, int start_comp) const noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type> array (const MFIter& mfi, int start_comp) noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> array (int K, int start_comp) const noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type> array (int K, int start_comp) noexcept; - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> const_array (const MFIter& mfi, int start_comp) const noexcept; // - template ::value>::type > + template ::value,int>::type = 0> Array4::value_type const> const_array (int K, int start_comp) const noexcept; //! Explicitly set the Kth FAB in the FabArray to point to elem. @@ -362,11 +349,11 @@ public: void clear (); //! Set all components in the entire region of each FAB to val. - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val); //! Set all components in the entire region of each FAB to val. - template ::value>::type > + template ::value,int>::type = 0> void operator= (value_type val); /** @@ -374,13 +361,13 @@ public: * each FAB in the FabArray, starting at component comp to val. * Also set the value of nghost boundary cells. */ - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, int comp, int num_comp, @@ -392,14 +379,14 @@ public: * as nghost boundary cells, to val, provided they also intersect * with the Box region. */ - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, const Box& region, int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, const Box& region, int comp, @@ -409,10 +396,10 @@ public: * \brief Set all components in the valid region of each FAB in the * FabArray to val, including nghost boundary cells. 
*/ - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, int nghost); - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, const IntVect& nghost); /** @@ -420,50 +407,50 @@ public: * FabArray to val, including nghost boundary cells, that also * intersect the Box region. */ - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, const Box& region, int nghost); - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type val, const Box& region, const IntVect& nghost); - template ::value>::type > + template ::value,int>::type = 0> void abs (int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void abs (int comp, int num_comp, const IntVect& nghost); - template ::value>::type > + template ::value,int>::type = 0> void plus (value_type val, int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void plus (value_type val, const Box& region, int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void mult (value_type val, int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void mult (value_type val, const Box& region, int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void invert (value_type numerator, int comp, int num_comp, int nghost = 0); - template ::value>::type > + template ::value,int>::type = 0> void invert (value_type numerator, const Box& region, int comp, int num_comp, int nghost = 0); //! Set all values in the boundary region to val. - template ::value>::type > + template ::value,int>::type = 0> void setBndry (value_type val); //! Set ncomp values in the boundary region, starting at start_comp to val. - template ::value>::type > + template ::value,int>::type = 0> void setBndry (value_type val, int strt_comp, int ncomp); //! Set all values outside the Geometry domain to val. - template ::value>::type > + template ::value,int>::type = 0> void setDomainBndry (value_type val, const Geometry& goem); //! Set ncomp values outside the Geometry domain to val, starting at start_comp. - template ::value>::type > + template ::value,int>::type = 0> void setDomainBndry (value_type val, int strt_comp, int ncomp, const Geometry& goem); /** @@ -653,7 +640,7 @@ public: void FillBoundary_nowait (int scomp, int ncomp, bool cross = false); void FillBoundary_nowait (int scomp, int ncomp, const Periodicity& period, bool cross = false); void FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross = false); - template ::value>::type > + template ::value,int>::type = 0> void FillBoundary_finish (); void FillBoundary_test (); @@ -673,14 +660,14 @@ public: // (including ghost cells outside periodic boundaries) // physbnd : boundary cells outside the domain (excluding periodic boundaries) // interior : interior cells (i.e., valid cells) - template ::value>::type > + template ::value,int>::type = 0> void BuildMask (const Box& phys_domain, const Periodicity& period, value_type covered, value_type notcovered, value_type physbnd, value_type interior); // The following are private functions. But we have to make them public for cuda. 
- template ::value>::type > + template ::value,int>::type = 0> void FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross, bool enforce_periodicity_only = false); @@ -689,10 +676,10 @@ public: void PC_local_cpu (const CPC& thecpc, FabArray const& src, int scomp, int dcomp, int ncomp, CpOp op); - template ::value>::type > + template ::value,int>::type = 0> void setVal (value_type x, const CommMetaData& thecmd, int scomp, int ncomp); - template ::value>::type > + template ::value,int>::type = 0> LayoutData RecvLayoutMask (const CommMetaData& thecmd); #ifdef AMREX_USE_GPU @@ -731,7 +718,7 @@ public: #endif static void pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -744,7 +731,7 @@ public: #endif static void pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -837,7 +824,6 @@ private: Vector& recv_size, Vector& recv_from, Vector& recv_reqs, - int icomp, int ncomp, int SeqNum); @@ -932,27 +918,31 @@ FabArray::fabPtr (int K) const noexcept } template -template +template ::value,int>::type> void FabArray::prefetchToHost (const MFIter& mfi) const noexcept { #ifdef AMREX_USE_CUDA this->fabPtr(mfi)->prefetchToHost(); +#else + amrex::ignore_unused(mfi); #endif } template -template +template ::value,int>::type> void FabArray::prefetchToDevice (const MFIter& mfi) const noexcept { #ifdef AMREX_USE_CUDA this->fabPtr(mfi)->prefetchToDevice(); +#else + amrex::ignore_unused(mfi); #endif } template -template +template ::value,int>::type> Array4::value_type const> FabArray::array (const MFIter& mfi) const noexcept { @@ -960,7 +950,7 @@ FabArray::array (const MFIter& mfi) const noexcept } template -template +template ::value,int>::type> Array4::value_type> FabArray::array (const MFIter& mfi) noexcept { @@ -968,7 +958,7 @@ FabArray::array (const MFIter& mfi) noexcept } template -template +template ::value,int>::type> Array4::value_type const> FabArray::array (int K) const noexcept { @@ -976,7 +966,7 @@ FabArray::array (int K) const noexcept } template -template +template ::value,int>::type> Array4::value_type> FabArray::array (int K) noexcept { @@ -984,7 +974,7 @@ FabArray::array (int K) noexcept } template -template +template ::value,int>::type> Array4::value_type const> FabArray::const_array (const MFIter& mfi) const noexcept { @@ -992,7 +982,7 @@ FabArray::const_array (const MFIter& mfi) const noexcept } template -template +template ::value,int>::type> Array4::value_type const> FabArray::const_array (int K) const noexcept { @@ -1000,7 +990,7 @@ FabArray::const_array (int K) const noexcept } template -template +template ::value,int>::type> Array4::value_type const> FabArray::array (const MFIter& mfi, int start_comp) const noexcept { @@ -1008,7 +998,7 @@ FabArray::array (const MFIter& mfi, int start_comp) const noexcept } template -template +template ::value,int>::type> Array4::value_type> FabArray::array (const MFIter& mfi, int start_comp) noexcept { @@ -1016,7 +1006,7 @@ FabArray::array (const MFIter& mfi, int start_comp) noexcept } template -template +template ::value,int>::type> Array4::value_type const> FabArray::array (int K, int start_comp) const noexcept { @@ -1024,7 +1014,7 @@ FabArray::array (int K, int start_comp) const noexcept } template -template +template ::value,int>::type> Array4::value_type> FabArray::array (int K, int 
start_comp) noexcept { @@ -1032,7 +1022,7 @@ FabArray::array (int K, int start_comp) noexcept } template -template +template ::value,int>::type> Array4::value_type const> FabArray::const_array (const MFIter& mfi, int start_comp) const noexcept { @@ -1040,7 +1030,7 @@ FabArray::const_array (const MFIter& mfi, int start_comp) const noexcept } template -template +template ::value,int>::type> Array4::value_type const> FabArray::const_array (int K, int start_comp) const noexcept { @@ -1078,7 +1068,7 @@ FabArray::clear () } template -template +template ::value,int>::type> void FabArray::setVal (value_type val, int nghost) { @@ -1086,7 +1076,7 @@ FabArray::setVal (value_type val, int nghost) } template -template +template ::value,int>::type> void FabArray::setVal (value_type val, const IntVect& nghost) { @@ -1094,7 +1084,7 @@ FabArray::setVal (value_type val, const IntVect& nghost) } template -template +template ::value,int>::type> void FabArray::setVal (value_type val, const Box& region, int nghost) { @@ -1102,7 +1092,7 @@ FabArray::setVal (value_type val, const Box& region, int nghost) } template -template +template ::value,int>::type> void FabArray::setVal (value_type val, const Box& region, const IntVect& nghost) { @@ -1214,6 +1204,8 @@ template bool FabArray::ok () const { + if (!define_function_called) return false; + int isok = 1; for (MFIter fai(*this); fai.isValid() && isok; ++fai) @@ -1448,7 +1440,7 @@ FabArray::setFab (const MFIter& mfi, } template -template +template ::value,int>::type> void FabArray::setBndry (value_type val) { @@ -1456,7 +1448,7 @@ FabArray::setBndry (value_type val) } template -template +template ::value,int>::type> void FabArray::setBndry (value_type val, int strt_comp, @@ -1475,7 +1467,7 @@ FabArray::setBndry (value_type val, } template -template +template ::value,int>::type> void FabArray::setDomainBndry (value_type val, const Geometry& geom) { @@ -1483,7 +1475,7 @@ FabArray::setDomainBndry (value_type val, const Geometry& geom) } template -template +template ::value,int>::type> void FabArray::setDomainBndry (value_type val, int strt_comp, @@ -1544,7 +1536,7 @@ FabArray::copyTo (FAB& dest, } template -template +template ::value,int>::type> void FabArray::setVal (value_type val) { @@ -1552,7 +1544,7 @@ FabArray::setVal (value_type val) } template -template +template ::value,int>::type> void FabArray::operator= (value_type val) { @@ -1560,7 +1552,7 @@ FabArray::operator= (value_type val) } template -template +template ::value,int>::type> void FabArray::setVal (value_type val, int comp, @@ -1571,7 +1563,7 @@ FabArray::setVal (value_type val, } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::setVal (value_type val, int comp, @@ -1590,7 +1582,7 @@ FabArray::setVal (value_type val, { const Box& bx = fai.growntilebox(nghost); auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1598,7 +1590,7 @@ FabArray::setVal (value_type val, } template -template +template ::value,int>::type> void FabArray::setVal (value_type val, const Box& region, @@ -1610,7 +1602,7 @@ FabArray::setVal (value_type val, } template -template // Foo fools nvcc +template ::value,int>::type Z> void FabArray::setVal (value_type val, const Box& region, @@ -1633,7 +1625,7 @@ FabArray::setVal (value_type val, if (b.ok()) { auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( b, ncomp, i, j, k, n, + 
AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( b, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1642,7 +1634,7 @@ FabArray::setVal (value_type val, } template -template // FOO fools nvcc +template ::value,int>::type> void FabArray::abs (int comp, int ncomp, int nghost) { @@ -1650,7 +1642,7 @@ FabArray::abs (int comp, int ncomp, int nghost) } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::abs (int comp, int ncomp, const IntVect& nghost) { @@ -1663,7 +1655,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp)); }); @@ -1671,7 +1663,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::plus (value_type val, int comp, int num_comp, int nghost) { @@ -1682,7 +1674,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1690,7 +1682,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::plus (value_type val, const Box& region, int comp, int num_comp, int nghost) { @@ -1702,7 +1694,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1711,7 +1703,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::mult (value_type val, int comp, int num_comp, int nghost) { @@ -1722,7 +1714,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1730,7 +1722,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::mult (value_type val, const Box& region, int comp, int num_comp, int nghost) { @@ -1742,7 +1734,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1751,7 +1743,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { @@ -1762,7 +1754,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { const Box& 
bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1770,7 +1762,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::invert (value_type numerator, const Box& region, int comp, int num_comp, int nghost) { @@ -1782,7 +1774,7 @@ FabArray::invert (value_type numerator, const Box& region, int comp, int nu const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1953,7 +1945,7 @@ FabArray::FillBoundary_nowait (int scomp, int ncomp, const IntVect& nghost, } template -template +template ::value,int>::type Z> void FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, value_type covered, value_type notcovered, @@ -1978,7 +1970,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, Box const& fbx = mfi.growntilebox(); Box const& gbx = fbx & domain; Box const& vbx = mfi.validbox(); - AMREX_HOST_DEVICE_FOR_4D(fbx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(fbx, ncomp, i, j, k, n, { IntVect iv(AMREX_D_DECL(i,j,k)); if (vbx.contains(iv)) { @@ -1996,15 +1988,15 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, } template -template +template ::value,int>::type> void FabArray::setVal (value_type val, const CommMetaData& thecmd, int scomp, int ncomp) { #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - CMD_local_setVal_gpu(val, thecmd, 0, ncomp); - CMD_remote_setVal_gpu(val, thecmd, 0, ncomp); + CMD_local_setVal_gpu(val, thecmd, scomp, ncomp); + CMD_remote_setVal_gpu(val, thecmd, scomp, ncomp); } else #endif @@ -2017,7 +2009,7 @@ FabArray::setVal (value_type val, const CommMetaData& thecmd, int scomp, in #endif for (int i = 0; i < N_locs; ++i) { const CopyComTag& tag = LocTags[i]; - (*this)[tag.dstIndex].template setVal(val, tag.dbox, 0, ncomp); + (*this)[tag.dstIndex].template setVal(val, tag.dbox, scomp, ncomp); } for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) { @@ -2027,14 +2019,14 @@ FabArray::setVal (value_type val, const CommMetaData& thecmd, int scomp, in #endif for (int i = 0; i < N; ++i) { const CopyComTag& tag = it->second[i]; - (*this)[tag.dstIndex].template setVal(val, tag.dbox, 0, ncomp); + (*this)[tag.dstIndex].template setVal(val, tag.dbox, scomp, ncomp); } } } } template -template +template ::value,int>::type> LayoutData FabArray::RecvLayoutMask (const CommMetaData& thecmd) { diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H index 002d2a3586f..631da49e938 100644 --- a/Src/Base/AMReX_FabArrayBase.H +++ b/Src/Base/AMReX_FabArrayBase.H @@ -23,11 +23,6 @@ class Geometry; class FArrayBox; template class FabFactory; template class FabArray; -class AmrTask; -#ifdef USE_PERILLA -class Perilla; -class RegionGraph; -#endif namespace EB2 { class IndexSpace; } @@ -35,11 +30,6 @@ class FabArrayBase { friend class MFIter; friend class MFGhostIter; - friend class AmrTask; -#ifdef USE_PERILLA - friend class Perilla; - friend class RegionGraph; -#endif template friend void FillBoundary (Vector*> const& mf, const Periodicity& period); @@ -143,6 +133,7 @@ 
public: */ bool is_cell_centered () const noexcept; + void setMultiGhost(bool a_multi_ghost) {m_multi_ghost = a_multi_ghost;} // These are provided for convenience to keep track of how many // ghost cells are up to date. The number of filled ghost cells @@ -324,6 +315,7 @@ public: const Box& dstdomain, const IntVect& dstng, const BoxConverter& coarsener, + const Box& fdomain, const Box& cdomain, const EB2::IndexSpace* index_space); @@ -332,16 +324,16 @@ public: Long bytes () const; BoxArray ba_crse_patch; - DistributionMapping dm_crse_patch; + BoxArray ba_fine_patch; + DistributionMapping dm_patch; std::unique_ptr > fact_crse_patch; - Vector dst_idxs; - Vector dst_boxes; + std::unique_ptr > fact_fine_patch; // BDKey m_srcbdk; BDKey m_dstbdk; Box m_dstdomain; IntVect m_dstng; - BoxConverter* m_coarsener; + std::unique_ptr m_coarsener; // Long m_nuse; }; @@ -355,10 +347,10 @@ public: static const FPinfo& TheFPinfo (const FabArrayBase& srcfa, const FabArrayBase& dstfa, - const Box& dstdomain, const IntVect& dstng, const BoxConverter& coarsener, - const Box& cdomain, + const Geometry& fgeom, + const Geometry& cgeom, const EB2::IndexSpace*); void flushFPinfo (bool no_assertion=false); @@ -466,6 +458,7 @@ public: int n_comp; mutable BDKey m_bdkey; IntVect n_filled; // Note that IntVect is zero by default. + bool m_multi_ghost = false; // // Tiling @@ -501,7 +494,7 @@ public: { FB (const FabArrayBase& fa, const IntVect& nghost, bool cross, const Periodicity& period, - bool enforce_periodicity_only); + bool enforce_periodicity_only, bool multi_ghost = false); ~FB (); IndexType m_typ; @@ -512,6 +505,7 @@ public: Periodicity m_period; // Long m_nuse; + bool m_multi_ghost = false; // #if ( defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 10) ) CudaGraph m_localCopy; diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 2a775f9e792..50e00ad2c9c 100644 --- a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -25,7 +25,7 @@ namespace amrex { // int FabArrayBase::MaxComp; -#if defined(AMREX_USE_GPU) && defined(AMREX_USE_GPU_PRAGMA) +#if defined(AMREX_USE_GPU) #if AMREX_SPACEDIM == 1 IntVect FabArrayBase::mfiter_tile_size(1024000); @@ -116,11 +116,15 @@ FabArrayBase::Initialize () MaxComp = 1; } +#ifdef AMREX_USE_GPU if (ParallelDescriptor::UseGpuAwareMpi()) { - the_fa_arena = The_Device_Arena(); + the_fa_arena = The_Arena(); } else { the_fa_arena = The_Pinned_Arena(); } +#else + the_fa_arena = The_Cpu_Arena(); +#endif amrex::ExecOnFinalize(FabArrayBase::Finalize); @@ -509,6 +513,7 @@ FabArrayBase::CPC::CPC (const BoxArray& ba, const IntVect& ng, void FabArrayBase::flushCPC (bool no_assertion) const { + amrex::ignore_unused(no_assertion); BL_ASSERT(no_assertion || getBDKey() == m_bdkey); std::vector others; @@ -621,11 +626,12 @@ FabArrayBase::getCPC (const IntVect& dstng, const FabArrayBase& src, const IntVe FabArrayBase::FB::FB (const FabArrayBase& fa, const IntVect& nghost, bool cross, const Periodicity& period, - bool enforce_periodicity_only) + bool enforce_periodicity_only, + bool multi_ghost) : m_typ(fa.boxArray().ixType()), m_crse_ratio(fa.boxArray().crseRatio()), m_ngrow(nghost), m_cross(cross), m_epo(enforce_periodicity_only), m_period(period), - m_nuse(0) + m_nuse(0), m_multi_ghost(multi_ghost) { BL_PROFILE("FabArrayBase::FB::FB()"); @@ -646,6 +652,8 @@ FabArrayBase::FB::FB (const FabArrayBase& fa, const IntVect& nghost, void FabArrayBase::FB::define_fb(const FabArrayBase& fa) { + AMREX_ASSERT(m_multi_ghost ? 
fa.nGrow() >= 2 : true); // must have >= 2 ghost nodes + AMREX_ASSERT(m_multi_ghost ? !m_period.isAnyPeriodic() : true); // this only works for non-periodic const int MyProc = ParallelDescriptor::MyProc(); const BoxArray& ba = fa.boxArray(); const DistributionMapping& dm = fa.DistributionMap(); @@ -656,6 +664,7 @@ FabArrayBase::FB::define_fb(const FabArrayBase& fa) const int nlocal = imap.size(); const IntVect& ng = m_ngrow; + const IntVect ng_ng = m_ngrow - 1; std::vector< std::pair > isects; const std::vector& pshifts = m_period.shiftIntVect(); @@ -666,7 +675,8 @@ FabArrayBase::FB::define_fb(const FabArrayBase& fa) { const int ksnd = imap[i]; const Box& vbx = ba[ksnd]; - + const Box& vbx_ng = amrex::grow(vbx,1); + for (auto pit=pshifts.cbegin(); pit!=pshifts.cend(); ++pit) { ba.intersections(vbx+(*pit), isects, false, ng); @@ -680,7 +690,20 @@ FabArrayBase::FB::define_fb(const FabArrayBase& fa) if (ParallelDescriptor::sameTeam(dst_owner)) { continue; // local copy will be dealt with later } else if (MyProc == dm[ksnd]) { - const BoxList& bl = amrex::boxDiff(bx, ba[krcv]); + BoxList bl = amrex::boxDiff(bx, ba[krcv]); + if (m_multi_ghost) + { + // In the case where ngrow>1, augment the send/rcv box list + // with boxes for overlapping ghost nodes. + const Box& ba_krcv = amrex::grow(ba[krcv],1); + const Box& dst_bx_ng = (amrex::grow(ba_krcv,ng_ng) & (vbx_ng + (*pit))); + const BoxList &bltmp = ba.complementIn(dst_bx_ng); + for (auto const& btmp : bltmp) + { + bl.join(amrex::boxDiff(btmp,ba_krcv)); + } + bl.simplify(); + } for (BoxList::const_iterator lit = bl.begin(); lit != bl.end(); ++lit) send_tags[dst_owner].push_back(CopyComTag(*lit, (*lit)-(*pit), krcv, ksnd)); } @@ -713,6 +736,7 @@ FabArrayBase::FB::define_fb(const FabArrayBase& fa) { const int krcv = imap[i]; const Box& vbx = ba[krcv]; + const Box& vbx_ng = amrex::grow(vbx,1); const Box& bxrcv = amrex::grow(vbx, ng); if (check_local) { @@ -735,7 +759,22 @@ FabArrayBase::FB::define_fb(const FabArrayBase& fa) const Box& dst_bx = isects[j].second - *pit; const int src_owner = dm[ksnd]; - const BoxList& bl = amrex::boxDiff(dst_bx, vbx); + BoxList bl = amrex::boxDiff(dst_bx, vbx); + + if (m_multi_ghost) + { + // In the case where ngrow>1, augment the send/rcv box list + // with boxes for overlapping ghost nodes. 
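The multi-ghost branches above assemble their extra communication boxes with amrex::boxDiff (the parts of one box not covered by another), joined with complementIn results and then simplified. A 1-D reconstruction of the boxDiff operation, using closed integer intervals the way amrex::Box does:

    #include <utility>
    #include <vector>

    // Pieces of [alo,ahi] not covered by [blo,bhi]; the real routine does
    // this per dimension on AMREX_SPACEDIM-dimensional boxes and returns a
    // BoxList.
    std::vector<std::pair<int,int>> box_diff_1d (int alo, int ahi, int blo, int bhi)
    {
        std::vector<std::pair<int,int>> out;
        if (bhi < alo || blo > ahi) {      // no overlap: all of A survives
            out.emplace_back(alo, ahi);
            return out;
        }
        if (blo > alo) { out.emplace_back(alo, blo-1); }  // piece left of B
        if (bhi < ahi) { out.emplace_back(bhi+1, ahi); }  // piece right of B
        return out;
    }
    // box_diff_1d(0,9, 3,5) yields {(0,2), (6,9)}.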
+ Box ba_ksnd = ba[ksnd]; + ba_ksnd.grow(1); + const Box dst_bx_ng = (ba_ksnd & (bxrcv + (*pit))) - (*pit); + const BoxList &bltmp = ba.complementIn(dst_bx_ng); + for (auto const& btmp : bltmp) + { + bl.join(amrex::boxDiff(btmp,vbx_ng)); + } + bl.simplify(); + } for (BoxList::const_iterator lit = bl.begin(); lit != bl.end(); ++lit) { const Box& blbx = *lit; @@ -1010,6 +1049,7 @@ FabArrayBase::FB::~FB () void FabArrayBase::flushFB (bool no_assertion) const { + amrex::ignore_unused(no_assertion); BL_ASSERT(no_assertion || getBDKey() == m_bdkey); std::pair er_it = m_TheFBCache.equal_range(m_bdkey); for (FBCacheIter it = er_it.first; it != er_it.second; ++it) @@ -1051,6 +1091,7 @@ FabArrayBase::getFB (const IntVect& nghost, const Periodicity& period, it->second->m_crse_ratio == boxArray().crseRatio() && it->second->m_ngrow == nghost && it->second->m_cross == cross && + it->second->m_multi_ghost== m_multi_ghost && it->second->m_epo == enforce_periodicity_only && it->second->m_period == period ) { @@ -1061,7 +1102,7 @@ FabArrayBase::getFB (const IntVect& nghost, const Periodicity& period, } // Have to build a new one - FB* new_fb = new FB(*this, nghost, cross, period, enforce_periodicity_only); + FB* new_fb = new FB(*this, nghost, cross, period, enforce_periodicity_only,m_multi_ghost); #ifdef BL_PROFILE m_FBC_stats.bytes += new_fb->bytes(); @@ -1078,10 +1119,11 @@ FabArrayBase::getFB (const IntVect& nghost, const Periodicity& period, } FabArrayBase::FPinfo::FPinfo (const FabArrayBase& srcfa, - const FabArrayBase& dstfa, - const Box& dstdomain, - const IntVect& dstng, - const BoxConverter& coarsener, + const FabArrayBase& dstfa, + const Box& dstdomain, + const IntVect& dstng, + const BoxConverter& coarsener, + const Box& fdomain, const Box& cdomain, const EB2::IndexSpace* index_space) : m_srcbdk (srcfa.getBDKey()), @@ -1090,89 +1132,192 @@ FabArrayBase::FPinfo::FPinfo (const FabArrayBase& srcfa, m_dstng (dstng), m_coarsener(coarsener.clone()), m_nuse (0) -{ +{ + amrex::ignore_unused(fdomain,cdomain,index_space); BL_PROFILE("FPinfo::FPinfo()"); + const BoxArray& srcba = srcfa.boxArray(); const BoxArray& dstba = dstfa.boxArray(); BL_ASSERT(srcba.ixType() == dstba.ixType()); + BoxArray srcba_simplified = srcba.simplified(); + BoxArray dstba_simplified = dstba.simplified(); + const IndexType& boxtype = dstba.ixType(); BL_ASSERT(boxtype == dstdomain.ixType()); - - BL_ASSERT(dstng.allLE(dstfa.nGrowVect())); - const DistributionMapping& dstdm = dstfa.DistributionMap(); - - const int myproc = ParallelDescriptor::MyProc(); + BL_ASSERT(dstng.allLE(dstfa.nGrowVect())); BoxList bl(boxtype); - Vector iprocs; - - for (int i = 0, N = dstba.size(); i < N; ++i) - { - Box bx = dstba[i]; + const int Ndst = dstba_simplified.size(); + const int nprocs = ParallelContext::NProcsSub(); + int iboxlo, iboxhi; + bool parallel_ci; + if (Ndst > 8) { + parallel_ci = true; + const int navg = Ndst / nprocs; + const int nextra = Ndst - navg*nprocs; + const int myproc = ParallelContext::MyProcSub(); + iboxlo = (myproc < nextra) ? myproc*(navg+1) : myproc*navg+nextra; + iboxhi = (myproc < nextra) ? 
iboxlo+navg+1-1 : iboxlo+navg-1; + } else { + parallel_ci = false; + iboxlo = 0; + iboxhi = Ndst-1; + } + for (int i = iboxlo; i <= iboxhi; ++i) { + Box bx = dstba_simplified[i]; bx.grow(m_dstng); bx &= m_dstdomain; + BoxList const& leftover = srcba_simplified.complementIn(bx); + if (leftover.isNotEmpty()) { + bl.join(leftover); + } + } - BoxList leftover = srcba.complementIn(bx); + if (parallel_ci) { + amrex::AllGatherBoxes(bl.data()); + } - bool ismybox = (dstdm[i] == myproc); - for (BoxList::const_iterator bli = leftover.begin(); bli != leftover.end(); ++bli) - { - bl.push_back(m_coarsener->doit(*bli)); - if (ismybox) { - dst_boxes.push_back(*bli); - dst_idxs.push_back(i); + if (bl.isEmpty()) return; + + Long ncells_total = 0L; + Long ncells_max = 0L; + for (auto const& b : bl) { + auto n = b.numPts(); + ncells_total += n; + ncells_max = std::max(ncells_max, n); + } + + Long ncells_avg = ncells_total / ParallelContext::NProcsSub(); + Long ncells_target = std::max(2*ncells_avg, Long(8*8*8)); + if (ncells_max > ncells_target) { + BoxList bltmp(boxtype); + Vector& bltmpvec = bltmp.data(); + for (Box const& b : bl) { + Long const npts = b.numPts(); + if (npts <= ncells_target) { + bltmp.push_back(b); + } else { + IntVect const len = b.length(); + IntVect numblk{1}; + while (npts > (AMREX_D_TERM(numblk[0],*numblk[1],*numblk[2])) * ncells_target) { +#if (AMREX_SPACEDIM == 3) + int longdir = (len[2] >= len[0] && len[2] >= len[1]) ? 2 : + (len[1] >= len[0]) ? 1 : 0; +#elif (AMREX_SPACEDIM == 2) + int longdir = (len[1] >= len[0]) ? 1 : 0; +#elif (AMREX_SPACEDIM == 1) + int longdir = 0; +#else + static_assert(false, "FabArrayBase::FPinfo() unsupported AMREX_SPACEDIM"); +#endif + numblk[longdir] *= 2; + } + numblk.min(len); + IntVect sz, extra; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + sz[idim] = len[idim] / numblk[idim]; + extra[idim] = len[idim] - sz[idim] * numblk[idim]; + } + if (numblk == 1) { + bltmp.push_back(b); + } else { + IntVect const& boxlo = b.smallEnd(); +#if (AMREX_SPACEDIM == 3) + for (int k = 0; k < numblk[2]; ++k) { + int klo = (k < extra[2]) ? k*(sz[2]+1) : (k*sz[2]+extra[2]); + int khi = (k < extra[2]) ? klo+(sz[2]+1)-1 : klo+sz[2]-1; + klo += boxlo[2]; + khi += boxlo[2]; +#endif +#if (AMREX_SPACEDIM >= 2) + for (int j = 0; j < numblk[1]; ++j) { + int jlo = (j < extra[1]) ? j*(sz[1]+1) : (j*sz[1]+extra[1]); + int jhi = (j < extra[1]) ? jlo+(sz[1]+1)-1 : jlo+sz[1]-1; + jlo += boxlo[1]; + jhi += boxlo[1]; +#endif + for (int i = 0; i < numblk[0]; ++i) { + int ilo = (i < extra[0]) ? i*(sz[0]+1) : (i*sz[0]+extra[0]); + int ihi = (i < extra[0]) ? 
ilo+(sz[0]+1)-1 : ilo+sz[0]-1; + ilo += boxlo[0]; + ihi += boxlo[0]; + bltmpvec.emplace_back(IntVect(AMREX_D_DECL(ilo,jlo,klo)), + IntVect(AMREX_D_DECL(ihi,jhi,khi)), + boxtype); + AMREX_D_TERM(},},}) + } } - iprocs.push_back(dstdm[i]); } + std::swap(bl,bltmp); + } + + BoxList blcrse(boxtype); + blcrse.reserve(bl.size()); + for (auto const& b : bl) { + blcrse.push_back(coarsener.doit(b)); } - if (!iprocs.empty()) { - ba_crse_patch.define(bl); - dm_crse_patch.define(std::move(iprocs)); + ba_crse_patch.define(std::move(blcrse)); + ba_fine_patch.define(std::move(bl)); + dm_patch.KnapSackProcessorMap(ba_fine_patch, ParallelContext::NProcsSub()); + #ifdef AMREX_USE_EB - if (index_space) - { - fact_crse_patch = makeEBFabFactory(index_space, - index_space->getGeometry(cdomain), - ba_crse_patch, - dm_crse_patch, - {0,0,0}, EBSupport::basic); - } - else + if (index_space) + { + fact_crse_patch = makeEBFabFactory(index_space, + index_space->getGeometry(cdomain), + ba_crse_patch, + dm_patch, + {0,0,0}, EBSupport::basic); + int ng = boxtype.cellCentered() ? 0 : 1; // to avoid dengerate box + fact_fine_patch = makeEBFabFactory(index_space, + index_space->getGeometry(fdomain), + ba_fine_patch, + dm_patch, + {ng,ng,ng}, EBSupport::basic); + } + else #endif - { - fact_crse_patch.reset(new FArrayBoxFactory()); - } + { + fact_crse_patch.reset(new FArrayBoxFactory()); + fact_fine_patch.reset(new FArrayBoxFactory()); } } FabArrayBase::FPinfo::~FPinfo () { - delete m_coarsener; } Long FabArrayBase::FPinfo::bytes () const { Long cnt = sizeof(FabArrayBase::FPinfo); - cnt += sizeof(Box) * (ba_crse_patch.capacity() + dst_boxes.capacity()); - cnt += sizeof(int) * (dm_crse_patch.capacity() + dst_idxs.capacity()); + cnt += sizeof(Box) * (ba_crse_patch.capacity() + ba_fine_patch.capacity()); + cnt += sizeof(int) * dm_patch.capacity(); return cnt; } const FabArrayBase::FPinfo& FabArrayBase::TheFPinfo (const FabArrayBase& srcfa, const FabArrayBase& dstfa, - const Box& dstdomain, const IntVect& dstng, const BoxConverter& coarsener, - const Box& cdomain, + const Geometry& fgeom, + const Geometry& cgeom, const EB2::IndexSpace* index_space) { BL_PROFILE("FabArrayBase::TheFPinfo()"); + Box dstdomain = fgeom.Domain(); + dstdomain.convert(dstfa.boxArray().ixType()); + for (int i = 0; i < AMREX_SPACEDIM; ++i) { + if (fgeom.isPeriodic(i)) { + dstdomain.grow(i,dstng[i]); + } + } + const BDKey& srckey = srcfa.getBDKey(); const BDKey& dstkey = dstfa.getBDKey(); @@ -1194,7 +1339,8 @@ FabArrayBase::TheFPinfo (const FabArrayBase& srcfa, } // Have to build a new one - FPinfo* new_fpc = new FPinfo(srcfa, dstfa, dstdomain, dstng, coarsener, cdomain, index_space); + FPinfo* new_fpc = new FPinfo(srcfa, dstfa, dstdomain, dstng, coarsener, + fgeom.Domain(), cgeom.Domain(), index_space); #ifdef AMREX_MEM_PROFILING m_FPinfo_stats.bytes += new_fpc->bytes(); @@ -1215,6 +1361,7 @@ FabArrayBase::TheFPinfo (const FabArrayBase& srcfa, void FabArrayBase::flushFPinfo (bool no_assertion) { + amrex::ignore_unused(no_assertion); BL_ASSERT(no_assertion || getBDKey() == m_bdkey); std::vector others; @@ -1374,6 +1521,7 @@ FabArrayBase::TheCFinfo (const FabArrayBase& finefa, void FabArrayBase::flushCFinfo (bool no_assertion) { + amrex::ignore_unused(no_assertion); BL_ASSERT(no_assertion || getBDKey() == m_bdkey); auto er_it = m_TheCrseFineCache.equal_range(m_bdkey); for (auto it = er_it.first; it != er_it.second; ++it) @@ -1554,6 +1702,7 @@ FabArrayBase::buildTileArray (const IntVect& tileSize, TileArray& ta) const void FabArrayBase::flushTileArray (const 
IntVect& tileSize, bool no_assertion) const { + amrex::ignore_unused(no_assertion); BL_ASSERT(no_assertion || getBDKey() == m_bdkey); TACache& tao = m_TheTileArrayCache; @@ -1658,6 +1807,7 @@ FabArrayBase::WaitForAsyncSends (int N_snds, Vector& send_data, Vector& stats) { + amrex::ignore_unused(send_data); #ifdef BL_USE_MPI BL_ASSERT(N_snds > 0); @@ -1667,6 +1817,8 @@ FabArrayBase::WaitForAsyncSends (int N_snds, BL_ASSERT(send_data.size() == N_snds); ParallelDescriptor::Waitall(send_reqs, stats); +#else + amrex::ignore_unused(N_snds,send_reqs,stats); #endif /*BL_USE_MPI*/ } @@ -1680,8 +1832,8 @@ FabArrayBase::CheckRcvStats(Vector& recv_stats, { for (int i = 0, n = recv_size.size(); i < n; ++i) { if (recv_size[i] > 0) { - std::size_t count; - int tmp_count; + std::size_t count = 0; + int tmp_count = 0; const int comm_data_type = ParallelDescriptor::select_comm_data_type(recv_size[i]); if (comm_data_type == 1) { diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index 88174cbb4b9..a0f05f93d24 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -3,7 +3,7 @@ #include template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, const Periodicity& period, bool cross, @@ -83,7 +83,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, if (N_rcvs > 0) { PostRcvs(*TheFB.m_RcvTags, fb_the_recv_data, fb_recv_data, fb_recv_size, fb_recv_from, fb_recv_reqs, - scomp, ncomp, SeqNum); + ncomp, SeqNum); fb_recv_stat.resize(N_rcvs); } @@ -118,7 +118,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, std::size_t nbytes = 0; for (auto const& cct : kv.second) { - nbytes += (*this)[cct.srcIndex].nBytes(cct.sbox,scomp,ncomp); + nbytes += (*this)[cct.srcIndex].nBytes(cct.sbox,ncomp); } std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); @@ -143,9 +143,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } else { the_send_data = nullptr; @@ -231,7 +229,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, } template -template // FOO fools nvcc +template ::value,int>::type Z> void FabArray::FillBoundary_finish () { @@ -439,7 +437,7 @@ FabArray::ParallelCopy (const FabArray& src, int actual_n_rcvs = 0; if (N_rcvs > 0) { PostRcvs(*thecpc.m_RcvTags, the_recv_data, - recv_data, recv_size, recv_from, recv_reqs, SC, NC, SeqNum); + recv_data, recv_size, recv_from, recv_reqs, NC, SeqNum); actual_n_rcvs = N_rcvs - std::count(recv_size.begin(), recv_size.end(), 0); } @@ -471,7 +469,7 @@ FabArray::ParallelCopy (const FabArray& src, std::size_t nbytes = 0; for (auto const& cct : kv.second) { - nbytes += src[cct.srcIndex].nBytes(cct.sbox,SC,NC); + nbytes += src[cct.srcIndex].nBytes(cct.sbox,NC); } std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); @@ -495,9 +493,7 @@ FabArray::ParallelCopy (const FabArray& src, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } @@ -701,7 +697,6 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& m_RcvTags, Vector& 
recv_size, Vector& recv_from, Vector& recv_reqs, - int icomp, int ncomp, int SeqNum) { @@ -717,7 +712,7 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& m_RcvTags, std::size_t nbytes = 0; for (auto const& cct : kv.second) { - nbytes += (*this)[cct.dstIndex].nBytes(cct.dbox,icomp,ncomp); + nbytes += (*this)[cct.dstIndex].nBytes(cct.dbox,ncomp); } std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); @@ -750,9 +745,9 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& m_RcvTags, for (int i = 0; i < nrecv; ++i) { + recv_data[i] = the_recv_data + offset[i]; if (recv_size[i] > 0) { - recv_data[i] = the_recv_data + offset[i]; const int rank = ParallelContext::global_to_local_rank(recv_from[i]); const int comm_data_type = ParallelDescriptor::select_comm_data_type(recv_size[i]); if (comm_data_type == 1) { diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index 654b41956f7..869fd172d89 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -12,7 +12,7 @@ template ::value> > typename FAB::value_type ReduceSum (FabArray const& fa, int nghost, F&& f) { - return ReduceSum(fa, IntVect(nghost), std::move(f)); + return ReduceSum(fa, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -75,13 +75,14 @@ template amrex::EnableIf_t::value, typename FAB::value_type> ReduceSum_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { - return ReduceSum_host(fa,nghost,std::move(f)); + return ReduceSum_host(fa,nghost,std::forward(f)); } template amrex::EnableIf_t::value, typename FAB::value_type> ReduceSum_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa,nghost,f); amrex::Abort("ReduceSum: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -93,9 +94,9 @@ typename FAB::value_type ReduceSum (FabArray const& fa, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceSum_device(fa, nghost, std::move(f)); + return fudetail::ReduceSum_device(fa, nghost, std::forward(f)); } else { - return fudetail::ReduceSum_host_wrapper(fa, nghost, std::move(f)); + return fudetail::ReduceSum_host_wrapper(fa, nghost, std::forward(f)); } } #else @@ -104,7 +105,7 @@ template const& fa, IntVect const& nghost, F&& f) { - return fudetail::ReduceSum_host(fa, nghost, std::move(f)); + return fudetail::ReduceSum_host(fa, nghost, std::forward(f)); } #endif @@ -113,7 +114,7 @@ template const& fa1, FabArray const& fa2, int nghost, F&& f) { - return ReduceSum(fa1, fa2, IntVect(nghost), std::move(f)); + return ReduceSum(fa1, fa2, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -152,11 +153,13 @@ ReduceSum_device (FabArray const& fa1, FabArray const& fa2, using value_type = typename FAB1::value_type; value_type sm = 0; + BL_PROFILE("ReduceSum_device"); + { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; - + Gpu::FuseReductionSafeGuard rsg(true); for (MFIter mfi(fa1); mfi.isValid(); ++mfi) { const Box& bx = amrex::grow(mfi.validbox(),nghost); @@ -181,7 +184,7 @@ amrex::EnableIf_t::value, typename FAB1::va ReduceSum_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return ReduceSum_host(fa1,fa2,nghost,std::move(f)); + return ReduceSum_host(fa1,fa2,nghost,std::forward(f)); } template @@ -189,6 +192,7 @@ amrex::EnableIf_t::value, typename FAB1::val ReduceSum_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, 
F&& f) { + amrex::ignore_unused(fa1,fa2,nghost,f); amrex::Abort("ReduceSum: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -201,9 +205,9 @@ ReduceSum (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceSum_device(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceSum_device(fa1,fa2,nghost,std::forward(f)); } else { - return fudetail::ReduceSum_host_wrapper(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceSum_host_wrapper(fa1,fa2,nghost,std::forward(f)); } } #else @@ -213,7 +217,7 @@ typename FAB1::value_type ReduceSum (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return fudetail::ReduceSum_host(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceSum_host(fa1,fa2,nghost,std::forward(f)); } #endif @@ -223,7 +227,7 @@ typename FAB1::value_type ReduceSum (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, int nghost, F&& f) { - return ReduceSum(fa1, fa2, fa3, IntVect(nghost), std::move(f)); + return ReduceSum(fa1, fa2, fa3, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -293,7 +297,7 @@ amrex::EnableIf_t::value, typename FAB1::va ReduceSum_host_wrapper (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { - return fudetail::ReduceSum_host(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceSum_host(fa1,fa2,fa3,nghost,std::forward(f)); } template @@ -301,6 +305,7 @@ amrex::EnableIf_t::value, typename FAB1::val ReduceSum_host_wrapper (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa1,fa2,fa3,nghost,f); amrex::Abort("ReduceSum: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -313,9 +318,9 @@ ReduceSum (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceSum_device(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceSum_device(fa1,fa2,fa3,nghost,std::forward(f)); } else { - return fudetail::ReduceSum_host_wrapper(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceSum_host_wrapper(fa1,fa2,fa3,nghost,std::forward(f)); } } #else @@ -325,7 +330,7 @@ typename FAB1::value_type ReduceSum (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { - return fudetail::ReduceSum_host(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceSum_host(fa1,fa2,fa3,nghost,std::forward(f)); } #endif @@ -334,7 +339,7 @@ template const& fa, int nghost, F&& f) { - return ReduceMin(fa, IntVect(nghost), std::move(f)); + return ReduceMin(fa, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -398,13 +403,14 @@ template amrex::EnableIf_t::value, typename FAB::value_type> ReduceMin_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { - return ReduceMin_host(fa,nghost,std::move(f)); + return ReduceMin_host(fa,nghost,std::forward(f)); } template amrex::EnableIf_t::value, typename FAB::value_type> ReduceMin_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa,nghost,f); amrex::Abort("ReduceMin: Launch Region is off. 
Device lambda cannot be called by host."); return 0; } @@ -416,9 +422,9 @@ typename FAB::value_type ReduceMin (FabArray const& fa, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceMin_device(fa, nghost, std::move(f)); + return fudetail::ReduceMin_device(fa, nghost, std::forward(f)); } else { - return fudetail::ReduceMin_host_wrapper(fa, nghost, std::move(f)); + return fudetail::ReduceMin_host_wrapper(fa, nghost, std::forward(f)); } } #else @@ -427,7 +433,7 @@ template const& fa, IntVect const& nghost, F&& f) { - return fudetail::ReduceMin_host(fa, nghost, std::move(f)); + return fudetail::ReduceMin_host(fa, nghost, std::forward(f)); } #endif @@ -436,7 +442,7 @@ template const& fa1, FabArray const& fa2, int nghost, F&& f) { - return ReduceMin(fa1, fa2, IntVect(nghost), std::move(f)); + return ReduceMin(fa1, fa2, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -506,7 +512,7 @@ amrex::EnableIf_t::value, typename FAB1::va ReduceMin_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return fudetail::ReduceMin_host(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceMin_host(fa1,fa2,nghost,std::forward(f)); } template @@ -514,6 +520,7 @@ amrex::EnableIf_t::value, typename FAB1::val ReduceMin_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa1,fa2,nghost,f); amrex::Abort("ReduceMin: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -526,9 +533,9 @@ ReduceMin (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceMin_device(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceMin_device(fa1,fa2,nghost,std::forward(f)); } else { - return fudetail::ReduceMin_host_wrapper(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceMin_host_wrapper(fa1,fa2,nghost,std::forward(f)); } } #else @@ -538,7 +545,7 @@ typename FAB1::value_type ReduceMin (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return fudetail::ReduceMin_host(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceMin_host(fa1,fa2,nghost,std::forward(f)); } #endif @@ -548,7 +555,7 @@ typename FAB1::value_type ReduceMin (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, int nghost, F&& f) { - return ReduceMin(fa1, fa2, fa3, IntVect(nghost), std::move(f)); + return ReduceMin(fa1, fa2, fa3, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -620,7 +627,7 @@ amrex::EnableIf_t::value, typename FAB1::va ReduceMin_host_wrapper (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { - return fudetail::ReduceMin_host(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceMin_host(fa1,fa2,fa3,nghost,std::forward(f)); } template @@ -628,6 +635,7 @@ amrex::EnableIf_t::value, typename FAB1::val ReduceMin_host_wrapper (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa1,fa2,fa3,nghost,f); amrex::Abort("ReduceMin: Launch Region is off. 
Device lambda cannot be called by host."); return 0; } @@ -640,9 +648,9 @@ ReduceMin (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceMin_device(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceMin_device(fa1,fa2,fa3,nghost,std::forward(f)); } else { - return fudetail::ReduceMin_host_wrapper(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceMin_host_wrapper(fa1,fa2,fa3,nghost,std::forward(f)); } } #else @@ -652,7 +660,7 @@ typename FAB1::value_type ReduceMin (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { - return fudetail::ReduceMin_host(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceMin_host(fa1,fa2,fa3,nghost,std::forward(f)); } #endif @@ -661,7 +669,7 @@ template const& fa, int nghost, F&& f) { - return ReduceMax(fa, IntVect(nghost), std::move(f)); + return ReduceMax(fa, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -726,13 +734,14 @@ template amrex::EnableIf_t::value, typename FAB::value_type> ReduceMax_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { - return ReduceMax_host(fa,nghost,std::move(f)); + return ReduceMax_host(fa,nghost,std::forward(f)); } template amrex::EnableIf_t::value, typename FAB::value_type> ReduceMax_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa,nghost,f); amrex::Abort("ReduceMax: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -744,9 +753,9 @@ typename FAB::value_type ReduceMax (FabArray const& fa, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceMax_device(fa,nghost,std::move(f)); + return fudetail::ReduceMax_device(fa,nghost,std::forward(f)); } else { - return fudetail::ReduceMax_host_wrapper(fa,nghost,std::move(f)); + return fudetail::ReduceMax_host_wrapper(fa,nghost,std::forward(f)); } } #else @@ -755,7 +764,7 @@ template const& fa, IntVect const& nghost, F&& f) { - return fudetail::ReduceMax_host(fa,nghost,std::move(f)); + return fudetail::ReduceMax_host(fa,nghost,std::forward(f)); } #endif @@ -764,7 +773,7 @@ template const& fa1, FabArray const& fa2, int nghost, F&& f) { - return ReduceMax(fa1, fa2, IntVect(nghost), std::move(f)); + return ReduceMax(fa1, fa2, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -834,7 +843,7 @@ amrex::EnableIf_t::value, typename FAB1::va ReduceMax_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return ReduceMax_host(fa1,fa2,nghost,std::move(f)); + return ReduceMax_host(fa1,fa2,nghost,std::forward(f)); } template @@ -842,6 +851,7 @@ amrex::EnableIf_t::value, typename FAB1::val ReduceMax_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa1,fa2,nghost,f); amrex::Abort("ReduceMax: Launch Region is off. Device lambda cannot be called by host."); return 0; }
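
The recurring change in this header is std::move(f) becoming std::forward(f) (std::forward<F>(f) before the template brackets were stripped from this listing) wherever f is a forwarding-reference parameter F&& f. A minimal, self-contained illustration of why forwarding is the right call in such pass-through wrappers:

#include <iostream>
#include <string>
#include <utility>

void sink (std::string const& s) { std::cout << "lvalue overload: " << s << '\n'; }
void sink (std::string&& s)      { std::cout << "rvalue overload: " << s << '\n'; }

template <typename T>
void wrapper (T&& s)
{
    // std::forward preserves the caller's value category; an unconditional
    // std::move would force the rvalue overload even for a caller's lvalue.
    sink(std::forward<T>(s));
}

int main ()
{
    std::string a = "kept";
    wrapper(a);                   // lvalue stays an lvalue
    wrapper(std::string("temp")); // rvalue stays an rvalue
}
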
@@ -854,9 +864,9 @@ ReduceMax (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceMax_device(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceMax_device(fa1,fa2,nghost,std::forward(f)); } else { - return fudetail::ReduceMax_host_wrapper(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceMax_host_wrapper(fa1,fa2,nghost,std::forward(f)); } } #else @@ -866,7 +876,7 @@ typename FAB1::value_type ReduceMax (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return fudetail::ReduceMax_host(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceMax_host(fa1,fa2,nghost,std::forward(f)); } #endif @@ -876,7 +886,7 @@ typename FAB1::value_type ReduceMax (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, int nghost, F&& f) { - return ReduceMax(fa1, fa2, fa3, IntVect(nghost), std::move(f)); + return ReduceMax(fa1, fa2, fa3, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -948,7 +958,7 @@ amrex::EnableIf_t::value, typename FAB1::va ReduceMax_host_wrapper (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { - return fudetail::ReduceMax_host(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceMax_host(fa1,fa2,fa3,nghost,std::forward(f)); } template @@ -956,6 +966,7 @@ amrex::EnableIf_t::value, typename FAB1::val ReduceMax_host_wrapper (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa1,fa2,fa3,nghost,f); amrex::Abort("ReduceMax: Launch Region is off. Device lambda cannot be called by host."); return 0; } @@ -968,9 +979,9 @@ ReduceMax (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceMax_device(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceMax_device(fa1,fa2,fa3,nghost,std::forward(f)); } else { - return fudetail::ReduceMax_host_wrapper(fa1,fa2,fa3,nghost,std::forward(f)); + return fudetail::ReduceMax_host_wrapper(fa1,fa2,fa3,nghost,std::forward(f)); } } #else @@ -980,7 +991,7 @@ typename FAB1::value_type ReduceMax (FabArray const& fa1, FabArray const& fa2, FabArray const& fa3, IntVect const& nghost, F&& f) { - return fudetail::ReduceMax_host(fa1,fa2,fa3,nghost,std::move(f)); + return fudetail::ReduceMax_host(fa1,fa2,fa3,nghost,std::forward(f)); } #endif @@ -989,7 +1000,7 @@ template const& fa, int nghost, F&& f) { - return ReduceLogicalAnd(fa, IntVect(nghost), std::move(f)); + return ReduceLogicalAnd(fa, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -1051,13 +1062,14 @@ template amrex::EnableIf_t::value, bool> ReduceLogicalAnd_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { - return ReduceLogicalAnd_host(fa,nghost,std::move(f)); + return ReduceLogicalAnd_host(fa,nghost,std::forward(f)); } template amrex::EnableIf_t::value, bool> ReduceLogicalAnd_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa,nghost,f); amrex::Abort("ReduceLogicalAnd: Launch Region is off.
Device lambda cannot be called by host."); return false; } @@ -1069,9 +1081,9 @@ bool ReduceLogicalAnd (FabArray const& fa, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceLogicalAnd_device(fa,nghost,std::move(f)); + return fudetail::ReduceLogicalAnd_device(fa,nghost,std::forward(f)); } else { - return fudetail::ReduceLogicalAnd_host_wrapper(fa,nghost,std::move(f)); + return fudetail::ReduceLogicalAnd_host_wrapper(fa,nghost,std::forward(f)); } } #else @@ -1080,7 +1092,7 @@ template const& fa, IntVect const& nghost, F&& f) { - return fudetail::ReduceLogicalAnd_host(fa,nghost,std::move(f)); + return fudetail::ReduceLogicalAnd_host(fa,nghost,std::forward(f)); } #endif @@ -1090,7 +1102,7 @@ bool ReduceLogicalAnd (FabArray const& fa1, FabArray const& fa2, int nghost, F&& f) { - return ReduceLogicalAnd(fa1, fa2, IntVect(nghost), std::move(f)); + return ReduceLogicalAnd(fa1, fa2, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -1157,7 +1169,7 @@ amrex::EnableIf_t::value, bool> ReduceLogicalAnd_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return ReduceLogicalAnd_host(fa1,fa2,nghost,std::move(f)); + return ReduceLogicalAnd_host(fa1,fa2,nghost,std::forward(f)); } template @@ -1165,6 +1177,7 @@ amrex::EnableIf_t::value, bool> ReduceLogicalAnd_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa1,fa2,nghost,f); amrex::Abort("ReduceLogicalAnd: Launch Region is off. Device lambda cannot be called by host."); return false; } @@ -1177,9 +1190,9 @@ ReduceLogicalAnd (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceLogicalAnd_device(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceLogicalAnd_device(fa1,fa2,nghost,std::forward(f)); } else { - return fudetail::ReduceLogicalAnd_host_wrapper(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceLogicalAnd_host_wrapper(fa1,fa2,nghost,std::forward(f)); } } #else @@ -1189,7 +1202,7 @@ bool ReduceLogicalAnd (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return fudetail::ReduceLogicalAnd_host(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceLogicalAnd_host(fa1,fa2,nghost,std::forward(f)); } #endif @@ -1198,7 +1211,7 @@ template const& fa, int nghost, F&& f) { - return ReduceLogicalOr(fa, IntVect(nghost), std::move(f)); + return ReduceLogicalOr(fa, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -1260,13 +1273,14 @@ template amrex::EnableIf_t::value, bool> ReduceLogicalOr_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { - return ReduceLogicalOr_host(fa,nghost,std::move(f)); + return ReduceLogicalOr_host(fa,nghost,std::forward(f)); } template amrex::EnableIf_t::value, bool> ReduceLogicalOr_host_wrapper (FabArray const& fa, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa,nghost,f); amrex::Abort("ReduceLogicalOr: Launch Region is off. Device lambda cannot be called by host."); return 0; }
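
For context, callers drive these reductions with a box-plus-array functor, and the result is rank-local. A usage sketch for the single-array ReduceSum overload, assuming a MultiFab that is already set up elsewhere (AMREX_LOOP_3D is the same macro used by indexFromValue later in this patch):

#include <AMReX_MultiFab.H>
#include <AMReX_FabArrayUtility.H>

// Rank-local sum of component 0 over valid cells (nghost = 0).
amrex::Real sumComp0 (amrex::MultiFab const& mf)
{
    return amrex::ReduceSum(mf, 0,
        [=] AMREX_GPU_HOST_DEVICE (amrex::Box const& bx,
                                   amrex::Array4<amrex::Real const> const& a) -> amrex::Real
        {
            amrex::Real s = 0.;
            AMREX_LOOP_3D(bx, i, j, k, { s += a(i,j,k,0); });
            return s;
        });
}

If a global value is needed, the rank-local result would still have to be combined across MPI ranks afterwards, e.g. with amrex::ParallelDescriptor::ReduceRealSum.
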
@@ -1278,9 +1292,9 @@ bool ReduceLogicalOr (FabArray const& fa, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceLogicalOr_device(fa,nghost,std::move(f)); + return fudetail::ReduceLogicalOr_device(fa,nghost,std::forward(f)); } else { - return fudetail::ReduceLogicalOr_host_wrapper(fa,nghost,std::move(f)); + return fudetail::ReduceLogicalOr_host_wrapper(fa,nghost,std::forward(f)); } } #else @@ -1289,7 +1303,7 @@ template const& fa, IntVect const& nghost, F&& f) { - return fudetail::ReduceLogicalOr_host(fa,nghost,std::move(f)); + return fudetail::ReduceLogicalOr_host(fa,nghost,std::forward(f)); } #endif @@ -1299,7 +1313,7 @@ bool ReduceLogicalOr (FabArray const& fa1, FabArray const& fa2, int nghost, F&& f) { - return ReduceLogicalOr(fa1, fa2, IntVect(nghost), std::move(f)); + return ReduceLogicalOr(fa1, fa2, IntVect(nghost), std::forward(f)); } namespace fudetail { @@ -1350,6 +1364,7 @@ ReduceLogicalOr_device (FabArray const& fa1, FabArray const& fa2, [=] AMREX_GPU_DEVICE (Box const& b) -> ReduceTuple { int tr = f(b, arr1, arr2); + return {tr}; }); } @@ -1365,7 +1380,7 @@ amrex::EnableIf_t::value, bool> ReduceLogicalOr_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return fudetail::ReduceLogicalOr_host(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceLogicalOr_host(fa1,fa2,nghost,std::forward(f)); } template @@ -1373,6 +1388,7 @@ amrex::EnableIf_t::value, bool> ReduceLogicalOr_host_wrapper (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { + amrex::ignore_unused(fa1,fa2,nghost,f); amrex::Abort("ReduceLogicalOr: Launch Region is off. Device lambda cannot be called by host."); return false; } @@ -1385,9 +1401,9 @@ ReduceLogicalOr (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { if (Gpu::inLaunchRegion()) { - return fudetail::ReduceLogicalOr_device(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceLogicalOr_device(fa1,fa2,nghost,std::forward(f)); } else { - return fudetail::ReduceLogicalOr_host_wrapper(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceLogicalOr_host_wrapper(fa1,fa2,nghost,std::forward(f)); } } #else @@ -1397,7 +1413,7 @@ bool ReduceLogicalOr (FabArray const& fa1, FabArray const& fa2, IntVect const& nghost, F&& f) { - return fudetail::ReduceLogicalOr_host(fa1,fa2,nghost,std::move(f)); + return fudetail::ReduceLogicalOr_host(fa1,fa2,nghost,std::forward(f)); } #endif @@ -1453,7 +1469,7 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp); }); @@ -1485,7 +1501,7 @@ Copy (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, in { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n); }); @@ -1517,7 +1533,7 @@ Subtract (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) -=
srcFab(i,j,k,n+srccomp); }); @@ -1549,7 +1565,7 @@ Multiply (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp); }); @@ -1581,7 +1597,7 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); }); @@ -1592,29 +1608,28 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, template ::value> > void -Abs (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int numcomp, int nghost) +Abs (FabArray& fa, int icomp, int numcomp, int nghost) { - Abs(dst,src,srccomp,dstcomp,numcomp,IntVect(nghost)); + Abs(fa,icomp,numcomp,IntVect(nghost)); } template ::value> > void -Abs (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int numcomp, const IntVect& nghost) +Abs (FabArray& fa, int icomp, int numcomp, const IntVect& nghost) { #ifdef _OPENMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for (MFIter mfi(dst,TilingIfNotGPU()); mfi.isValid(); ++mfi) + for (MFIter mfi(fa,TilingIfNotGPU()); mfi.isValid(); ++mfi) { const Box& bx = mfi.growntilebox(nghost); if (bx.ok()) { - auto const srcFab = src.array(mfi); - auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + auto const& fab = fa.array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { - dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); + fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp)); }); } } @@ -1630,6 +1645,8 @@ prefetchToHost (FabArray const& fa, const bool synchronous = true) fa.prefetchToHost(mfi); } } +#else + amrex::ignore_unused(fa,synchronous); #endif } @@ -1643,6 +1660,8 @@ prefetchToDevice (FabArray const& fa, const bool synchronous = true) fa.prefetchToDevice(mfi); } } +#else + amrex::ignore_unused(fa,synchronous); #endif } @@ -1665,7 +1684,7 @@ OverrideSync (FabArray & fa, FabArray const& msk, const Periodicity& const Box& bx = mfi.tilebox(); auto fab = fa.array(mfi); auto const ifab = msk.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { if (!ifab(i,j,k)) fab(i,j,k,n) = 0; }); @@ -1690,7 +1709,7 @@ dtoh_memcpy (FabArray& dst, FabArray const& src, for (MFIter mfi(dst); mfi.isValid(); ++mfi) { void* pdst = dst[mfi].dataPtr(dcomp); void const* psrc = src[mfi].dataPtr(scomp); - Gpu::dtoh_memcpy_async(pdst, psrc, dst[mfi].nBytes(mfi.fabbox(), dcomp, ncomp)); + Gpu::dtoh_memcpy_async(pdst, psrc, dst[mfi].nBytes(mfi.fabbox(), ncomp)); } #else Copy(dst, src, scomp, dcomp, ncomp, dst.nGrowVect()); @@ -1715,7 +1734,7 @@ htod_memcpy (FabArray& dst, FabArray const& src, for (MFIter mfi(dst); mfi.isValid(); ++mfi) { void* pdst = dst[mfi].dataPtr(dcomp); void const* psrc = src[mfi].dataPtr(scomp); - Gpu::htod_memcpy_async(pdst, psrc, dst[mfi].nBytes(mfi.fabbox(), dcomp, ncomp)); + Gpu::htod_memcpy_async(pdst, psrc, dst[mfi].nBytes(mfi.fabbox(), ncomp)); } #else Copy(dst, src, scomp, dcomp, ncomp, dst.nGrowVect()); @@ -1729,6 +1748,84 @@ htod_memcpy (FabArray& dst, FabArray const& src) htod_memcpy(dst, 
src, 0, 0, dst.nComp()); } +template ::value> > +IntVect +indexFromValue (FabArray const& mf, int comp, IntVect const& nghost, + typename FAB::value_type value) +{ + IntVect loc; + +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + int tmp[1+AMREX_SPACEDIM] = {0}; + amrex::Gpu::AsyncArray aa(tmp, 1+AMREX_SPACEDIM); + int* p = aa.data(); + // This is a device ptr to 1+AMREX_SPACEDIM int zeros. + // The first is used as an atomic bool and the others for intvect. + for (MFIter mfi(mf); mfi.isValid(); ++mfi) { + const Box& bx = amrex::grow(mfi.validbox(), nghost); + auto const& arr = mf.const_array(mfi); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + int* flag = p; + if (*flag == 0) { + if (arr(i,j,k,comp) == value) { + if (Gpu::Atomic::Exch(flag,1) == 0) { + AMREX_D_TERM(p[1] = i;, + p[2] = j;, + p[3] = k;); + } + } + } + }); + } + aa.copyToHost(tmp, 1+AMREX_SPACEDIM); + AMREX_D_TERM(loc[0] = tmp[1];, + loc[1] = tmp[2];, + loc[2] = tmp[3];); + } + else +#endif + { + bool f = false; +#ifdef _OPENMP +#pragma omp parallel +#endif + { + IntVect priv_loc = IntVect::TheMinVector(); + for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.growntilebox(nghost); + auto const& fab = mf.const_array(mfi); + AMREX_LOOP_3D(bx, i, j, k, + { + if (fab(i,j,k,comp) == value) priv_loc = IntVect(AMREX_D_DECL(i,j,k)); + }); + } + + if (priv_loc.allGT(IntVect::TheMinVector())) { + bool old; +// we should be able to test on _OPENMP < 201107 for capture (version 3.1) +// but we must work around a bug in gcc < 4.9 +#if defined(_OPENMP) && _OPENMP < 201307 // OpenMP 4.0 +#pragma omp critical (amrex_indexfromvalue) +#elif defined(_OPENMP) +#pragma omp atomic capture +#endif + { + old = f; + f = true; + } + + if (old == false) loc = priv_loc; + } + } + } + + return loc; +} + } #endif diff --git a/Src/Base/AMReX_FabConv.cpp b/Src/Base/AMReX_FabConv.cpp index 025a5b7cd6f..0f750aa6b28 100644 --- a/Src/Base/AMReX_FabConv.cpp +++ b/Src/Base/AMReX_FabConv.cpp @@ -221,7 +221,7 @@ selectOrdering (int prec, RealDescriptor* RealDescriptor::newRealDescriptor (int iot, int prec, - const char* sys, + const char* /*sys*/, int ordering) { RealDescriptor* rd = 0; @@ -239,6 +239,8 @@ RealDescriptor::newRealDescriptor (int iot, case FABio::FAB_DOUBLE: rd = new RealDescriptor(FPC::ieee_double, ord, 8); return rd; + default: + return rd; } } case FABio::FAB_NATIVE: @@ -498,7 +500,7 @@ _pd_reorder (char* arr, const int* ord) { const int MAXLINE = 16; - char local[MAXLINE]; + char local[MAXLINE] = {0}; for (int j; nitems > 0; nitems--) { diff --git a/Src/Base/AMReX_FilCC_1D_C.H b/Src/Base/AMReX_FilCC_1D_C.H index 77e12be9172..189debc6d78 100644 --- a/Src/Base/AMReX_FilCC_1D_C.H +++ b/Src/Base/AMReX_FilCC_1D_C.H @@ -12,9 +12,9 @@ inline void filcc_cell (const IntVect& iv, Array4 const& q, const int dcomp, const int numcomp, - GeometryData const& geom, const Real time, + GeometryData const& geom, const Real /*time*/, const BCRec* bcr, const int bcomp, - const int orig_comp) noexcept + const int /*orig_comp*/) noexcept { const int i = iv[0]; diff --git a/Src/Base/AMReX_FilCC_2D_C.H b/Src/Base/AMReX_FilCC_2D_C.H index 3a7508283b9..5aab33f7fc5 100644 --- a/Src/Base/AMReX_FilCC_2D_C.H +++ b/Src/Base/AMReX_FilCC_2D_C.H @@ -12,9 +12,9 @@ inline void filcc_cell (const IntVect& iv, Array4 const& q, const int dcomp, const int numcomp, - GeometryData const& geom, const Real time, + GeometryData const& geom, const Real /*time*/, const BCRec* bcr, const int bcomp, - const int orig_comp) noexcept 
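
The indexFromValue kernel above claims the first matching cell by winning an atomic exchange on a device-side flag; later matches see the flag already set and do nothing. The same claim-the-flag idiom, reduced to a host-only sketch with std::atomic and hypothetical names (the real code uses Gpu::Atomic::Exch on an AsyncArray):

#include <atomic>
#include <iostream>
#include <vector>

// Return an index of an element equal to 'value', claimed by whichever
// thread wins the exchange; -1 if no element matches.
int findValueIndex (std::vector<int> const& v, int value)
{
    std::atomic<int> flag{0};
    int loc = -1; // written by at most one thread, the exchange winner
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int i = 0; i < (int)v.size(); ++i) {
        if (v[i] == value && flag.load(std::memory_order_relaxed) == 0) {
            if (flag.exchange(1) == 0) { loc = i; }
        }
    }
    return loc;
}

int main ()
{
    std::vector<int> v{3, 5, 7, 5};
    std::cout << findValueIndex(v, 5) << '\n'; // 1 or 3; which match wins is nondeterministic
}

As with the GPU version, which of several equal cells is reported depends on thread scheduling, so the result is only unique when the value occurs once.
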
+ const int /*orig_comp*/) noexcept { const int i = iv[0]; const int j = iv[1]; diff --git a/Src/Base/AMReX_FilCC_3D_C.H b/Src/Base/AMReX_FilCC_3D_C.H index 942d07f7b4d..8a837fbf7f1 100644 --- a/Src/Base/AMReX_FilCC_3D_C.H +++ b/Src/Base/AMReX_FilCC_3D_C.H @@ -12,9 +12,9 @@ inline void filcc_cell (const IntVect& iv, Array4 const& q, const int dcomp, const int numcomp, - GeometryData const& geom, const Real time, + GeometryData const& geom, const Real /*time*/, const BCRec* bcr, const int bcomp, - const int orig_comp) noexcept + const int /*orig_comp*/) noexcept { const int i = iv[0]; const int j = iv[1]; diff --git a/Src/Base/AMReX_FilCC_C.cpp b/Src/Base/AMReX_FilCC_C.cpp index 931915b0bc6..bde017a0485 100644 --- a/Src/Base/AMReX_FilCC_C.cpp +++ b/Src/Base/AMReX_FilCC_C.cpp @@ -3,7 +3,7 @@ namespace amrex { void fab_filcc (Box const& bx, Array4 const& qn, int ncomp, - Box const& domain, Real const* dx, Real const* xlo, + Box const& domain, Real const* /*dx*/, Real const* /*xlo*/, BCRec const* bcn) { const auto lo = amrex::lbound(bx); diff --git a/Src/Base/AMReX_FilND_C.H b/Src/Base/AMReX_FilND_C.H new file mode 100644 index 00000000000..01dd9a2479c --- /dev/null +++ b/Src/Base/AMReX_FilND_C.H @@ -0,0 +1,13 @@ +#ifndef AMREX_FILL_ND_H_ +#define AMREX_FILL_ND_H_ + +#include +#include + +namespace amrex { + void fab_filnd (Box const& bx, Array4 const& q, int ncomp, + Box const& domain, Real const* dx, Real const* xlo, + BCRec const* bc); +} + +#endif diff --git a/Src/Base/AMReX_FilND_C.cpp b/Src/Base/AMReX_FilND_C.cpp new file mode 100644 index 00000000000..610f4815f99 --- /dev/null +++ b/Src/Base/AMReX_FilND_C.cpp @@ -0,0 +1,100 @@ +#include + +namespace amrex { + +void fab_filnd (Box const& bx, Array4 const& qn, int ncomp, + Box const& domain, Real const* /*dx*/, Real const* /*xlo*/, + BCRec const* bcn) +{ + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + const auto domlo = amrex::lbound(domain); + const auto domhi = amrex::ubound(domain); + + const int ilo = domlo.x; + const int ihi = domhi.x; + +#if AMREX_SPACEDIM >= 2 + const int jlo = domlo.y; + const int jhi = domhi.y; +#endif + +#if AMREX_SPACEDIM == 3 + const int klo = domlo.z; + const int khi = domhi.z; +#endif + + for (int n = 0; n < ncomp; ++n) + { + Array4 q(qn,n); + BCRec const& bc = bcn[n]; + + if (lo.x < ilo && (bc.lo(0) != BCType::int_dir)) { + const int imin = lo.x; + const int imax = ilo-1; + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = imin; i <= imax; ++i) { + q(i,j,k) = q(ilo,j,k); + }}} + } + + if (hi.x > ihi && (bc.hi(0) != BCType::int_dir)) { + const int imin = ihi+1; + const int imax = hi.x; + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = imin; i <= imax; ++i) { + q(i,j,k) = q(ihi,j,k); + }}} + } + +#if AMREX_SPACEDIM >= 2 + + if (lo.y < jlo && (bc.lo(1) != BCType::int_dir)) { + const int jmin = lo.y; + const int jmax = jlo-1; + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = jmin; j <= jmax; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + q(i,j,k) = q(i,jlo,k); + }}} + } + + if (hi.y > jhi && (bc.hi(1) != BCType::int_dir)) { + const int jmin = jhi+1; + const int jmax = hi.y; + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = jmin; j <= jmax; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + q(i,j,k) = q(i,jhi,k); + }}} + } +#endif + +#if AMREX_SPACEDIM == 3 + + if (lo.z < klo && (bc.lo(2) != BCType::int_dir)) { + const int kmin = lo.z; + const int kmax = klo-1; + for (int k = kmin; k <= kmax; 
++k) { + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + q(i,j,k) = q(i,j,klo); + }}} + } + + if (hi.z > khi && (bc.hi(2) != BCType::int_dir)) { + const int kmin = khi+1; + const int kmax = hi.z; + for (int k = kmin; k <= kmax; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + q(i,j,k) = q(i,j,khi); + }}} + } +#endif + } +} + +} diff --git a/Src/Base/AMReX_FileSystem.H b/Src/Base/AMReX_FileSystem.H new file mode 100644 index 00000000000..3a8ccc28a72 --- /dev/null +++ b/Src/Base/AMReX_FileSystem.H @@ -0,0 +1,32 @@ +#ifndef AMREX_FILE_SYSTEM_H_ +#define AMREX_FILE_SYSTEM_H_ + +#include + +#ifdef _WIN32 +typedef unsigned short mode_t; +#else +#include // for mode_t +#endif + +namespace amrex { +namespace FileSystem { + +bool +CreateDirectories (std::string const& filename, mode_t mode, bool verbose = false); + +std::string +CurrentPath (); + +bool +Exists (std::string const& filename); + +bool +Remove (std::string const& filename); + +bool +RemoveAll (std::string const& p); // recursive remove + +}} + +#endif diff --git a/Src/Base/AMReX_FileSystem.cpp b/Src/Base/AMReX_FileSystem.cpp new file mode 100644 index 00000000000..372570beecf --- /dev/null +++ b/Src/Base/AMReX_FileSystem.cpp @@ -0,0 +1,215 @@ +#include +#include +#include +#include + +#if defined(_WIN32) // || __cplusplus >= 201703L + +#include +#include + +namespace amrex { +namespace FileSystem { + +bool +CreateDirectories (std::string const& p, mode_t /*mode*/, bool verbose) +{ + std::error_code ec; + std::filesystem::create_directories(std::filesystem::path{p}, ec); + if (ec and verbose) { + amrex::AllPrint() << "amrex::UtilCreateDirectory failed to create " + << p << ": " << ec.message() << std::endl; + } + return !ec; +} + +bool +Exists (std::string const& filename) +{ + std::error_code ec; + bool r = std::filesystem::exists(std::filesystem::path{filename}, ec); + if (ec and amrex::Verbose() > 0) { + amrex::AllPrint() << "amrex::FileSystem::Exists failed. " << ec.message() << std::endl; + } + return r; +} + +std::string +CurrentPath () +{ + std::error_code ec; + auto path = std::filesystem::current_path(ec); + if (ec and amrex::Verbose() > 0) { + amrex::AllPrint() << "amrex::FileSystem::CurrentPath failed. " << ec.message() << std::endl; + } + return path.string(); +} + +bool +Remove (std::string const& filename) +{ + std::error_code ec; + bool r = std::filesystem::remove(std::filesystem::path{filename},ec); + return !ec; +} + +bool +RemoveAll (std::string const& p) +{ + std::error_code ec; + std::filesystem::remove_all(std::filesystem::path{p},ec); + return !ec; +} + +}} + +#else + +#include +#include +#include +#include +#include +#include +#include + +namespace amrex { +namespace FileSystem { + +bool +CreateDirectories (std::string const& path, mode_t mode, bool verbose) +{ + bool retVal(false); + Vector > pathError; + + const char* path_sep_str = "/"; + + if (path.length() == 0 || path == path_sep_str) { + return true; + } + + errno = 0; + + if(std::strchr(path.c_str(), *path_sep_str) == 0) { + // + // No slashes in the path. + // + errno = 0; + if(mkdir(path.c_str(), mode) < 0 && errno != EEXIST) { + retVal = false; + } else { + retVal = true; + } + pathError.push_back(std::make_pair(path, errno)); + } else { + // + // Make copy of the directory pathname so we can write to it. 
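
AMReX_FileSystem.H above declares a small path API with two backing implementations: std::filesystem on Windows and the raw POSIX calls that follow. A usage sketch with hypothetical path names, assuming only the declarations from that header:

#include <AMReX_FileSystem.H>
#include <iostream>

void demo ()
{
    // 0755 is a typical mode_t value: rwx for owner, r-x for group/other.
    if (amrex::FileSystem::CreateDirectories("plt00000/Level_0", 0755)) {
        std::cout << "created under " << amrex::FileSystem::CurrentPath() << "\n";
    }
    if (amrex::FileSystem::Exists("plt00000")) {
        amrex::FileSystem::RemoveAll("plt00000"); // recursive remove
    }
}
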
+ // + char *dir = new char[path.length() + 1]; + (void) strcpy(dir, path.c_str()); + + char *slash = std::strchr(dir, *path_sep_str); + + if(dir[0] == *path_sep_str) { // full pathname. + do { + if(*(slash+1) == 0) { + break; + } + if((slash = std::strchr(slash+1, *path_sep_str)) != 0) { + *slash = 0; + } + errno = 0; + if(mkdir(dir, mode) < 0 && errno != EEXIST) { + retVal = false; + } else { + retVal = true; + } + pathError.push_back(std::make_pair(dir, errno)); + if(slash) { + *slash = *path_sep_str; + } + } while(slash); + + } else { // relative pathname. + + do { + *slash = 0; + errno = 0; + if(mkdir(dir, mode) < 0 && errno != EEXIST) { + retVal = false; + } else { + retVal = true; + } + pathError.push_back(std::make_pair(dir, errno)); + *slash = *path_sep_str; + } while((slash = std::strchr(slash+1, *path_sep_str)) != 0); + + errno = 0; + if(mkdir(dir, mode) < 0 && errno != EEXIST) { + retVal = false; + } else { + retVal = true; + } + pathError.push_back(std::make_pair(dir, errno)); + } + + delete [] dir; + } + + if(retVal == false || verbose == true) { + for(int i(0); i < pathError.size(); ++i) { + amrex::AllPrint()<< "amrex::UtilCreateDirectory:: path errno: " + << pathError[i].first + << " :: " + << strerror(pathError[i].second) + << std::endl; + } + } + + return retVal; +} + +bool +Exists (std::string const& filename) +{ + struct stat statbuff; + return (lstat(filename.c_str(), &statbuff) != -1); +} + +std::string +CurrentPath () +{ + constexpr int bufSize = 1024; + char temp[bufSize]; + char *rCheck = getcwd(temp, bufSize); + if(rCheck == 0) { + amrex::Abort("**** Error: getcwd buffer too small."); + } + return std::string(rCheck); +} + +bool +Remove (std::string const& filename) +{ + return unlink(filename.c_str()); +} + +bool +RemoveAll (std::string const& p) +{ + if (p.size() >= 1990) { + amrex::Error("FileSystem::RemoveAll: Path name too long"); + return false; + } + char command[2000]; + std::snprintf(command, 2000, "\\rm -rf %s", p.c_str()); + int retVal = std::system(command); + if (retVal == -1 || WEXITSTATUS(retVal) != 0) { + amrex::Error("Removing old directory failed."); + return false; + } + return true; +} + +}} + +#endif diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index a0732539e49..c76f5995058 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -50,7 +50,7 @@ struct GeometryData //! Returns our rectangular domain. AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const Box& Domain () const noexcept { return domain; } - //! Returns whether the domain is periodic in the given direction. +//! Returns whether the domain is periodic in the given direction. AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int isPeriodic (const int i) const noexcept { return is_periodic[i]; } //! Coordinates type @@ -104,11 +104,13 @@ public: void define (const Box& dom, const RealBox& rb, int coord, Array const& is_per) noexcept; //! Returns the problem domain. const RealBox& ProbDomain () const noexcept { return prob_domain; } + //! Returns the roundoff domain. + const RealBox& RoundoffDomain () const noexcept { return roundoff_domain; } //! Sets the problem domain. void ProbDomain (const RealBox& rb) noexcept { prob_domain = rb; - SetOffset(rb.lo()); + computeRoundoffDomain(); } //! Returns the lo end of the problem domain in each dimension.
const Real* ProbLo () const noexcept { return prob_domain.lo(); } @@ -120,11 +122,11 @@ public: Real ProbHi (int dir) const noexcept { return prob_domain.hi(dir); } GpuArray ProbLoArray () const noexcept { - return {AMREX_D_DECL(prob_domain.lo(0),prob_domain.lo(1),prob_domain.lo(2))}; + return {{AMREX_D_DECL(prob_domain.lo(0),prob_domain.lo(1),prob_domain.lo(2))}}; } GpuArray ProbHiArray () const noexcept { - return {AMREX_D_DECL(prob_domain.hi(0),prob_domain.hi(1),prob_domain.hi(2))}; + return {{AMREX_D_DECL(prob_domain.hi(0),prob_domain.hi(1),prob_domain.hi(2))}}; } //! Returns the overall size of the domain by multiplying the ProbLength's together @@ -137,7 +139,11 @@ public: //! Returns our rectangular domain. const Box& Domain () const noexcept { return domain; } //! Sets our rectangular domain. - void Domain (const Box& bx) noexcept { domain = bx; } + void Domain (const Box& bx) noexcept + { + domain = bx; + computeRoundoffDomain(); + } //! Define a multifab of areas and volumes with given grow factor. void GetVolume (MultiFab& vol, const BoxArray& grds, @@ -190,27 +196,14 @@ public: return AMREX_D_TERM(isPeriodic(0),&&isPeriodic(1),&&isPeriodic(2)); } Array isPeriodic () const noexcept { - return {AMREX_D_DECL(static_cast(is_periodic[0]), - static_cast(is_periodic[1]), - static_cast(is_periodic[2]))}; + return {{AMREX_D_DECL(static_cast(is_periodic[0]), + static_cast(is_periodic[1]), + static_cast(is_periodic[2]))}}; } GpuArray isPeriodicArray () const noexcept { -// HIP FIX HERE - Initialization List -#ifdef AMREX_USE_HIP - GpuArray arr; - for (int i=0; i(is_periodic[i]); } - return arr; -/* - return {(int[AMREX_SPACEDIM]){AMREX_D_DECL(static_cast(is_periodic[0]), - static_cast(is_periodic[1]), - static_cast(is_periodic[2]))}}; -*/ -#else - return {AMREX_D_DECL(static_cast(is_periodic[0]), - static_cast(is_periodic[1]), - static_cast(is_periodic[2]))}; -#endif + return {{AMREX_D_DECL(static_cast(is_periodic[0]), + static_cast(is_periodic[1]), + static_cast(is_periodic[2]))}}; } //! What's period in specified direction? int period (int dir) const noexcept { BL_ASSERT(is_periodic[dir]); return domain.length(dir); } @@ -250,9 +243,9 @@ public: //! Array setPeriodicity (Array const& period) noexcept { - Array r{AMREX_D_DECL(is_periodic[0], - is_periodic[1], - is_periodic[2])}; + Array r{{AMREX_D_DECL(is_periodic[0], + is_periodic[1], + is_periodic[2])}}; AMREX_D_TERM(is_periodic[0] = period[0];, is_periodic[1] = period[1];, is_periodic[2] = period[2];); @@ -275,12 +268,41 @@ public: } } + /** + * \brief Returns true if a point is outside the roundoff domain. + * All particles with positions inside the roundoff domain + * are sure to be mapped to cells inside the Domain() box. Note that + * the same need not be true for all points inside ProbDomain(). + */ + bool outsideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const; + + /** + * \brief Returns true if a point is inside the roundoff domain. + * All particles with positions inside the roundoff domain + * are sure to be mapped to cells inside the Domain() box. Note that + * the same need not be true for all points inside ProbDomain(). + */ + bool insideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const; + + /** + * \brief Compute the roundoff domain. Public because it contains an + * extended host / device lambda. 
+ */ + void computeRoundoffDomain (); + private: void read_params (); // is_periodic and RealBox used to be static bool is_periodic[AMREX_SPACEDIM] = {AMREX_D_DECL(false,false,false)}; RealBox prob_domain; + + // Due to round-off errors, not all floating point numbers for which plo <= x < phi + // will map to a cell that is inside "domain". "roundoff_domain" stores a phi + // that is very close to that in prob_domain, and for which all floating point numbers + // inside it according to a naive inequality check will map to a cell inside domain. + RealBox roundoff_domain; + // Box domain; }; diff --git a/Src/Base/AMReX_Geometry.cpp b/Src/Base/AMReX_Geometry.cpp index eaaa84409b2..1ff6b4292c5 100644 --- a/Src/Base/AMReX_Geometry.cpp +++ b/Src/Base/AMReX_Geometry.cpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -9,9 +10,7 @@ #include #include -#ifdef _OPENMP -#include -#endif +#include namespace amrex { @@ -39,7 +38,7 @@ operator>> (std::istream& is, is >> c; IntVect is_per; is >> is_per; - g.setPeriodicity({AMREX_D_DECL(is_per[0],is_per[1],is_per[2])}); + g.setPeriodicity({{AMREX_D_DECL(is_per[0],is_per[1],is_per[2])}}); } else { g.setPeriodicity(DefaultGeometry().isPeriodic()); } @@ -104,12 +103,7 @@ Geometry::define (const Box& dom, const RealBox* rb, int coord, domain = dom; ok = true; - for (int k = 0; k < AMREX_SPACEDIM; k++) - { - offset[k] = prob_domain.lo(k); - dx[k] = prob_domain.length(k)/(Real(domain.length(k))); - inv_dx[k] = 1.0/dx[k]; - } + computeRoundoffDomain(); } void @@ -119,9 +113,7 @@ Geometry::Setup (const RealBox* rb, int coord, int const* isper) noexcept if (gg->ok) return; -#ifdef _OPENMP - BL_ASSERT(!omp_in_parallel()); -#endif + BL_ASSERT(!OpenMP::in_parallel()); ParmParse pp("geometry"); @@ -401,4 +393,62 @@ Geometry::growPeriodicDomain (int ngrow) const noexcept return b; } +void +Geometry::computeRoundoffDomain () +{ + for (int k = 0; k < AMREX_SPACEDIM; k++) + { + offset[k] = prob_domain.lo(k); + dx[k] = prob_domain.length(k)/(Real(domain.length(k))); + inv_dx[k] = 1.0/dx[k]; + } + + roundoff_domain = prob_domain; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) + { + int ilo = Domain().smallEnd(idim); + int ihi = Domain().bigEnd(idim); + Real plo = ProbLo(idim); + Real phi = ProbHi(idim); + Real idx = InvCellSize(idim); + Real deltax = CellSize(idim); + +#ifdef AMREX_SINGLE_PRECISION_PARTICLES + Real tolerance = std::max(1.e-4*deltax, 1.e-10*phi); +#else + Real tolerance = std::max(1.e-8*deltax, 1.e-14*phi); +#endif + // bisect the point at which the cell no longer maps to inside the domain + Real lo = static_cast(phi) - Real(0.5)*static_cast(deltax); + Real hi = static_cast(phi) + Real(0.5)*static_cast(deltax); + + Real mid = bisect(lo, hi, + [=] AMREX_GPU_HOST_DEVICE (Real x) -> Real + { + int i = int(Math::floor((x - plo)*idx)) + ilo; + bool inside = i >= ilo and i <= ihi; + return static_cast(inside) - Real(0.5); + }, tolerance); + roundoff_domain.setHi(idim, mid - tolerance); + } +} + +bool +Geometry::outsideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const +{ + bool outside = AMREX_D_TERM(x < roundoff_domain.lo(0) + || x >= roundoff_domain.hi(0), + || y < roundoff_domain.lo(1) + || y >= roundoff_domain.hi(1), + || z < roundoff_domain.lo(2) + || z >= roundoff_domain.hi(2)); + return outside; +} + +bool +Geometry::insideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const +{ + return !outsideRoundoffDomain(AMREX_D_DECL(x, y, z)); +} + } diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index
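
computeRoundoffDomain above finds the safe upper edge with a sign-change bisection: the predicate is +0.5 while floor((x - plo)*idx) + ilo still lands inside [ilo, ihi] and -0.5 once it does not. A self-contained sketch of that search in plain double arithmetic (amrex::bisect itself is assumed to behave like this generic version):

#include <cmath>
#include <cstdio>

// Generic bisection on [lo,hi] for a function with f(lo) > 0 > f(hi).
template <typename F>
double bisect (double lo, double hi, F f, double tol)
{
    while (hi - lo > tol) {
        double mid = 0.5*(lo + hi);
        if (f(mid) > 0.0) { lo = mid; } else { hi = mid; }
    }
    return 0.5*(lo + hi);
}

int main ()
{
    double plo = 0.0, phi = 1.0;          // hypothetical problem domain
    int ilo = 0, ihi = 127;               // index domain, 128 cells
    double dx = (phi - plo)/128.0, idx = 1.0/dx;
    auto inside = [=] (double x) {
        int i = (int)std::floor((x - plo)*idx) + ilo;
        return (i >= ilo && i <= ihi) ? 0.5 : -0.5;
    };
    double edge = bisect(phi - 0.5*dx, phi + 0.5*dx, inside, 1e-14);
    std::printf("positions below %.17g map inside the domain\n", edge);
}

Backing the stored hi off by one tolerance, as the real code does, then guarantees that any x passing the naive x < roundoff_domain.hi() check maps to a valid cell.
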
3d7ff6f9ae1..5c59d89305b 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -9,10 +9,12 @@ namespace amrex { namespace Cuda {} } #endif #include +#include #include #include #include #include +#include #include #include diff --git a/Src/Base/AMReX_GpuAllocators.H b/Src/Base/AMReX_GpuAllocators.H index 53d65d088a0..99b01f30a37 100644 --- a/Src/Base/AMReX_GpuAllocators.H +++ b/Src/Base/AMReX_GpuAllocators.H @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -20,11 +21,17 @@ namespace amrex { template struct RunOnGpu : std::false_type {}; - -#ifdef AMREX_USE_GPU - + + struct ArenaAllocatorTraits { + typedef std::true_type propagate_on_container_copy_assignment; + typedef std::true_type propagate_on_container_move_assignment; + typedef std::true_type propagate_on_container_swap; + typedef std::true_type is_always_equal; + }; + template class ArenaAllocator + : public ArenaAllocatorTraits { public : @@ -33,18 +40,19 @@ namespace amrex { inline value_type* allocate(std::size_t n) { value_type* result = nullptr; - result = (value_type*) The_Arena()->alloc(n * sizeof(T)); + result = (value_type*) The_Arena()->alloc(n * sizeof(T)); return result; } - + inline void deallocate(value_type* ptr, std::size_t) { The_Arena()->free(ptr); - } + } }; template class DeviceArenaAllocator + : public ArenaAllocatorTraits { public : @@ -53,18 +61,19 @@ namespace amrex { inline value_type* allocate(std::size_t n) { value_type* result = nullptr; - result = (value_type*) The_Device_Arena()->alloc(n * sizeof(T)); + result = (value_type*) The_Device_Arena()->alloc(n * sizeof(T)); return result; } - + inline void deallocate(value_type* ptr, std::size_t) { The_Device_Arena()->free(ptr); - } + } }; template class PinnedArenaAllocator + : public ArenaAllocatorTraits { public : @@ -73,18 +82,19 @@ namespace amrex { inline value_type* allocate(std::size_t n) { value_type* result = nullptr; - result = (value_type*) The_Pinned_Arena()->alloc(n * sizeof(T)); + result = (value_type*) The_Pinned_Arena()->alloc(n * sizeof(T)); return result; } - + inline void deallocate(value_type* ptr, std::size_t) { The_Pinned_Arena()->free(ptr); - } + } }; template class ManagedArenaAllocator + : public ArenaAllocatorTraits { public : @@ -93,14 +103,14 @@ namespace amrex { inline value_type* allocate(std::size_t n) { value_type* result = nullptr; - result = (value_type*) The_Managed_Arena()->alloc(n * sizeof(T)); + result = (value_type*) The_Managed_Arena()->alloc(n * sizeof(T)); return result; } - + inline void deallocate(value_type* ptr, std::size_t) { The_Managed_Arena()->free(ptr); - } + } }; template @@ -110,10 +120,12 @@ namespace amrex { using value_type = T; + PolymorphicAllocator () : m_use_gpu_aware_mpi(ParallelDescriptor::UseGpuAwareMpi()) {} + inline value_type* allocate(std::size_t n) { value_type* result = nullptr; - if (ParallelDescriptor::UseGpuAwareMpi()) + if (m_use_gpu_aware_mpi) { result = (value_type*) The_Device_Arena()->alloc(n * sizeof(T)); } @@ -123,10 +135,10 @@ namespace amrex { } return result; } - + inline void deallocate(value_type* ptr, std::size_t) { - if (ParallelDescriptor::UseGpuAwareMpi()) + if (m_use_gpu_aware_mpi) { The_Device_Arena()->free(ptr); } @@ -134,23 +146,27 @@ namespace amrex { { The_Pinned_Arena()->free(ptr); } - } - }; + } - template
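
The allocator changes above do two things: PolymorphicAllocator now caches UseGpuAwareMpi() once at construction, and every arena allocator inherits ArenaAllocatorTraits, whose typedefs (propagate_on_container_copy_assignment and friends) tell standard containers how to treat allocator state on copy, move, and swap. A minimal allocator skeleton with the same traits, using plain new/delete rather than the AMReX arenas, purely as an illustration:

#include <cstddef>
#include <type_traits>
#include <vector>

template <typename T>
struct TraitsAllocator
{
    using value_type = T;
    // Containers consult these when copying, moving, or swapping:
    using propagate_on_container_copy_assignment = std::true_type;
    using propagate_on_container_move_assignment = std::true_type;
    using propagate_on_container_swap            = std::true_type;
    using is_always_equal                        = std::true_type;

    TraitsAllocator () = default;
    template <typename U> TraitsAllocator (TraitsAllocator<U> const&) {}

    T* allocate (std::size_t n) { return static_cast<T*>(::operator new(n*sizeof(T))); }
    void deallocate (T* p, std::size_t) { ::operator delete(p); }
};

template <typename T, typename U>
bool operator== (TraitsAllocator<T> const&, TraitsAllocator<U> const&) { return true; }
template <typename T, typename U>
bool operator!= (TraitsAllocator<T> const&, TraitsAllocator<U> const&) { return false; }

int main ()
{
    std::vector<int, TraitsAllocator<int>> v{1, 2, 3};
    return v.size() == 3 ? 0 : 1;
}

Setting is_always_equal promises that any instance can free memory obtained from any other instance, which lets containers move buffers instead of elements; that holds for the arena-backed allocators above because the arenas are process-global singletons.
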