diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dbfbde67ca..1c459b16fe2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,27 @@ Change Log / Release Log for mallocMC ================================================================ +2.1.0crp +------------- +**Date:** 2015-02-11 + +This release fixes some bugs that occured after the release of 2.0.1crp and reduces the interface to improve interoperability with the default CUDA allocator. +We closed all issues documented in +[Milestone *New Features*](https://github.com/ComputationalRadiationPhysics/mallocMC/issues?milestone=3&state=closed) + +### Changes to mallocMC 2.0.1crp + +**Features** + - the possibility to overwrite the default implementation of new/delete and malloc/free was removed #72. **This changes the interface**, since users are now always forced to call `mallocMC::malloc()` and `mallocMC::free()`. This is intended to improve readability and allows to use the CUDA allocator inside mallocMC. + - the policy *Scatter* now places the onpagetables data structure at the end of a page. This can greatly improve performance when using large pages and `resetfreedpages=true` #80 + +**Bug fixes** + - in the policy *Scatter*, `fullsegments` and `additional_chunks` could grow too large in certain configurations #79 + +**Misc:** + - See the full changes at https://github.com/ComputationalRadiationPhysics/mallocMC/compare/2.0.1crp...2.1.0crp + + 2.0.1crp ------------- **Date:** 2015-01-13 diff --git a/LICENSE b/LICENSE index e39825eb2b7..bfb94154545 100644 --- a/LICENSE +++ b/LICENSE @@ -9,8 +9,8 @@ Copyright (C) 2012 Institute for Computer Graphics and Vision, Graz University of Technology - Copyright (C) 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf + Copyright (C) 2014-2015 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at Bernhard Kainz - kainz ( at ) icg.tugraz.at diff --git a/Usage.md b/Usage.md index f9020bbdbf1..e8f9d1fb6e3 100644 --- a/Usage.md +++ b/Usage.md @@ -94,27 +94,18 @@ To create a default instance of the ScatterAllocator type and add the necessary functions, the following Macro has to be executed: ```c++ -POLICYMALLOC_SET_ALLOCATOR_TYPE(ScatterAllocator) +MALLOCMC_SET_ALLOCATOR_TYPE(ScatterAllocator) ``` This will set up the following functions in the namespace `mallocMC`: | Name | description | |-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| InitHeap() | Initializes the heap. Must be called before any other calls to the allocator are permitted. Can take the desired size of the heap as a parameter | -| finalizeHeap() | Destroys the heap again | -| pbMalloc() / malloc() | Allocates memory on the accelerator | -| pbFree() / free() | Frees memory on the accelerator | -| getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chose CreationPolicy supports it (can be found through `mallocMC::Traits::providesAvailableSlots`) | - -If the policy class `OldMalloc` is **not** used, it is also possible to execute -the Macro -```c++ -POLICYMALLOC_OVERWRITE_MALLOC() -``` - -which will overwrite the global functions `malloc()`/`free()` on the accelerator -(for NVIDIA CUDA accelerators, this will also replace calls to `new` and `delete`). +| mallocMC::initHeap() | Initializes the heap. Must be called before any other calls to the allocator are permitted. Can take the desired size of the heap as a parameter | +| mallocMC::finalizeHeap() | Destroys the heap again | +| mallocMC::malloc() | Allocates memory on the accelerator | +| mallocMC::free() | Frees memory on the accelerator | +| mallocMC::getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits::providesAvailableSlots`) | Step 4: use dynamic memory allocation diff --git a/examples/mallocMC_example01.cu b/examples/mallocMC_example01.cu index 7cf5d3dce64..2ea4413676b 100644 --- a/examples/mallocMC_example01.cu +++ b/examples/mallocMC_example01.cu @@ -105,7 +105,7 @@ void run() size_t block = 32; size_t grid = 32; int length = 100; - assert(length<= block*grid); //necessary for used algorithm + assert((unsigned)length<= block*grid); //necessary for used algorithm //init the heap std::cerr << "initHeap..."; diff --git a/examples/mallocMC_example02.cu b/examples/mallocMC_example02.cu index 167c1bed31b..58b971a6b18 100644 --- a/examples/mallocMC_example02.cu +++ b/examples/mallocMC_example02.cu @@ -92,9 +92,6 @@ typedef mallocMC::Allocator< // use "ScatterAllocator" as mallocMC MALLOCMC_SET_ALLOCATOR_TYPE(ScatterAllocator) -// replace all standard malloc()-calls on the device by mallocMC calls -// This will not work with the CreationPolicy "OldMalloc"! -MALLOCMC_OVERWRITE_MALLOC() /////////////////////////////////////////////////////////////////////////////// // End of mallocMC configuration @@ -108,7 +105,7 @@ int main() cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); - if( deviceProp.major < 2 ) { + if( deviceProp.major < int(2) ) { std::cerr << "Error: Compute Capability >= 2.0 required. (is "; std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl; return 1; @@ -128,20 +125,18 @@ __device__ int** c; __global__ void createArrays(int x, int y){ - a = (int**) malloc(sizeof(int*) * x*y); - b = (int**) malloc(sizeof(int*) * x*y); - c = (int**) malloc(sizeof(int*) * x*y); + a = (int**) mallocMC::malloc(sizeof(int*) * x*y); + b = (int**) mallocMC::malloc(sizeof(int*) * x*y); + c = (int**) mallocMC::malloc(sizeof(int*) * x*y); } __global__ void fillArrays(int length, int* d){ int id = threadIdx.x + blockIdx.x*blockDim.x; - // using the MALLOCMC_OVERWRITE_MALLOC() macro - // allows also the use of "new" - a[id] = new int[length]; - b[id] = new int[length]; - c[id] = new int[length]; + a[id] = (int*) mallocMC::malloc(sizeof(int)*length); + b[id] = (int*) mallocMC::malloc(sizeof(int)*length); + c[id] = (int*) mallocMC::malloc(sizeof(int)*length); for(int i=0 ; i HierarchyThreshold ? 0 : (pagesize + (minSegmentSize-1)) / minSegmentSize; + static const uint32 maxOnPageMasks = 32 > tmp_maxOPM ? tmp_maxOPM : 32; #ifndef MALLOCMC_CP_SCATTER_HASHINGK #define MALLOCMC_CP_SCATTER_HASHINGK static_cast(HashingProperties::hashingK::value) @@ -190,8 +193,7 @@ namespace ScatterKernelDetail{ __device__ void init() { //clear the entire data which can hold bitfields - uint32 first_possible_metadata = 32*HierarchyThreshold; - uint32* write = (uint32*)(data+(pagesize-first_possible_metadata)); + uint32* write = (uint32*)(data + pagesize - (int)(sizeof(uint32)*maxOnPageMasks)); while(write < (uint32*)(data + pagesize)) *write++ = 0; } @@ -239,6 +241,17 @@ namespace ScatterKernelDetail{ return (spot + step) % spots; } + + /** + * onPageMasksPosition returns a pointer to the beginning of the onpagemasks inside a page. + * @param page the page that holds the masks + * @param the number of hierarchical page tables (bitfields) that are used inside this mask. + * @return pointer to the first address inside the page that holds metadata bitfields. + */ + __device__ inline uint32* onPageMasksPosition(uint32 page, uint32 nMasks){ + return (uint32*)(_page[page].data + pagesize - (int)sizeof(uint32)*nMasks); + } + /** * usespot marks finds one free spot in the bitfield, marks it and returns its offset * @param bitfield pointer to the bitfield to use @@ -263,6 +276,25 @@ namespace ScatterKernelDetail{ } } + + /** + * calcAdditionalChunks determines the number of chunks that are contained in the last segment of a hierarchical page + * + * The additional checks are necessary to ensure correct results for very large pages and small chunksizes + * + * @param fullsegments the number of segments that can be completely filled in a page. This may NEVER be bigger than 32! + * @param segmentsize the number of bytes that are contained in a completely filled segment (32 chunks) + * @param chunksize the chosen allocation size within the page + * @return the number of additional chunks that will not fit in one of the fullsegments. For any correct input, this number is smaller than 32 + */ + __device__ inline uint32 calcAdditionalChunks(uint32 fullsegments, uint32 segmentsize, uint32 chunksize){ + if(fullsegments != 32){ + return max(0,(int)pagesize - (int)fullsegments*segmentsize - (int)sizeof(uint32))/chunksize; + }else + return 0; + } + + /** * addChunkHierarchy finds a free chunk on a page which uses bit fields on the page * @param chunksize the chunksize of the page @@ -279,7 +311,7 @@ namespace ScatterKernelDetail{ if((mask & (1 << spot)) != 0) spot = nextspot(mask, spot, segments); uint32 tries = segments - __popc(mask); - uint32* onpagemasks = (uint32*)(_page[page].data + chunksize*(fullsegments*32 + additional_chunks)); + uint32* onpagemasks = onPageMasksPosition(page,segments); for(uint32 i = 0; i < tries; ++i) { int hspot = usespot(onpagemasks + spot, spot < fullsegments ? 32 : additional_chunks); @@ -327,10 +359,8 @@ namespace ScatterKernelDetail{ { //more chunks than can be covered by the pte's single bitfield can be used uint32 segmentsize = chunksize*32 + sizeof(uint32); - uint32 fullsegments = 0; - uint32 additional_chunks = 0; - fullsegments = pagesize / segmentsize; - additional_chunks = max(0,(int)pagesize - (int)fullsegments*segmentsize - (int)sizeof(uint32))/chunksize; + uint32 fullsegments = min(32,pagesize / segmentsize); + uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize); if(filllevel < fullsegments * 32 + additional_chunks) chunk_ptr = addChunkHierarchy(chunksize, fullsegments, additional_chunks, page); } @@ -437,12 +467,13 @@ namespace ScatterKernelDetail{ { //one more level in hierarchy uint32 segmentsize = chunksize*32 + sizeof(uint32); - uint32 fullsegments = pagesize / segmentsize; - uint32 additional_chunks = max(0,(int)(pagesize - fullsegments*segmentsize) - (int)sizeof(uint32))/chunksize; + uint32 fullsegments = min(32,pagesize / segmentsize); + uint32 additional_chunks = calcAdditionalChunks(fullsegments,segmentsize,chunksize); uint32 segment = inpage_offset / (chunksize*32); uint32 withinsegment = (inpage_offset - segment*(chunksize*32))/chunksize; //mark it as free - uint32* onpagemasks = (uint32*)(_page[page].data + chunksize*(fullsegments*32 + additional_chunks)); + uint32 nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32* onpagemasks = onPageMasksPosition(page,nMasks); uint32 old = atomicAnd(onpagemasks + segment, ~(1 << withinsegment)); // always do this, since it might fail due to a race-condition with addChunkHierarchy @@ -819,8 +850,8 @@ namespace ScatterKernelDetail{ if(chunksize <= HierarchyThreshold) { uint32 segmentsize = chunksize*32 + sizeof(uint32); //each segment can hold 32 2nd-level chunks - uint32 fullsegments = pagesize / segmentsize; //there might be space for more than 32 segments with 32 2nd-level chunks - uint32 additional_chunks = max(0,(int)pagesize - (int)fullsegments*segmentsize - (int)sizeof(uint32))/chunksize; + uint32 fullsegments = min(32,pagesize / segmentsize); //there might be space for more than 32 segments with 32 2nd-level chunks + uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize); uint32 level2Chunks = fullsegments * 32 + additional_chunks; return level2Chunks - filledChunks; }else{ diff --git a/src/include/mallocMC/mallocMC_overwrites.hpp b/src/include/mallocMC/mallocMC_overwrites.hpp index 97c2c52adc8..24a71e9c730 100644 --- a/src/include/mallocMC/mallocMC_overwrites.hpp +++ b/src/include/mallocMC/mallocMC_overwrites.hpp @@ -82,28 +82,6 @@ bool providesAvailableSlots(){ \ } /* end namespace mallocMC */ - -/** Create the functions mallocMC() and mcfree() inside a namespace - * - * This allows to use a function without bothering with name-clashes when - * including a namespace in the global scope. It will call the namespaced - * version of malloc() inside. - */ -#define MALLOCMC_MALLOCMC() \ -namespace mallocMC{ \ -MAMC_ACCELERATOR \ -void* mallocMC(size_t t) __THROW \ -{ \ - return mallocMC::malloc(t); \ -} \ -MAMC_ACCELERATOR \ -void mcfree(void* p) __THROW \ -{ \ - mallocMC::free(p); \ -} \ -} /* end namespace mallocMC */ - - /** Create the functions malloc() and free() inside a namespace * * This allows for a peaceful coexistence between different functions called @@ -126,87 +104,13 @@ void free(void* p) __THROW \ -/** Override/replace the global implementation of placement new/delete on CUDA - * - * These overrides are for device-side new and delete and need a pointer to the - * memory-allocator object on device (this will be mostly useful when using - * more advanced techniques and managing your own global object instead of - * using the provided macros). - * - * @param h the allocator as returned by initHeap() - */ -#ifdef __CUDACC__ -#if __CUDA_ARCH__ >= 200 -#define MALLOCMC_OVERWRITE_NEW() \ -MAMC_ACCELERATOR \ -void* operator new(size_t t, mallocMC::mallocMCType &h) \ -{ \ - return h.alloc(t); \ -} \ -MAMC_ACCELERATOR \ -void* operator new[](size_t t, mallocMC::mallocMCType &h) \ -{ \ - return h.alloc(t); \ -} \ -MAMC_ACCELERATOR \ -void operator delete(void* p, mallocMC::mallocMCType &h) \ -{ \ - h.dealloc(p); \ -} \ -MAMC_ACCELERATOR \ -void operator delete[](void* p, mallocMC::mallocMCType &h) \ -{ \ - h.dealloc(p); \ -} -#endif -#endif - - - -/** Override/replace the global implementation of malloc/free on CUDA devices - * - * Attention: This will also replace "new", "new[]", "delete" and "delete[]", - * since CUDA uses the same malloc/free functions for that. Needs at least - * ComputeCapability 2.0 - */ -#ifdef __CUDACC__ -#if __CUDA_ARCH__ >= 200 -#define MALLOCMC_OVERWRITE_MALLOC() \ -MAMC_ACCELERATOR \ -void* malloc(size_t t) __THROW \ -{ \ - return mallocMC::malloc(t); \ -} \ -MAMC_ACCELERATOR \ -void free(void* p) __THROW \ -{ \ - mallocMC::free(p); \ -} -#endif -#endif - - - /* if the defines do not exist (wrong CUDA version etc), * create at least empty defines */ -#ifndef MALLOCMC_MALLOCMC -#define MALLOCMC_MALLOCMC() -#endif - #ifndef MALLOCMC_MALLOC #define MALLOCMC_MALLOC() #endif -#ifndef MALLOCMC_OVERWRITE_NEW -#define MALLOCMC_OVERWRITE_NEW() -#endif - -#ifndef MALLOCMC_OVERWRITE_MALLOC -#define MALLOCMC_OVERWRITE_MALLOC() -#endif - - /** Set up the global variables and functions * @@ -217,8 +121,4 @@ void free(void* p) __THROW \ #define MALLOCMC_SET_ALLOCATOR_TYPE(MALLOCMC_USER_DEFINED_TYPE) \ MALLOCMC_GLOBAL_FUNCTIONS(MALLOCMC_USER_DEFINED_TYPE) \ MALLOCMC_MALLOC() \ -MALLOCMC_MALLOCMC() \ MALLOCMC_AVAILABLESLOTS() - -//MALLOCMC_OVERWRITE_NEW() - diff --git a/src/include/mallocMC/version.hpp b/src/include/mallocMC/version.hpp index 1a7ad800d5b..cbbfe7b5b47 100644 --- a/src/include/mallocMC/version.hpp +++ b/src/include/mallocMC/version.hpp @@ -5,8 +5,8 @@ Copyright (C) 2012 Institute for Computer Graphics and Vision, Graz University of Technology - Copyright (C) 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf + Copyright (C) 2014-2015 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at Bernhard Kainz - kainz ( at ) icg.tugraz.at @@ -38,8 +38,8 @@ /** the mallocMC version: major API changes should be reflected here */ #define MALLOCMC_VERSION_MAJOR 2 -#define MALLOCMC_VERSION_MINOR 0 -#define MALLOCMC_VERSION_PATCH 1 +#define MALLOCMC_VERSION_MINOR 1 +#define MALLOCMC_VERSION_PATCH 0 /** the mallocMC flavor is used to differenciate the releases of the * Computational Radiation Physics group (crp) from other releases diff --git a/tests/verify_heap.cu b/tests/verify_heap.cu index 9101a7c9d19..97096eb7e09 100644 --- a/tests/verify_heap.cu +++ b/tests/verify_heap.cu @@ -50,9 +50,8 @@ #include "src/include/mallocMC/mallocMC_utils.hpp" #include "verify_heap_config.hpp" -//use ScatterAllocator to replace malloc/free +//use ScatterAllocator MALLOCMC_SET_ALLOCATOR_TYPE(ScatterAllocator) -MALLOCMC_OVERWRITE_MALLOC() // global variable for verbosity, might change due to user input '--verbose' bool verbose = false;