Skip to content

Commit

Permalink
radiation plugin: add new option
Browse files Browse the repository at this point in the history
Add runtime option `numTmpResults` to increase independent work on a
device.
If set to one, the code behaves as before this PR.
The default is two, to utilize modern GPU devices with a
typical configuration.
  • Loading branch information
psychocoderHPC committed Sep 22, 2020
1 parent 0b0e37a commit 315bd9a
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 11 deletions.
3 changes: 2 additions & 1 deletion docs/TBG_macros.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ TBG_stopWindow="--stopWindow 1337"
#--<species>_radiation.end Time step to stop calculating the radiation
#--<species>_radiation.radPerGPU If flag is set, each GPU stores its own spectra without summing the entire simulation area
#--<species>_radiation.folderRadPerGPU Folder where the GPU specific spectra are stored
#--e_<species>_radiation.compression If flag is set, the hdf5 output will be compressed.
#--<species>_radiation.compression If flag is set, the hdf5 output will be compressed.
#--<species>_radiation.numJobs Number of independent jobs used for the radiation calculation.
TBG_radiation="--<species>_radiation.period 1 --<species>_radiation.dump 2 --<species>_radiation.totalRadiation \
--<species>_radiation.lastRadiation --<species>_radiation.start 2800 --<species>_radiation.end 3000"

Expand Down
8 changes: 7 additions & 1 deletion docs/source/usage/plugins/radiation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,11 @@ Command line option Description
``--<species>_radiation.folderRadPerGPU`` Name of the folder, where the GPU specific spectra are stored.
Default: ``radPerGPU``
``--<species>_radiation.compression`` If set, the hdf5 output is compressed.
``--<species>_radiation.numJobs`` Number of independent jobs used for the radiation calculation.
This option is used to increase the utilization of the device by producing more independent work.
This option enables accumulation of data in parallel into multiple temporary arrays, thereby increasing the utilization of
the device at the cost of an increased memory footprint.
Default: ``2``
========================================= ==============================================================================================================================

Memory Complexity
Expand All @@ -295,7 +300,8 @@ Memory Complexity
Accelerator
"""""""""""

each energy bin times each coordinate bin allocates one counter (``float_X``) permanently and on each accelerator.
Locally, ``numJobs`` times the number of frequencies ``N_omega`` times the number of directions ``N_theta`` result elements are permanently allocated.
Each result element (amplitude) is a double precision complex number.

Host
""""
Expand Down
33 changes: 27 additions & 6 deletions include/picongpu/plugins/radiation/Radiation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ namespace idLabels
}// end namespace idLabels



///////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////// Radiation Plugin Class ////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////
Expand All @@ -97,8 +96,10 @@ class Radiation : public ISimulationPlugin
* frequency. Layout of the radiation array is:
* [omega_1(theta_1),omega_2(theta_1),...,omega_N-omega(theta_1),
* omega_1(theta_2),omega_2(theta_2),...,omega_N-omega(theta_N-theta)]
* The second dimension is used to store intermediate results if command
* line option numJobs is > 1.
*/
GridBuffer<Amplitude, DIM1> *radiation;
GridBuffer<Amplitude, 2> *radiation;
radiation_frequencies::InitFreqFunctor freqInit;
radiation_frequencies::FreqFunctor freqFkt;

Expand All @@ -119,6 +120,7 @@ class Radiation : public ISimulationPlugin
bool radPerGPU;
std::string folderRadPerGPU;
DataSpace<simDim> lastGPUpos;
int numJobs;

/**
* Data structure for storage and summation of the intermediate values of
Expand Down Expand Up @@ -214,7 +216,8 @@ class Radiation : public ISimulationPlugin
((pluginPrefix + ".end").c_str(), po::value<uint32_t > (&radEnd)->default_value(0), "time index when radiation should end with calculation")
((pluginPrefix + ".radPerGPU").c_str(), po::bool_switch(&radPerGPU), "enable radiation output from each GPU individually")
((pluginPrefix + ".folderRadPerGPU").c_str(), po::value<std::string > (&folderRadPerGPU)->default_value("radPerGPU"), "folder in which the radiation of each GPU is written")
((pluginPrefix + ".compression").c_str(), po::bool_switch(&compressionOn), "enable compression of hdf5 output");
((pluginPrefix + ".compression").c_str(), po::bool_switch(&compressionOn), "enable compression of hdf5 output")
((pluginPrefix + ".numJobs").c_str(), po::value<int > (&numJobs)->default_value(2), "Number of independent jobs used for the radiation calculation.");
}


Expand Down Expand Up @@ -282,13 +285,22 @@ class Radiation : public ISimulationPlugin
{
if(!notifyPeriod.empty())
{
if(numJobs <= 0)
{
std::cerr << "'numJobs' must be '>=1' value is adjusted from" << numJobs << " to '1'." << std::endl;
numJobs = 1;
}
// allocate memory for all amplitudes for temporal data collection
tmp_result = new Amplitude[elements_amplitude()];

/*only rank 0 create a file*/
isMaster = reduce.hasResult(mpi::reduceMethods::Reduce());

radiation = new GridBuffer<Amplitude, DIM1 > (DataSpace<DIM1 > (elements_amplitude())); //create one int on GPU and host
/* Buffer for GPU results.
* The second dimension is used to store intermediate results if command
* line option numJobs is > 1.
*/
radiation = new GridBuffer<Amplitude, 2> (DataSpace<2>(elements_amplitude(), numJobs));

freqInit.Init(frequencies_from_list::listLocation);
freqFkt = freqInit.getFunctor();
Expand Down Expand Up @@ -387,6 +399,15 @@ class Radiation : public ISimulationPlugin
{
radiation->deviceToHost();
__getTransactionEvent().waitForFinished();

auto dbox = radiation->getHostBuffer().getDataBox();
int numAmp = elements_amplitude();
// update the main result matrix (y index zero)
for( int resultIdx = 1; resultIdx < numJobs; ++resultIdx )
for( int ampIdx = 0; ampIdx < numAmp; ++ampIdx )
{
dbox(DataSpace< 2 >( ampIdx, 0 ) ) += dbox(DataSpace< 2 >( ampIdx, resultIdx ) );
}
}


Expand Down Expand Up @@ -1188,8 +1209,8 @@ class Radiation : public ISimulationPlugin
PMACC_KERNEL( KernelRadiationParticles<
numWorkers
>{} )(
gridDim_rad,
numWorkers
DataSpace< 2 >(gridDim_rad, numJobs),
DataSpace< 2 >(numWorkers,1)
)(
/*Pointer to particles memory on the device*/
particles->getDeviceParticlesBox(),
Expand Down
8 changes: 5 additions & 3 deletions include/picongpu/plugins/radiation/Radiation.kernel
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,13 @@ namespace radiation
// get absolute number of relevant super cells
int const numSuperCells = superCellsCount.productOfComponents();

int const numJobs = cupla::gridDim(acc).y;
int const jobIdx = cupla::blockIdx(acc).y;

/* go over all super cells on GPU
/* go over all super cells on GPU with a stride depending on number of temporary results
* but ignore all guarding supercells
*/
for( int super_cell_index = 0; super_cell_index <= numSuperCells; ++super_cell_index )
for( int super_cell_index = jobIdx; super_cell_index <= numSuperCells; super_cell_index += numJobs )
{
// select SuperCell and add one sided guard again
DataSpace< simDim > const superCell =
Expand Down Expand Up @@ -481,7 +483,7 @@ namespace radiation
* - from this (one) time step
* - omega_id = theta_idx * radiation_frequencies::N_omega + o
*/
radiation[ theta_idx * radiation_frequencies::N_omega + o] += amplitude;
radiation( DataSpace< 2 >(theta_idx * radiation_frequencies::N_omega + o, jobIdx ) ) += amplitude;

} // end frequency loop

Expand Down

0 comments on commit 315bd9a

Please sign in to comment.