-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathState.h
219 lines (181 loc) · 6.43 KB
/
State.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#ifndef __STATE_H__
#define __STATE_H__
#include "ParallelGravity.h"
/// @brief Base class for maintaining the state of a tree walk.
///
/// One State instance tracks the progress of a single walk on a
/// TreePiece: which bucket is being walked and how much outstanding
/// work (buckets, chunks, remote requests) remains before the walk
/// can be declared done.  Derived classes add walk-specific data.
class State {
public:
/// Set after our walk is finished, but we are still waiting for
/// combiner cache flushes to be processed.
int bWalkDonePending;
/// The bucket we have started to walk.
int currentBucket;
// shifted variable into state. there is an issue of redundancy
// here, though. in addition to local state, remote and remote-resume
// state also have this variable but have no use for it, since only
// a single copy is required.
// could have made this the third element in the array below
/// @brief Keep track of how many buckets are unfinished. XXX
/// note the misnomer.
int myNumParticlesPending;
// again, redundant variables, since only remote-no-resume
// walks use this variable to see how many chunks have
// been used
///
/// @brief Number of pending chunks.
///
/// The remote tree walk is divided into chunks for more parallelism.
/// A chunk is pending wrt a TreePiece until that TreePiece has
/// finished using it completely.
int numPendingChunks;
/// @brief counters to keep track of outstanding remote processor
// requests tied to each bucket (position 0) and chunk (position 1).
// NOTE(review): raw pointers; allocation/ownership is handled outside
// this class -- confirm against the TreePiece code that fills them in.
int *counterArrays[2];
// Virtual destructor so derived walk states are destroyed correctly
// when deleted through a State pointer.
virtual ~State() {}
};
#if INTERLIST_VER > 0
#if defined CUDA
#include "HostCUDA.h"
#include "DataManager.h"
#include "ck128bitHash.h"
class DoubleWalkState;
/// @brief A set of interaction lists, one per target bucket.
///
/// T is the interaction record type (e.g. ILCell for node
/// interactions, ILPart for particle interactions).  A running total
/// of interactions across all buckets is kept so the caller can
/// decide when enough work has accumulated to offload to the GPU.
template<typename T>
class GenericList{
public:
  /// lists[b] holds the interactions destined for bucket b.
  CkVec<CkVec<T> > lists;
  /// Total interactions across all buckets; compared against the
  /// offload thresholds held in DoubleWalkState.
  int totalNumInteractions;

  GenericList() : totalNumInteractions(0) {}

  /// @brief Empty every bucket list and zero the running total.
  /// Capacity of each list is retained for reuse.
  void reset(){
    // clear all bucket lists:
    for(int i = 0; i < lists.length(); i++){
      lists[i].length() = 0;
    }
    totalNumInteractions = 0;
  }

  /// @brief Release all storage held by the bucket lists and zero
  /// the running total.
  void free(){
    for(int i = 0; i < lists.length(); i++){
      lists[i].free();
    }
    lists.free();
    totalNumInteractions = 0;
  }

  /// @brief Allocate one list per bucket.
  /// @param numBuckets Number of target buckets.
  /// @param numper Capacity to reserve in each bucket's list.
  void init(int numBuckets, int numper){
    lists.resize(numBuckets);
    for(int i = 0; i < numBuckets; i++){
      lists[i].reserve(numper);
    }
  }

  /// Package the accumulated interactions into a request for the
  /// GPU.  Defined out of line.
  CudaRequest *serialize(TreePiece *tp);

  /// @brief Look up the particle range covered by a bucket.
  /// @param tp TreePiece owning the bucket.
  /// @param bucket Index of the bucket.
  /// @param bucketStart [out] Index of the bucket's first particle.
  /// @param bucketSize [out] Number of particles in the bucket.
  void getBucketParameters(TreePiece *tp,
                           int bucket,
                           int &bucketStart, int &bucketSize){
    // bucket is listed in this offload
    GenericTreeNode *bucketNode = tp->bucketList[bucket];
    bucketSize = bucketNode->lastParticle - bucketNode->firstParticle + 1;
    bucketStart = bucketNode->bucketArrayIndex;
    CkAssert(bucketStart >= 0);
  }

  /// @brief Look up the range of *active* particles of a bucket.
  /// @param tp TreePiece owning the bucket.
  /// @param bucket Index of the bucket.
  /// @param bucketStart [out] Index of the bucket's first active particle.
  /// @param bucketSize [out] Number of active particles in the bucket.
  void getActiveBucketParameters(TreePiece *tp,
                                 int bucket,
                                 int &bucketStart, int &bucketSize){
    // Use the cached active-particle info; previously this fetched
    // the bucket node (unused) and re-indexed bucketActiveInfo twice.
    BucketActiveInfo *binfo = &(tp->bucketActiveInfo[bucket]);
    bucketSize = binfo->size;
    bucketStart = binfo->start;
    CkAssert(bucketStart >= 0);
  }

  /// Append interaction ilc to bucket b's list.  Defined out of line.
  void push_back(int b, T &ilc, DoubleWalkState *state, TreePiece *tp);
};
#endif
///
/// @brief Hold state where both the targets and sources are tree walked.
///
class DoubleWalkState : public State {
public:
  /// Lists of cells to be checked for the opening criterion. One
  /// list for each level in the tree.
  CheckList *chklists;
  /// Lists of cells which need to go to the next local level before
  /// deciding if to open them.
  UndecidedLists undlists;
  /// Lists of cells to be computed. One list for each level.
  CkVec<CkVec<OffsetNode> >clists;
  /// Lists of local particles to be computed. One list for each level.
  CkVec<CkVec<LocalPartInfo> >lplists;
  /// Lists of remote particles to be computed. One list for each level.
  CkVec<CkVec<RemotePartInfo> >rplists;

  /// set once before the first TreePiece::calculateGravityRemote() is called for a chunk
  /// the idea is to place the chunkRoot (along with replicas)
  /// on the remote compute chklist only once per chunk
  ///
  /// one for each chunk
  bool *placedRoots;
  /// Distinguishes a remote-resume state from a remote-no-resume state.
  bool resume;

#ifdef CUDA
  /// Offload node interactions once this many have accumulated.
  int nodeThreshold;
  /// Offload particle interactions once this many have accumulated.
  int partThreshold;

  GenericList<ILCell> nodeLists;
  GenericList<ILPart> particleLists;

  CkVec<CudaMultipoleMoments> *nodes;
  CkVec<CompactPartData> *particles;

  // during 'small' rungs, buckets are marked when
  // they are included for computation in the request's
  // aux. particle array. these markings should be
  // cleared before the assembly of the next request is
  // begun. for this purpose, we keep track of buckets
  // marked during the construction of a request.
  //
  // NB: for large rungs, we don't mark buckets while
  // compiling requests. for such rungs, since all
  // particles are shipped at the beginning of the iteration,
  // we have them marked at that time. since all particles,
  // are available on the gpu for these rungs, we do not clear
  // the markings when requests are sent out.
  CkVec<GenericTreeNode *> markedBuckets;

  /// Map of node to index in node vector being sent to the GPU. This is
  /// used for remote nodes.
  std::unordered_map<NodeKey,int> nodeMap;
  /// Map of particle key to index in the particle vector sent to the GPU.
  std::unordered_map<NodeKey,int> partMap;

  /// @return true when enough node interactions have accumulated to
  /// warrant a GPU offload.
  bool nodeOffloadReady(){
    return nodeLists.totalNumInteractions >= nodeThreshold;
  }

  /// @return true when enough particle interactions have accumulated
  /// to warrant a GPU offload.
  bool partOffloadReady(){
    return particleLists.totalNumInteractions >= partThreshold;
  }
#endif

  /// The lowest nodes reached on paths to each bucket
  /// Used to find numBuckets completed when
  /// walk returns. Also used to find at which
  /// bucket computation should start, and which level of cell lists
  /// should be used.
  GenericTreeNode *lowestNode;
  int level;

  /// @brief Default-construct an idle walk state.
  ///
  /// placedRoots and resume were previously left uninitialized;
  /// reading an indeterminate value is undefined behavior, so they
  /// now start as NULL/false (callers that need a resume state still
  /// set resume explicitly).
  DoubleWalkState() : chklists(0), placedRoots(0), resume(false),
                      lowestNode(0), level(-1) {
#ifdef CUDA
    partMap.reserve(100);
#endif
  }

#ifdef HAPI_INSTRUMENT_WRS
  /// Record the wall-clock time at which node-list construction began.
  void nodeListConstructionTimeStart(){
    nodeListTime = CmiWallTimer();
  }
  /// @return seconds elapsed since nodeListConstructionTimeStart().
  double nodeListConstructionTimeStop(){
    return CmiWallTimer()-nodeListTime;
  }
  /// Record the wall-clock time at which particle-list construction began.
  void partListConstructionTimeStart(){
    partListTime = CmiWallTimer();
  }
  /// @return seconds elapsed since partListConstructionTimeStart().
  double partListConstructionTimeStop(){
    return CmiWallTimer()-partListTime;
  }

private:
  // These timestamps were used above but never declared, which is a
  // compile error whenever HAPI_INSTRUMENT_WRS is defined.
  double nodeListTime;
  double partListTime;
#endif
};
#endif // INTERLIST_VER
/// @brief A walk state with no data beyond the bookkeeping in the
/// base State class; used by walks that need no extra state.
class NullState : public State {
};
#endif