% BibTeX bibliography file
@InProceedings{lofstead:2011:nessie-staging,
author = {Jay Lofstead and Ron Oldfield and Todd Kordenbrock and Charles
Reiss},
title = {Extending Scalability of Collective IO Through Nessie and Staging},
booktitle = {The Petascale Data Storage Workshop at Supercomputing},
year = {2011},
month = {November},
address = {Seattle, WA},
abstract = {The increasing fidelity of scientific simulations as they scale
towards exascale sizes is straining the proven IO techniques championed
throughout terascale computing. Chief among the successful IO techniques is
the idea of collective IO where processes coordinate and exchange data prior
to writing to storage in an effort to reduce the number of small, independent
IO operations. As well as collective IO works for efficiently creating a data
set in the canonical order, 3-D domain decompositions prove troublesome due
to the amount of data exchanged prior to writing to storage. When each
process has a tiny piece of a 3-D simulation space rather than a complete
`pencil' or `plane', 2-D or 1-D domain decompositions respectively, the
communication overhead to rearrange the data can dwarf the time spent
actually writing to storage~\cite{MPIcollectiveFix}. Our approach seeks to
transparently increase scalability and performance while maintaining both the
IO routines in the application and the final data format in the storage
system. Accomplishing this leverages both the Nessie~\cite{nessie} RPC
framework and a staging area with staging services. Through these tools, we
employ a variety of data processing operations prior to invoking the native
API to write data to storage yielding as much as a 3$\times$ performance
improvement over the native calls.}
}
@InProceedings{abbasi:2007:datatap,
author = {Abbasi, Hasan and Wolf, Matthew and Schwan, Karsten},
title = {{LIVE} Data Workspace: A Flexible, Dynamic and Extensible Platform
for Petascale Applications},
booktitle = {CLUSTER '07: Proceedings of the 2007 IEEE International
Conference on Cluster Computing},
year = {2007},
pages = {341--348},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA}
}
@Conference{Abbasi:2009:datatap,
author = {Hasan Abbasi and Jay Lofstead and Fang Zheng and Scott Klasky and
Karsten Schwan and Matthew Wolf},
title = {Extending I/O Through High Performance Data Services},
booktitle = {IEEE International Conference on Cluster Computing},
year = {2009},
month = {September},
publisher = {IEEE},
address = {New Orleans, LA},
abstract = {The complexity of HPC systems has increased the burden on the
developer as applications scale to hundreds of thousands of processing cores.
I/O processing is one area where extensive efforts are required to achieve
acceptable performance and scalability. A successful approach to high
performance I/O demonstrated by our group and others is to use select nodes
for data staging, where data is evacuated from compute to staging nodes
before being moved to the disk via the file system. This paper shows that I/O
performance can be improved substantially by carefully managing how data
staging is performed, and by enriching such I/O actions through additional
"data services", lightweight abstractions for carrying out data processing
such as transformation, reduction and scheduled storage. We evaluate data
services on actual application codes within our data staging framework for
asynchronous data movement, and we also describe the impact of resource
management in synchronous environments.}
}
@InProceedings{nisar:2008:staging,
author = {Nisar, Arifa and Liao, Wei-keng and Choudhary, Alok},
title = {Scaling Parallel {I/O} Performance Through {I/O} Delegate and
Caching System},
booktitle = {SC '08: Proceedings of the 2008 ACM/IEEE Conference on
Supercomputing},
year = {2008},
pages = {1--12},
publisher = {IEEE Press},
address = {Piscataway, NJ, USA}
}
@InProceedings{zheng:2010:predata,
author = {Fang Zheng and Hasan Abbasi and Ciprian Docan and Jay Lofstead and
Scott Klasky and Qing Liu and Manish Parashar and Norbert Podhorszki and
Karsten Schwan and Matthew Wolf},
title = {{PreDatA} - Preparatory Data Analytics on {Peta-Scale} Machines},
booktitle = {Proceedings of the 24th IEEE International Parallel and
Distributed Processing Symposium (IPDPS)},
month = {April},
address = {Atlanta, GA},
year = {2010},
abstract = {Peta-scale scientific applications running on High End Computing
(HEC) platforms can generate large volumes of data. For high performance
storage and in order to be useful to science end users, such data must be
organized in its layout, indexed, sorted, and otherwise manipulated for
subsequent data presentation, visualization, and detailed analysis. In
addition, scientists desire to gain insights into selected data
characteristics `hidden' or `latent' in these massive datasets while data is
being produced by simulations. PreDatA, short for Preparatory Data Analytics,
is an approach to preparing and characterizing data while it is being
produced by the large scale simulations running on peta-scale machines. By
dedicating additional compute nodes on the machine as `staging' nodes and by
staging simulations' output data through these nodes, PreDatA can exploit
their computational power to perform select data manipulations with lower
latency than attainable by first moving data into file systems and storage.
Such in-transit manipulations are supported by the PreDatA middleware through
asynchronous data movement to reduce write latency, application-specific
operations on streaming data that are able to discover latent data
characteristics, and appropriate data reorganization and metadata annotation
to speed up subsequent data access. PreDatA enhances the scalability and
flexibility of the current I/O stack on HEC platforms and is useful for data
pre-processing, runtime data analysis and inspection, as well as for data
exchange between concurrently running simulations.}
}
@InProceedings{bent:2012:challenges,
author = {Bent, J. and Grider, G. and Kettering, B. and Manzanares, A. and
McClelland, M. and Torres, A. and Torrez, A.},
title = {Storage challenges at Los Alamos National Lab},
booktitle = {Mass Storage Systems and Technologies (MSST), 2012 IEEE 28th
Symposium on},
year = {2012},
month = {April},
pages = {1--5},
keywords = {parallel databases;HPC;IO patterns;Los Alamos national
lab;concurrent write performance;parallel IO;parallel file systems;storage
challenges;usability headaches;Bandwidth;Frequency
measurement;Hardware;Libraries;Servers;Tuning;Usability},
abstract = {There yet exist no truly parallel file systems. Those that make
the claim fall short when it comes to providing adequate concurrent write
performance at large scale. This limitation causes large usability headaches
in HPC. Users need two major capabilities missing from current parallel file
systems. One, they need low latency interactivity. Two, they need high
bandwidth for large parallel IO; this capability must be resistant to IO
patterns and should not require tuning. There are no existing parallel file
systems which provide these features. Frighteningly, exascale renders these
features even less attainable from currently available parallel file systems.
Fortunately, there is a path forward.}
}
@InProceedings{bent:2012:burst-buffer,
author = {Bent, J. and Faibish, S. and Ahrens, J. and Grider, G. and
Patchett, J. and Tzelnic, P. and Woodring, J.},
title = {Jitter-free co-processing on a prototype exascale storage stack},
booktitle = {Mass Storage Systems and Technologies (MSST), 2012 IEEE 28th
Symposium on},
year = {2012},
month = {April},
pages = {1--5},
keywords = {parallel processing;storage management;storage media;IO
forwarding;exascale storage stack;extreme scale high performance
computing;interconnect network;jitter-free coprocessing;parallel file
systems;solid state devices;spindle-based parallel file system;storage
media;Bandwidth;Computational modeling;Conferences;Data analysis;Data
visualization;Radio access networks;USA Councils},
abstract = {In the petascale era, the storage stack used by the extreme scale
high performance computing community is fairly homogeneous across sites. On
the compute edge of the stack, file system clients or IO forwarding services
direct IO over an interconnect network to a relatively small set of IO nodes.
These nodes forward the requests over a secondary storage network to a
spindle-based parallel file system. Unfortunately, this architecture will
become unviable in the exascale era. As the density growth of disks continues
to outpace increases in their rotational speeds, disks are becoming
increasingly cost-effective for capacity but decreasingly so for bandwidth.
Fortunately, new storage media such as solid state devices are filling this
gap; although not cost-effective for capacity, they are so for performance.
This suggests that the storage stack at exascale will incorporate solid state
storage between the compute nodes and the parallel file systems. There are
three natural places into which to position this new storage layer: within
the compute nodes, the IO nodes, or the parallel file system. In this paper,
we argue that the IO nodes are the appropriate location for HPC workloads and
show results from a prototype system that we have built accordingly. Running
a pipeline of computational simulation and visualization, we show that our
prototype system reduces total time to completion by up to 30%.}
}
@InProceedings{lofstead:2012:txn-metadata,
author = {Jay Lofstead and Jai Dayal},
title = {Transactional Parallel Metadata Services for Application
Workflows},
booktitle = {Proceedings of High Performance Computing Meets Databases at
Supercomputing},
year = {2012},
abstract = {Scientific simulations have a different relationship with all of
the data generated than many data analysis systems that support applications
like the Large Hadron Collider and the Sloan Sky Survey. In many cases,
simulations need to generate a large number of intermediate data sets that
ultimately are thrown away once some analysis routines are applied to the
data. This generates some summarized, derived result that inspires some
scientific insight. Traditionally, these routines use the storage array to
persist the intermediate results between each step of the data analysis
process. The volume and frequency of this data can be overwhelming compared
with the available IO bandwidth on the machine. To handle this volume and
frequency, current research efforts are determining how to move the storage
of intermediate data from the storage array into the memory of the compute
area. Then, the analysis routines are incorporated to create Integrated
Application Workflows (IAWs). Data staging techniques require some mechanism
to replace the semantics offered by the file system to control data movement
and access. As part of an HPC-focused transaction services project, a first
pass at a transactional metadata service for in compute area data storage is
being developed.}
}
@InProceedings{lofstead:2012:txn,
author = {Jay Lofstead and Jai Dayal and Karsten Schwan and Ron Oldfield},
title = {D2T: Doubly Distributed Transactions for High Performance and
Distributed Computing},
booktitle = {IEEE Cluster Conference},
year = {2012},
month = {September},
address = {Beijing, China},
abstract = {Current exascale computing projections suggest rather than a
monolithic simulation running for the majority of the machine, a collection
of components comprising the scientific discovery process will be employed in
an online workflow. This move to an online workflow scenario requires
knowledge that inter-step operations are completed and correct before the
next phase begins. Further, dynamic load balancing or fault tolerance
techniques may dynamically deploy or redeploy resources for optimal use of
computing resources. These newly configured resources should only be used if
they are successfully deployed. Our D2T system offers a mechanism to support
these kinds of operations by providing database-like transactions with
distributed servers and clients. Ultimately, with adequate hardware support,
full ACID compliance is possible for the transactions. To prove the viability
of this approach, we show that the D2T protocol has less than 1.2 seconds of
overhead using 4096 clients and 32 servers with good scaling characteristics
using this initial prototype implementation.}
}
@InProceedings{dayal:2013:io-containers,
author = {Jai Dayal and Jianting Cao and Greg Eisenhauer and Karsten Schwan
and Matthew Wolf and Fang Zheng and Hasan Abbasi and Scott Klasky and Norbert
Podhorszki and Jay Lofstead},
title = {I/O Containers: Managing the Data Analytics and Visualization
Pipelines of High End Codes},
booktitle = {Proceedings of the International Workshop on High Performance
Data Intensive Computing (HPDIC 2013), held in conjunction with IPDPS 2013},
year = {2013},
address = {Boston, MA},
note = {Best Paper Award},
abstract = {Lack of I/O scalability is known to cause measurable slowdowns
for large-scale scientific applications running on high end machines. This is
prompting researchers to devise 'I/O staging' methods in which outputs are
processed via online analysis and visualization methods to support desired
science outcomes. Organized as online workflows and carried out in I/O
pipelines, these analysis components run concurrently with science
simulations, often using a smaller set of nodes on the high end machine
termed 'staging areas'. This paper presents a new approach to dealing with
several challenges arising for such online analytics, including: how to
efficiently run multiple analytics components on staging area resources
providing them with the levels of end-to-end performance they need and how to
manage staging resources when analytics actions change due to user or
data-dependent behavior. Our approach designs and implements middleware
constructs that delineate and manage I/O pipeline resources called 'I/O
Containers'. Experimental evaluations of containers with realistic scientific
applications demonstrate the feasibility and utility of the approach.}
}
@InProceedings{lofstead:2013:pdsw-txn,
author = {Jay Lofstead and Jai Dayal and Ivo Jimenez and Carlos Maltzahn},
title = {Efficient Transactions for Parallel Data Movement},
booktitle = {The Petascale Data Storage Workshop at Supercomputing},
year = {2013},
month = {November},
address = {Denver, CO},
abstract = {The rise of Integrated Application Workflows (IAWs) for
processing data prior to storage on persistent media prompts the need to
incorporate features that reproduce many of the semantics of persistent
storage devices. One such feature is the ability to manage data sets as
chunks with natural barriers between different data sets. Towards that end,
we need a mechanism to ensure that data moved to an intermediate storage area
is both complete and correct before allowing access by other processing
components. The D2T protocol offers such a mechanism. The initial development
suffered from scalability limitations and undue requirements on server
processes. The current version has addressed these limitations and has
demonstrated scalability with low overhead.}
}
@Article{Lamport:1998:paxos,
author = {Leslie Lamport},
title = {The part-time parliament},
journal = {ACM Transactions on Computer Systems},
year = {1998},
volume = {16},
pages = {133--169}
}
@InProceedings{Hunt:2010:zookeeper,
author = {Patrick Hunt and Mahadev Konar and Flavio P. Junqueira and Benjamin
Reed},
title = {ZooKeeper: Wait-free Coordination for Internet-scale Systems},
booktitle = {USENIX Annual Technical Conference},
year = {2010}
}
@Misc{barton:2013:fastforward,
key = {Barton},
author = {E. Barton},
title = {Lustre* - Fast Forward to Exascale},
year = {2013},
month = {March},
howpublished = {Lustre User Group Summit 2013},
URL = {https://www.youtube.com/watch?v=pn_EEbmohDU}
}
@Misc{lombardi:2013:epochs,
key = {epochs},
author = {Johann Lombardi},
title = {High Level Design - Epoch Recovery, June 25th, 2013},
year = {2013},
month = {June},
howpublished = {Intel FastForward Wiki},
URL =
{https://wiki.hpdd.intel.com/download/attachments/12127153/M4.1%20Epoch\_Recovery%20v2.pdf?version=1&modificationDate=1382110631000&api=v2}
}
@InProceedings{burrows:2006:chubby,
author = {Michael Burrows},
title = {The Chubby Lock Service for Loosely-Coupled Distributed Systems},
booktitle = {OSDI},
editor = {Brian N. Bershad and Jeffrey C. Mogul},
year = {2006},
pages = {335--350},
publisher = {USENIX Association}
}
@Article{ganesh:2003:gossip-protocols,
author = {Ganesh, A.J. and Kermarrec, A.-M. and Massoulie, L.},
title = {Peer-to-peer membership management for gossip-based protocols},
journal = {IEEE Transactions on Computers},
year = {2003},
volume = {52},
number = {2},
pages = {139--149},
keywords = {Internet;multicast protocols;probability;Internet-wide
distributed applications;SCAMP;Scalable Membership
protocol;convergence;decentralized protocol;gossip-based protocols;group
communication;large-scale groups;peer-to-peer membership
management;reliability properties;scalability properties;self-organizing
protocol;Computer crashes;Helium;Internet;Knowledge management;Large-scale
systems;Multicast protocols;Peer to peer
computing;Scalability;Subscriptions;Telecommunication network reliability},
abstract = {Gossip-based protocols for group communication have attractive
scalability and reliability properties. The probabilistic gossip schemes
studied so far typically assume that each group member has full knowledge of
the global membership and chooses gossip targets uniformly at random. The
requirement of global knowledge impairs their applicability to very
large-scale groups. In this paper, we present SCAMP (Scalable Membership
protocol), a novel peer-to-peer membership protocol which operates in a fully
decentralized manner and provides each member with a partial view of the
group membership. Our protocol is self-organizing in the sense that the size
of partial views naturally converges to the value required to support a
gossip algorithm reliably. This value is a function of the group size, but is
achieved without any node knowing the group size. We propose additional
mechanisms to achieve balanced view sizes even with highly unbalanced
subscription patterns. We present the design, theoretical analysis, and a
detailed evaluation of the basic protocol and its refinements. Simulation
results show that the reliability guarantees provided by SCAMP are comparable
to previous schemes based on global knowledge. The scale of the experiments
attests to the scalability of the protocol.}
}
@InProceedings{zhang:2010:zfs,
author = {Yupu Zhang and Abhishek Rajimwale and Andrea C. Arpaci-Dusseau and
Remzi H. Arpaci-Dusseau},
title = {End-to-end Data Integrity for File Systems: A ZFS Case Study},
booktitle = {FAST},
editor = {Randal C. Burns and Kimberly Keeton},
year = {2010},
pages = {29--42},
publisher = {USENIX}
}
@misc{fastforward:2014:docs,
title={FastForward Storage and I/O Stack Design Documents},
HowPublished={Intel FastForward Wiki},
year={2014},
month={February},
note={https://wiki.hpdd.intel.com/display/PUB/Fast+Forward+Storage+and+IO+Program+Documents},
key={FastForward},
}
@string{procof = {Proceedings of}}
@string{procofthe = procof # { the }}
@string{cluster = {IEEE International Conference on Cluster Computing}}
@string{cluster2006 = procofthe # cluster}
@inproceedings{oldfield:lwfs,
Abstract = {Today's high-end massively parallel processing (MPP) machines have
thousands to tens of thousands of processors, with next-generation
systems planned to have in excess of one hundred thousand processors.
For systems of such scale, efficient I/O is a significant challenge
that cannot be solved using traditional approaches. In particular,
general purpose parallel file systems that limit applications to
standard interfaces and access policies do not scale and will likely
be a performance bottleneck for many scientific applications.
In this paper, we investigate the use of a ``lightweight'' approach
to I/O that requires the application or I/O-library developer to
extend a core set of critical I/O functionality with the minimum
set of features and services required by its target applications.
We argue that this approach allows the development of I/O libraries
that are both scalable and secure. We support our claims with preliminary
results for a lightweight checkpoint operation on a development cluster
at Sandia.},
Address = {Barcelona, Spain},
Author = {Ron A. Oldfield and Arthur B. Maccabe and Sarala Arunagiri and Todd Kordenbrock and Rolf Riesen and Lee Ward and Patrick Widener},
Booktitle = cluster2006,
Comment = {Also see extended version raoldfi:lwfs-tr.},
Date-Modified = {2011-03-31 11:35:20 -0600},
Doi = {10.1109/CLUSTR.2006.311853},
File = {SAND2006-3057.pdf:http\://gaston.sandia.gov/cfupload/ccim_pubs_prod/SAND2006-3057.pdf:PDF},
Institution = {Sandia National Laboratories},
Keywords = {lightweight storage, checkpoint, scalable-io, LWFS, pario-bib},
Month = sep,
Owner = {raoldfi},
Timestamp = {2006.05.15},
Title = {Lightweight {I/O} for Scientific Applications},
Url = {http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2006.311853},
Vitatype = {refConference},
Year = {2006},
Bdsk-Url-1 = {http://gaston.sandia.gov/cfupload/ccim_pubs_prod/SAND2006-3057.pdf}}
@inproceedings{weil:2006:ceph,
Address = {Seattle, WA},
Author = {Sage A. Weil and Scott A. Brandt and Ethan L. Miller and Darrell D. E. Long and Carlos Maltzahn},
Booktitle = {OSDI'06},
Month = nov,
Title = {{Ceph}: A Scalable, High-Performance Distributed File System},
Year = 2006}
@ONLINE{hdf5,
author = {{The HDF Group}},
title = "{Hierarchical data format version 5}",
year = {2000-2014},
note = {http://www.hdfgroup.org/HDF5}
}
@INPROCEEDINGS{nowoczynski:2008:zest,
author={Nowoczynski, P. and Stone, N. and Yanovich, J. and Sommerfield, J.},
booktitle={Petascale Data Storage Workshop, 2008. PDSW '08. 3rd},
title={Zest Checkpoint storage system for large supercomputers},
year={2008},
month={November},
pages={1--5},
abstract={The PSC has developed a prototype distributed file system infrastructure that vastly accelerates aggregated write bandwidth on large compute platforms. Write bandwidth, more than read bandwidth, is the dominant bottleneck in HPC I/O scenarios due to writing checkpoint data, visualization data and post-processing (multi-stage) data. We have prototyped a scalable solution that will be directly applicable to future petascale compute platforms having of order 10^6 cores. Our design emphasizes high-efficiency scalability, low-cost commodity components, lightweight software layers, end-to-end parallelism, client-side caching and software parity, and a unique model of load-balancing outgoing I/O onto high-speed intermediate storage followed by asynchronous reconstruction to a 3rd-party parallel file system.},
keywords={HPC I-O scenarios;asynchronous reconstruction;checkpoint storage system;client-side caching;data checkpoint;data visualization;end-to-end parallelism;high-speed intermediate storage;load-balancing;parallel file system;petascale compute platforms;post-processing data;prototype distributed file system infrastructure;software layers;software parity;checkpointing;data visualisation;input-output programs;mainframes;parallel processing;program verification;resource allocation;},
doi={10.1109/PDSW.2008.4811883},
}
@misc{JohnBent,
author = "John Bent",
year = "2014",
howpublished = "Personal communication, 2014-03-21"
}
@misc{QuinceyKoziol,
author = "Quincey Koziol",
year = "2014",
howpublished = "Personal communication, 2014-03-21"
}
@misc{EricBarton,
author = "Eric Barton",
year = "2014",
howpublished = "Personal communication, 2014-03-21"
}
@inproceedings{skourtis:2013:ssd-performance,
address = {New York, {NY}, {USA}},
series = {{INFLOW} '13},
title = {High Performance \& Low Latency in Solid-state Drives Through Redundancy},
isbn = {978-1-4503-2462-5},
url = {http://doi.acm.org/10.1145/2527792.2527798},
doi = {10.1145/2527792.2527798},
booktitle = {Proceedings of the 1st Workshop on Interactions of {NVM/FLASH} with Operating Systems and Workloads},
publisher = {{ACM}},
author = {Skourtis, Dimitris and Achlioptas, Dimitris and Maltzahn, Carlos and Brandt, Scott},
year = {2013},
keywords = {performance, {QoS}, solid-state drives, storage virtualization},
pages = {6:1--6:9}
}