@Article{ 2016arxiv160300491d,
author = {{Dukhan}, M. and {Vuduc}, R. and {Riedy}, J.},
title = "{Wanted: Floating-Point Add Round-off Error instruction}",
journal = {ArXiv e-prints},
archiveprefix = "arXiv",
eprint = {1603.00491},
primaryclass = "cs.NA",
keywords = {Computer Science - Numerical Analysis, Computer Science -
Performance},
year = 2016,
month = mar,
adsurl = {http://adsabs.harvard.edu/abs/2016arXiv160300491D},
adsnote = {Provided by the SAO/NASA Astrophysics Data System},
url = {http://arxiv.org/abs/1603.00491}
}
@Misc{ acs-2017,
author = {Jason Riedy},
title = {High-Performance Analysis of Streaming Graphs},
howpublished = {HPC Analytic Workshop},
month = jun,
year = 2017,
dom = 28,
address = {Hanover, MD},
url = {https://www.slideshare.net/jasonriedy/highperformance-analysis-of-streaming-graphs-77348572}
}
@Misc{ an12-streaming-ms,
file = {material/siam-an-2012.pdf},
author = {David A. Bader and David Ediger and Jason Riedy},
ejr-withauthor= {David A. Bader and David Ediger},
title = {Streaming Graph Analytics for Massive Graphs},
howpublished = {SIAM Annual Meeting},
dom = 10,
month = jul,
year = {2012},
url = {http://www.slideshare.net/jasonriedy/streaming-graph-analytics-for-massive-graphs},
role = {presentation},
tags = {siam; streaming data; parallel algorithms},
address = {Minneapolis, MN},
abstract = {Emerging real-world graph problems include detecting
community structure in large social networks, improving the
resilience of the electric power grid, and detecting and
preventing disease in human populations. The volume and
richness of data combined with its rate of change renders
monitoring properties at scale by static recomputation
infeasible. We approach these problems with massive,
fine-grained parallelism across different shared memory
architectures both to compute solutions and to explore the
sensitivity of these solutions to natural bias and
omissions within the data.}
}
@InProceedings{ arith-lang,
author = {David Hough and Bill Hay and Jeff Kidder and E. Jason
Riedy and Guy L. Steele Jr. and Jim Thomas},
ejr-withauthor= {David Hough and Bill Hay and Jeff Kidder and Guy L. Steele
Jr. and Jim Thomas},
title = {Arithmetic Interactions: From Hardware to Applications},
booktitle = {17th {IEEE} Symposium on Computer Arithmetic
({ARITH}'05)},
year = {2005},
dom = 28,
month = jun,
note = {See
\href{http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf}{related
presentation}},
isbn = "0-7695-2366-8",
role = {proceedings; panel},
tags = {ieee754; floating point},
doi = {10.1109/ARITH.2005.10},
abstract = {The entire process of creating and executing applications
that solve interesting problems with acceptable cost and
accuracy involves a complex interaction among hardware,
system software, programming environments, mathematical
software libraries, and applications software, all mediated
by standards for arithmetic, operating systems, and
programming environments. This panel will discuss various
issues arising among these various contending points of
view, sometimes from the point of view of issues raised
during the current IEEE 754R standards revision effort.}
}
@TechReport{ axb-itref-lawn,
author = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S.
Li and Sonil Mukherjee and E. Jason Riedy},
ejr-withauthor= {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S.
Li and Sonil Mukherjee},
title = {Error bounds from extra-precise iterative refinement},
type = {LAPACK Working Note},
institution = {Netlib},
year = {2005},
number = {165},
lawn = {165},
month = feb,
dom = 3,
role = {techreport},
tags = {lawn; lapack; linear algebra; floating point},
other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
note = {Also issued as UCB//CSD-05-1414, UT-CS-05-547, and
LBNL-56965; expanded from TOMS version},
url = {http://www.netlib.org/lapack/lawnspdf/lawn165.pdf},
file = {material/lawn165.pdf},
abstract = {We present the design and testing of an algorithm for
iterative refinement of the solution of linear equations,
where the residual is computed with extra precision. This
algorithm was originally proposed in the 1960s [6, 22] as a
means to compute very accurate solutions to all but the
most ill-conditioned linear systems of equations. However
two obstacles have until now prevented its adoption in
standard subroutine libraries like LAPACK: (1) There was no
standard way to access the higher precision arithmetic
needed to compute residuals, and (2) it was unclear how to
compute a reliable error bound for the computed solution.
The completion of the new BLAS Technical Forum Standard [5]
has recently removed the first obstacle. To overcome the
second obstacle, we show how a single application of
iterative refinement can be used to compute an error bound
in any norm at small cost, and use this to compute both an
error bound in the usual infinity norm, and a componentwise
relative error bound. We report extensive test results on
over 6.2 million matrices of dimension 5, 10, 100, and
1000. As long as a normwise (resp. componentwise) condition
number computed by the algorithm is less than $1 /
\operatorname{max}\{10, \sqrt{n}\}\varepsilon_w$ , the
computed normwise (resp. componentwise) error bound is at
most $2 \operatorname{max}\{10, \sqrt{n}\} \cdot
\varepsilon_w$ , and indeed bounds the true error. Here,
$n$ is the matrix dimension and $\varepsilon_w$ is single
precision roundoff error. For worse conditioned problems,
we get similarly small correct error bounds in over 89.4\% of cases.}
}
@Article{ axb-itref-toms,
author = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S.
Li and Sonil Mukherjee and E. Jason Riedy},
ejr-withauthor= {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S.
Li and Sonil Mukherjee},
title = {Error bounds from extra-precise iterative refinement},
journal = {{ACM} Transactions on Mathematical Software},
year = {2006},
volume = {32},
number = {2},
pages = {325--351},
month = jun,
role = {refereed},
tags = {acm; toms; lapack; floating point; linear algebra},
doi = {10.1145/1141885.1141894},
issn = {0098-3500},
mrclass = {65F10},
mrnumber = {2272365},
abstract = {We present the design and testing of an algorithm for
iterative refinement of the solution of linear equations
where the residual is computed with extra precision. This
algorithm was originally proposed in 1948 and analyzed in
the 1960s as a means to compute very accurate solutions to
all but the most ill-conditioned linear systems. However,
two obstacles have until now prevented its adoption in
standard subroutine libraries like LAPACK: (1) There was no
standard way to access the higher precision arithmetic
needed to compute residuals, and (2) it was unclear how to
compute a reliable error bound for the computed solution.
The completion of the new BLAS Technical Forum Standard has
essentially removed the first obstacle. To overcome the
second obstacle, we show how the application of iterative
refinement can be used to compute an error bound in any
norm at small cost and use this to compute both an error
bound in the usual infinity norm, and a componentwise
relative error bound.},
gt-role = {Assisted in software development, experimental methods,
writing, and editing.},
file = {material/gesvxx.pdf}
}
@Misc{ bascd2002-poster,
author = {E. Jason Riedy},
title = {Parallel Bipartite Matching for Sparse Matrix
Computation},
howpublished = {Third Bay Area Scientific Computing Day},
month = mar,
year = {2002},
role = {poster},
address = {Livermore, CA},
tags = {bascd; sparse matrix; combinatorial optimization; parallel
algorithms}
}
@Misc{ bascd2006-poster,
author = {E. Jason Riedy},
title = {Making Static Pivoting Dependable},
howpublished = {Seventh Bay Area Scientific Computing Day},
month = mar,
year = {2006},
role = {poster},
address = {Livermore, CA},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2006-poster.pdf},
file = {material/bascd2006-poster.pdf},
tags = {bascd; sparse matrix; linear algebra},
abstract = {For sparse LU factorization, dynamic pivoting tightly
couples symbolic and numerical computation. Dynamic
structural changes limit parallel scalability. Demmel and
Li use static pivoting in distributed SuperLU for
performance, but intentionally perturbing the input may
lead silently to erroneous results. Are there
experimentally stable static pivoting heuristics that lead
to a dependable direct solver? The answer is currently a
qualified yes. Current heuristics fail on a few systems,
but all failures are detectable.}
}
@Misc{ bascd2007-poster,
author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E.
Jason Riedy and Meghana Vishvanath and David Vu},
ejr-withauthor= {James W. Demmel and Yozo Hida and Xiaoye S. Li and Meghana
Vishvanath and David Vu},
title = {Precise Solutions for Overdetermined Least Squares
Problems},
howpublished = {Stanford 50 -- Eighth Bay Area Scientific Computing Day},
month = mar,
year = {2007},
role = {poster},
address = {Stanford, CA},
tags = {bascd; least squares},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2007-poster.pdf},
abstract = {Linear least squares (LLS) fitting is the most widely used
data modeling technique and is included in almost every
data analysis system (e.g. spreadsheets). These software
systems often give no feedback on the conditioning of the
LLS problem or the floating-point calculation errors
present in the solution. With limited use of extra
precision, we can eliminate these concerns for all but the
most ill-conditioned LLS problems. Our algorithm provides
either a solution and residual with relatively tiny error
or a notice that the LLS problem is too ill-conditioned.}
}
@InProceedings{ bc-hpec14,
author = {Adam McLaughlin and Jason Riedy and David A. Bader},
ejr-withauthor= {Adam McLaughlin and David A. Bader},
title = {Optimizing Energy Consumption and Parallel Performance for
Betweenness Centrality using {GPU}s},
tags = {parallel; graph; energy},
booktitle = {The IEEE High Performance Extreme Computing Conference
(HPEC)},
year = 2014,
month = sep,
address = {Waltham, MA},
note = {``Rising Stars'' section},
dom = 11,
role = {proceedings},
doi = {10.1109/HPEC.2014.7040980},
file = {material/Optimizing_BC_HPEC14.pdf},
abstract = {Applications of high-performance graph analysis range from
computational biology to network security and even
transportation. These applications often consider graphs
under rapid change and are moving beyond HPC platforms into
energy-constrained embedded systems. This paper optimizes
one successful and demanding analysis kernel, betweenness
centrality, for NVIDIA GPU accelerators in both
environments. Our algorithm for static analysis is capable
of exceeding 2 million traversed edges per second per watt
(MTEPS/W). Optimizing the parallel algorithm and treating
the dynamic problem directly achieves a 6.39$\times$
average speed-up and 84\% average reduction in energy
consumption.}
}
@Misc{ blas-ng-feb-2017,
author = {James Demmel and Greg Henry and Xiaoye Li and Jason Riedy
and Peter Tang},
title = {A Proposal for a Next-Generation BLAS},
howpublished = {Workshop on Batched, Reproducible, and Reduced Precision
BLAS},
month = feb,
year = 2017,
dom = 24,
address = {Atlanta, Georgia},
url = {http://www.netlib.org/utk/people/JackDongarra/WEB-PAGES/Batched-BLAS-2017/talk05-demmel.pdf}
}
@InProceedings{ caa16,
author = {David Bader and Aleksandra Michalewicz and Oded Green and
Jessie Birkett-Rees and Jason Riedy and James Fairbanks and
Anita Zakrzewska},
title = {Semantic database applications at the Samtavro Cemetery,
Georgia},
booktitle = {The 44th Computer Applications and Quantitative Methods in
Archaeology Conference ({CAA})},
year = 2016,
month = mar,
dom = 30,
address = {Oslo, Norway},
abstract = {In 2013 a paper was offered to the CAA concerning
archaeological legacy data and semantic database
applications, with some preliminary results for a study
conducted into the Samtavro cemetery, situated in the South
Caucasus in the modern republic of Georgia. The present
paper presents further research outcomes of data mining the
Samtavro material. Over four thousand graves were excavated
at this site, used most intensively during the Late Bronze
and Iron Ages, and later in the Roman and Late Antique
periods. The current project focuses on the latter
period—and the legacy of Soviet and post-Soviet
excavations—in a collaborative effort between computer
scientists based at the Georgia Institute of Technology,
USA, and archaeologists at the University of Melbourne and
Monash University, Australia.
Data for 1075 tombs, 1249 individuals, and 5842 grave
accoutrements were collected across 74 data fields,
resulting in the identification of 9 tomb types, 37
artefact types and 320 artefact subtypes. Methods tested
against the Samtavro material culture included the
application of clustering techniques to understand
associations of related items based on patterns of
co-occurrence, using traditional data mining (hierarchical
link clustering) and spectral graph theory—focusing on
tomb types in relation to artefact types. The other method
calculated the probability of each event occurring and
comparing this to what we would expect if these were truly
random—focusing on artefact types in relation to
biological sex and age brackets.
In some instances, our work confirmed previously
established relationships, but it likewise revealed new
results concerning particular entities. The project
demonstrates that although sites for which comprehensive
archival records exist can benefit from these types of
approaches, often the greatest limitation in taking a
‘big data’ approach is the relative scarcity of
archaeological data.}
}
@Misc{ cerfacs08,
file = {material/cerfacs08.pdf},
author = {E. Jason Riedy},
title = {Auctions for Distributed (and Possibly Parallel)
Matchings},
howpublished = {Visit to \href{http://www.cerfacs.fr/}{CERFACS} courtesy
of the Franco-Berkeley Fund},
dom = {17},
month = dec,
year = {2008},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/cerfacs08.pdf},
tags = {cerfacs; combinatorial optimization; sparse matrix},
role = {presentation}
}
@InProceedings{ cmg.2014,
file = {material/cmg-2014-11-04.pdf},
author = {Jason Riedy and David A. Bader},
title = {Graph Analysis Trends and Opportunities},
booktitle = {CMG Performance and Capacity},
year = 2014,
month = nov,
dom = 4,
address = {Atlanta, GA},
note = {Invited presentation},
tags = {parallel; graph; streaming},
url = {http://www.slideshare.net/jasonriedy/cmg-20141104},
abstract = {High-performance graph analysis is unlocking knowledge in
problems like anomaly detection in computer security,
community structure in social networks, and many other data
integration areas. While graphs provide a convenient
abstraction, real-world problems' sparsity and lack of
locality challenge current systems. This talk will cover
current trends ranging from massive scales to low-power,
low-latency systems and summarize opportunities and
directions for graphs and computing systems.}
}
@Misc{ comb-sparse-cse05,
author = {E. Jason Riedy},
title = {Parallel Combinatorial Computing and Sparse Matrices},
howpublished = {SIAM Conference on Computational Science and Engineering},
dom = {14},
month = feb,
year = {2005},
role = {presentation},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/cse05.pdf},
file = {material/cse05.pdf},
tags = {combinatorial optimization; sparse matrix; parallel
algorithms; siam},
abstract = {Increasingly, sparse matrix applications produce matrices
too large for a single computer's memory. Distributed,
parallel computers provide an avenue around memory
limitations, but distributing combinatorial algorithms is
historically difficult. We use insights from combinatorial
optimization to design loosely coupled algorithms for
sparse matrix matching, ordering, and symbolic
factorization. These algorithms' performance depends on
both problem instance and computer architecture. We
investigate these aspects of performance and demonstrate
issues that affect distributed combinatorial computing.}
}
@Misc{ cse17-streaming-ms,
author = {E. Jason Riedy},
title = {High-Performance Analysis of Streaming Graphs},
howpublished = {SIAM Conference on Computational Science and Engineering},
dom = 2,
month = mar,
year = 2017,
role = {presentation},
url = {https://www.slideshare.net/jasonriedy/highperformance-analysis-of-streaming-graphs},
tags = {siam; streaming data; parallel algorithms},
address = {Atlanta, GA},
abstract = { Graph-structured data in social networks, finance,
network security, and others not only are massive but also
under continual change. These changes often are scattered
across the graph. Stopping the world to run a single,
static query is infeasible. Repeating complex global
analyses on massive snapshots to capture only what has
changed is inefficient. We discuss requirements for
single-shot queries on changing graphs as well as recent
high-performance algorithms that update rather than
recompute results. These algorithms are incorporated into
our software framework for streaming graph analysis, STING
(Spatio-Temporal Interaction Networks and Graphs).}
}
@InProceedings{ dblp:conf/dimacs/riedymeb12,
file = {material/dimacs10-proceedings-community-detection.pdf},
title = {Parallel community detection for massive graphs},
author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and
David A. Bader},
booktitle = {Graph Partitioning and Graph Clustering},
year = {2012},
editor = {David A. Bader and Henning Meyerhenke and Peter Sanders
and Dorothea Wagner},
pages = {207--222},
publisher = {American Mathematical Society},
series = {Contemporary Mathematics},
volume = {588},
abstract = {Tackling the current volume of graph-structured data
requires parallel tools. We extend our work on analyzing
such massive graph data with a massively parallel algorithm
for community detection that scales to current data sizes,
clustering a real-world graph of over 100 million vertices
and over 3 billion edges in under 500 seconds on a
four-processor Intel E7-8870-based server. Our algorithm
achieves moderate parallel scalability without sacrificing
sequential operational complexity. Community detection
partitions a graph into subgraphs more densely connected
within the subgraph than to the rest of the graph. We take
an agglomerative approach similar to Clauset, Newman, and
Moore’s sequential algorithm, merging pairs of connected
intermediate subgraphs to optimize different graph
properties. Working in parallel opens new approaches to
high performance. We improve performance of our parallel
community detection algorithm on both the Cray XMT2 and
OpenMP platforms and adapt our algorithm to the DIMACS
Implementation Challenge data set.},
bibsource = {DBLP, http://dblp.uni-trier.de},
doi = {10.1090/conm/588/11703},
ee = {http://www.ams.org/books/conm/588/11703},
optisbn = {978-0-8218-9038-7, 978-0-8218-9869-7},
isbn = {978-0-8218-9038-7},
opttitle = {Graph Partitioning and Graph Clustering - 10th DIMACS
Implementation Challenge Workshop, Georgia Institute of
Technology, Atlanta, GA, USA, February 13-14, 2012.
Proceedings},
url = {http://www.ams.org/books/conm/588/11703/conm588-11703.pdf}
}
@InCollection{ dimacs10-workshop,
author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and
David A. Bader},
ejr-withauthor= {Henning Meyerhenke and David Ediger and David A. Bader},
title = {Parallel Community Detection for Massive Graphs},
booktitle = {10th DIMACS Implementation Challenge Workshop - Graph
Partitioning and Graph Clustering},
tags = {parallel; graph; community detection},
publisher = {(workshop paper)},
year = 2012,
month = feb,
dom = 14,
address = {Atlanta, Georgia},
note = {Won first place in the Mix Challenge and Mix Pareto
Challenge},
url = {http://www.cc.gatech.edu/dimacs10/papers/\&\#91;15\&\#93;-dimacs10-community-detection.pdf},
file = {material/dimacs10-community-detection.pdf},
abstract = {Tackling the current volume of graph-structured data
requires parallel tools. We extend our work on analyzing
such massive graph data with a massively parallel algorithm
for community detection that scales to current data sizes,
clustering a real-world graph of over 100 million vertices
and over 3 billion edges in under 500 seconds on a four-
processor Intel E7-8870-based server. Our algorithm
achieves moderate parallel scalability without sacrificing
sequential operational complexity. Community detection
partitions a graph into subgraphs more densely connected
within the subgraph than to the rest of the graph. We take
an agglomerative approach similar to Clauset, Newman, and
Moore’s sequential algorithm, merging pairs of connected
intermediate subgraphs to optimize different graph
properties. Working in parallel opens new approaches to
high performance. We improve performance of our parallel
community detection algorithm on both the Cray XMT2 and
OpenMP platforms and adapt our algorithm to the DIMACS
Implementation Challenge data set.}
}
@Misc{ dmml-2015,
file = {material/dmml-2015-ejr.pdf},
author = {E. Jason Riedy},
title = {Graph Analysis Beyond Linear Algebra},
howpublished = {Development of Modern Methods for Linear Algebra},
month = oct,
dom = 24,
year = {2015},
abstract = {High-performance graph analysis is unlocking knowledge in
computer security, bioinformatics, social networks, and
many other data integration areas. Graphs provide a
convenient abstraction for many data problems beyond linear
algebra. Some problems map directly to linear algebra.
Others, like community detection, look eerily similar to
sparse linear algebra techniques. And then there are
algorithms that strongly resist attempts at making them
look like linear algebra. This talk will cover recent
results with an emphasis on streaming graph problems where
the graph changes and results need updated with minimal
latency. We’ll also touch on issues of sensitivity and
reliability where graph analysis needs to learn from
numerical analysis and linear algebra.},
role = {Invited presentation},
url = {http://www.slideshare.net/jasonriedy/graph-analysis-beyond-linear-algebra}
}
@Unpublished{ fp-type-project,
author = {E. Jason Riedy},
title = {Type System Support for Floating-Point Computation},
month = may,
dom = 25,
file = {material/type-support-for-fp.pdf},
abstract = {Floating-point arithmetic is often seen as untrustworthy.
We show how manipulating precisions according to the
following rules of thumb enhances the reliability of and
removes surprises from calculations: Store data narrowly,
compute intermediates widely, and derive properties widely.
Further, we describe a typing system for floating point
that both supports and is supported by these rules. A
single type is established for all in- termediate
computations. The type describes a precision at least as
wide as all inputs to and results from the computation.
Picking a single type provides benefits to users,
compilers, and interpreters. The type system also extends
cleanly to encompass intervals and higher precisions.},
year = {2001},
role = {unpublished},
tags = {programming language; floating point; ieee754}
}
@InProceedings{ gabb16-pr,
author = {Jason Riedy},
title = {Updating PageRank for Streaming Graphs},
booktitle = {{Graph} {Algorithms} {Building} {Blocks} ({GABB} 2016)},
year = 2016,
dom = 23,
month = may,
address = {Chicago, IL},
note = {(Workshop with IPDPS 2016)},
tags = {parallel; graph; streaming; pagerank},
abstract = {Incremental graph algorithms can respond quickly to small
changes in massive graphs by updating rather than
recomputing analysis metrics. Here we use the linear system
formulation of PageRank and ideas from iterative refinement
to compute the update to a PageRank vector accurately and
quickly. The core idea is to express the residual of the
original solution with respect to the updated matrix
representing the graph. The update to the residual is
sparse. Solving for the solution update with a
straight-forward iterative method spreads the change
outward from the change locations but converges before
traversing the entire graph. We achieve speed-ups of
2$\times$ to over 40$\times$ relative to a restarted,
highly parallel PageRank iteration for small, low-latency
batches of edge insertions. These cases traverse 2$\times$
to nearly 10\,000$\times$ fewer edges than the restarted
PageRank iteration. This provides an interesting test case
for the ongoing GraphBLAS effort: Can the APIs support our
incremental algorithms cleanly and efficiently?},
file = {material/streaming-pagerank-gabb2016.pdf}
}
@Unpublished{ graph500-1.1,
author = {David A. Bader and Jonathan Berry and Simon Kahan and
Richard Murphy and E. Jason Riedy and Jeremiah Willcock},
ejr-withauthor= {David A. Bader and Jonathan Berry and Simon Kahan and
Richard Murphy and Jeremiah Willcock},
title = {Graph 500 Benchmark 1 ("Search")},
note = {Version 1.1},
url = {http://www.graph500.org/Specifications.html},
month = oct,
year = 2010
}
@Article{ graphct-tpds-2012,
author = {David Ediger and Karl Jiang and Jason Riedy and David A.
Bader},
ejr-withauthor= {David Ediger and Karl Jiang and David A. Bader},
title = {{GraphCT}: Multithreaded Algorithms for Massive Graph
Analysis},
journal = {{IEEE} Transactions in Parallel and Distributed Systems},
year = 2013,
pages = {2220 -- 2229},
month = sep,
doi = {10.1109/TPDS.2012.323},
issn = {1045-9219},
opturl = {http://dx.doi.org/10.1109/TPDS.2012.323},
role = {refereed},
gt-role = {Assisted in software development, experimental methods,
writing, and editing.},
abstract = {The digital world has given rise to massive quantities of
data that include rich semantic and complex networks. A
social graph, for example, containing hundreds of millions
of actors and tens of billions of relationships is not
uncommon. Analyzing these large data sets, even to answer
simple analytic queries, often pushes the limits of
algorithms and machine architectures. We present GraphCT, a
scalable framework for graph analysis using parallel and
multithreaded algorithms on shared memory platforms.
Utilizing the unique characteristics of the Cray XMT,
GraphCT enables fast network analysis at unprecedented
scales on a variety of input data sets. On a synthetic
power law graph with 2 billion vertices and 17 billion
edges, we can find the connected components in 2 minutes.
We can estimate the betweenness centrality of a similar
graph with 537 million vertices and over 8 billion edges in
under 1 hour. GraphCT is built for portability and
performance.},
file = {material/GraphCT-IEEE.pdf}
}
@InCollection{ graphct-wiley-chap,
file = {material/STINGER-with-apps.pdf},
title = {Computational Graph Analytics for Massive Streaming Data},
author = {David Ediger and Jason Riedy and David A. Bader and
Henning Meyerhenke},
booktitle = {Large Scale Network-Centric Computing Systems},
publisher = {Wiley},
year = {2013},
chapter = {25},
editor = {Hamid Sarbazi-azad and Albert Zomaya},
month = jul,
series = {Parallel and Distributed Computing},
abstract = {Handling the constant stream of data from health care,
security, business, and social network applications
requires new algorithms and data structures. We present a
new approach for parallel massive analysis of streaming,
temporal, graph-structured data. For this purpose we
examine data structure and algorithm trade-offs that
extract the parallelism necessary for high-performance
updating analysis of massive graphs. As a result of this
study, we propose the extensible and flexible data
structure for massive graphs called STINGER
({S}patio-{T}emporal {I}nteraction {N}etworks and {G}raphs
{E}xtensible {R}epresentation). Two case studies
demonstrate our new approach's effectiveness. The first one
computes a dynamic graph's vertices' \emph{clustering
coefficients}. We show that incremental updates are far
more efficient than global recomputation. Within this
kernel, we compare three methods for dynamically updating
local clustering coefficients: a brute-force local
recalculation, a sorting algorithm, and our new
approximation method using a Bloom filter. On 32 processors
of a \xmt{} with a synthetic scale-free graph of $2^{24}
\approx 16$ million vertices and $2^{29} \approx 537$
million edges, the brute-force method processes a mean of
over 50\,000 updates per second, while our Bloom filter
approaches 200\,000 updates per second. The second case
study monitors a global feature, a dynamic graph's
connected components. We use similar algorithmic ideas as
before to exploit the parallelism in the problem and
provided by the hardware architecture. On a 16 million
vertex graph, we obtain rates of up to 240\,000 updates per
second on 32 processors of a \xmt{}. For the large
scale-free graphs typical in our applications, our
implementation uses novel batching techniques that exploit
the scale-free nature of the data and run over three times
faster than prior methods. Our new framework is the first
to handle real-world data rates, opening the door to
higher-level analytics such as community and anomaly detection.},
dom = {30},
ejr-withauthor= {David Ediger and David A. Bader and Henning Meyerhenke},
isbn = {978-0470936887},
role = {chapter},
tags = {parallel; graph; streaming},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6674282&tag=1}
}
@Misc{ graphex11,
author = {Jason Riedy and David Ediger and David A. Bader and
Henning Meyerhenke},
ejr-withauthor= {David Ediger and David A. Bader and Henning Meyerhenke},
title = {Tracking Structure of Streaming Social Networks},
dom = 9,
month = aug,
year = 2011,
optrole = {presentation},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/GraphEx-2011.pdf},
file = {material/GraphEx-2011.pdf},
tags = {graph; streaming},
howpublished = {2011 Graph Exploitation Symposium hosted by MIT Lincoln
Labs}
}
@Misc{ graphlab-2013,
file = {material/graphlab13-poster.pdf},
author = {Jason Riedy},
title = {{STINGER}: Analyzing massive, streaming graphs},
howpublished = {2nd GraphLab Workshop},
month = jul,
dom = 1,
year = 2013,
address = {San Francisco, CA},
role = {invited poster and demo},
tags = {graph analysis; parallel algorithms}
}
@Misc{ graphlab-2014,
file = {material/graphlab14-poster.pdf},
author = {Jason Riedy},
title = {{STINGER}: Analyzing massive, streaming graphs},
howpublished = {3rd GraphLab Workshop},
month = jul,
dom = 21,
year = 2014,
address = {San Francisco, CA},
role = {invited poster and demo},
tags = {graph analysis; parallel algorithms}
}
@Misc{ gt09,
file = {material/gt-2009-08-21.pdf},
author = {E. Jason Riedy},
title = {Dependable direct solutions for linear systems using a
little extra precision},
howpublished = {CSE Seminar at Georgia Institute of Technology},
dom = {21},
month = aug,
year = 2009,
url = {http://hdl.handle.net/1853/29795},
tags = {linear algebra; floating point; lapack},
role = {presentation},
abstract = {Solving a square linear system $Ax=b$ often is considered
a black box. It's supposed to "just work," and failures
often are blamed on the original data or subtleties of
floating-point. Now that we have an abundance of cheap
computations, however, we can do much better. A little
extra precision in just the right places produces accurate
solutions cheaply or demonstrates when problems are too
hard to solve without significant cost. This talk will
outline the method, iterative refinement with a new twist;
the benefits, small backward and forward errors; and the
trade-offs and unexpected benefits.}
}
@Article{ holder:2016:cfc:2980765.2980770,
author = {Holder, Lawrence B. and Caceres, Rajmonda and Gleich,
David F. and Riedy, Jason and Khan, Maleq and Chawla,
Nitesh V. and Kumar, Ravi and Wu, Yinghui and Klymko,
Christine and Eliassi-Rad, Tina and Prakash, Aditya},
title = {Current and Future Challenges in Mining Large Networks:
Report on the Second SDM Workshop on Mining Networks and
Graphs},
journal = {SIGKDD Explorations Newsletter},
issue_date = {June 2016},
volume = {18},
number = {1},
month = aug,
year = {2016},
issn = {1931-0145},
pages = {39--45},
numpages = {7},
doi = {10.1145/2980765.2980770},
acmid = {2980770},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Network mining, big data, challenges, graph mining}
}
@Misc{ hpcs-panel-2012,
optkey = {},
author = {Lauren L. Smith and Dolores A. Shaffer},
title = {{DARPA}'s {H}igh {P}roductivity {C}omputing {S}ystems
Program: A Final Report},
howpublished = {Supercomputing Birds-of-a-Feather session},
dom = 14,
month = nov,
year = 2012,
note = {Invited panel speaker},
abstract = {The DARPA High Productivity Computing Systems (HPCS)
program has been focused on providing a new generation of
economically viable high productivity computing systems for
national security, scientific, industrial and commercial
applications. This program was unique because it focused on
system productivity that was defined to include enhancing
performance, programmability, portability, usability,
manageability and robustness of systems as opposed to just
being focused on one execution time performance metric. The
BOF is for anyone interested in learning about the two HPCS
systems and how productivity in High Performance Computing
has been enhanced.},
optannote = {}
}
@InProceedings{ hpec15,
author = {Adam McLaughlin and Jason Riedy and David A. Bader},
title = {An Energy-Efficient Abstraction for Simultaneous
Breadth-First Searches},
ejr-withauthor= {Adam McLaughlin and David A. Bader},
tags = {parallel; graph; energy},
booktitle = {The IEEE High Performance Extreme Computing Conference
(HPEC)},
year = 2015,
month = sep,
address = {Waltham, MA},
dom = 17,
role = {proceedings},
abstract = {Optimized GPU kernels are sufficiently complicated to
write that they often are specialized to specific input
data, target architectures, or applications. This paper
presents a multi-search abstraction for computing multiple
breadth-first searches in parallel and demonstrates a
high-performance, general implementation. Our abstraction
removes the burden of orchestrating graph traversal from
the user while providing high performance and low energy
usage, an often overlooked component of algorithm design.
Energy consumption has become a first-class hardware design
constraint for both massive and embedded computing
platforms. Our abstraction can be applied to such problems
as the all-pairs shortest-path problem, community
detection, reachability querying, and others. To map graph
traversal efficiently to NVIDIA GPUs, our hybrid
implementation chooses between processing active vertices
with a single thread or an entire warp based on vertex
outdegree. For a set of twelve varied graphs, the
implementation of our abstraction saves 42\% time and 62\%
energy on average compared to representative
implementations of specific applications from existing
literature.},
file = {material/multi_search_energy.pdf}
}
@InProceedings{ ia-cost,
author = {Joseph N. Wilson and E. Jason Riedy},
ejr-withauthor= {Joseph N. Wilson},
title = {Efficient {SIMD} evaluation of image processing programs},
booktitle = {Parallel and Distributed Methods for Image Processing},
pages = {199--210},
year = {1997},
month = jul,
dom = 28,
editor = {Hongchi Shi and Patrick C. Coffield},
volume = {3166},
address = {San Diego, CA},
organization = {SPIE},
role = {proceedings},
tags = {spie; image algebra; parallel algorithms},
file = {material/ia-cost.pdf},
doi = {10.1117/12.279618},
abstract = {SIMD parallel systems have been employed for image
processing and computer vision applications since their
inception. This paper describes a system in which parallel
programs are implemented using a machine-independent,
retargetable object library that provides SIMD execution on
the Lockheed Martin PAL-I SIMD parallel processor.
Programs' performance on this machine is improved through
on-the-fly execution analysis and scheduling. We describe
the relevant elements of the system structure, the general
scheme for execution analysis, and the current cost model
for scheduling.}
}
@InCollection{ ia-simd-chapter,
file = {material/ia-simd-chap.pdf},
title = {An {Image} {Algebra} Based {SIMD} Image Processing
Environment},
author = {Joseph N. Wilson and E. Jason Riedy and Gerhard X. Ritter
and Hongchi Shi},
booktitle = {Visual Information Representation, Communication, and
Image Processing},
publisher = {Marcel Dekker},
year = {1999},
address = {New York},
editor = {C. W. Chen and Y. Q. Zhang},
pages = {523--542},
abstract = {SIMD parallel computers have been employed for image
related applications since their inception. They have been
leading the way in improving processing speed for those
applications. However, current parallel programming
technologies have not kept pace with the performance growth
and cost decline of parallel hardware. A highly usable
parallel software development environment is needed. This
chapter presents a computing environment that integrates a
SIMD mesh architecture with image algebra for
high-performance image processing applications. The
environment describes parallel programs through a
machine-independent, retargetable image algebra object
library that supports SIMD execution on the Lockheed Martin
PAL-I parallel computer. Program performance on this
machine is improved through on-the-fly execution analysis
and scheduling. We describe the relevant elements of the
system structure, outline the scheme for execution
analysis, and provide examples of the current cost model
and scheduling system.},
optciteseer = {wilson97image.html},
ejr-withauthor= {Joseph N. Wilson and Gerhard X. Ritter and Hongchi Shi},
isbn = {082471928X},
role = {chapter},
tags = {image algebra; parallel algorithms}
}
@InCollection{ icassp2012-stinger,
author = {Jason Riedy and Henning Meyerhenke and David A. Bader and
David Ediger and Timothy G. Mattson},
ejr-withauthor= {Henning Meyerhenke and David Ediger and David A. Bader and
Timothy G. Mattson},
booktitle = {{IEEE} International Conference on Acoustics, Speech and
Signal Processing ({ICASSP})},
title = {Analysis of Streaming Social Networks and Graphs on
Multicore Architectures},
year = {2012},
month = mar,
dom = 29,
address = {Kyoto, Japan},
file = {material/icassp2012.pdf},
slide-url = {http://ur1.ca/i6dz6},
doi = {10.1109/ICASSP.2012.6289126},
opturl = {http://www.slideshare.net/jasonriedy/icassp-2012-analysis-of-streaming-social-networks-and-graphs-on-multicore-architectures},
abstract = {Analyzing static snapshots of massive, graph-structured
data cannot keep pace with the growth of social networks,
financial transactions, and other valuable data sources. We
introduce a framework, STING (Spatio-Temporal Interaction
Networks and Graphs), and evaluate its performance on
multicore, multisocket Intel(R)-based platforms. STING
achieves rates of around 100\,000 edge updates per second
on large, dynamic graphs with a single, general data
structure. We achieve speed-ups of up to 1000$\times$ over
parallel static computation, improve monitoring a dynamic
graph's connected components, and show an exact algorithm
for maintaining local clustering coefficients performs
better on Intel-based platforms than our earlier
approximate algorithm.}
}
@InProceedings{ icpp10,
author = {David Ediger and Karl Jiang and E. Jason Riedy and David
A. Bader and Courtney Corley and Rob Farber and William N.
Reynolds},
ejr-withauthor= {David Ediger and Karl Jiang and David A. Bader and
Courtney Corley and Rob Farber and William N. Reynolds},
title = {Massive Social Network Analysis: Mining Twitter for Social
Good},
booktitle = {39th International Conference on Parallel Processing
({ICPP})},
role = {proceedings},
tags = {parallel; graph},
year = {2010},
address = {San Diego, CA},
month = sep,
dom = 16,
acc-note = {(70/225 papers accepted: 31.1\% acceptance rate)},
doi = {10.1109/ICPP.2010.66},
file = {material/ICPP10-GraphCT.pdf},
abstract = {Social networks produce an enormous quantity of data.
Facebook consists of over 400 million active users sharing
over 5 \emph{billion} pieces of information each month.
Analyzing this vast quantity of unstructured data presents
challenges for software and hardware. We present GraphCT, a
\emph{Graph} \emph{C}haracterization \emph{T}ooklit for
massive graphs representing social network data. On a
128-processor Cray XMT, GraphCT estimates the betweenness
centrality of an artificially generated (R-MAT) 537 million
vertex, 8.6 billion edge graph in 55 minutes. We use
GraphCT to analyze public data from Twitter, a
microblogging network. Twitter's message connections appear
primarily tree-structured as a news dissemination system.
Within the public data, however, are clusters of
conversations. Using GraphCT, we can rank actors within
these conversations and help analysts focus attention on a
much smaller data subset.}
}
@TechReport{ ieee754-2008,
note = {(committee member and contributor)},
key = {IEEE Std 754-2008},
journal = {IEEE Std 754-2008},
type = {IEEE Std},
number = {754-2008},
title = {{IEEE} Standard for Floating-Point Arithmetic},
year = {2008},
pages = {1 -- 70},
institution = {Microprocessor Standards Committee of the IEEE Computer
Society},
abstract = {This standard specifies interchange and arithmetic formats
and methods for binary and decimal floating-point
arithmetic in computer programming environments. This
standard specifies exception conditions and their default
handling. An implementation of a floating-point system
conforming to this standard may be realized entirely in
software, entirely in hardware, or in any combination of
software and hardware. For operations specified in the
normative part of this standard, numerical results and
exceptions are uniquely determined by the values of the
input data, sequence of operations, and destination
formats, all under user control.},
keywords = {IEEE standards;floating point arithmetic;programming;IEEE
standard;arithmetic formats;computer programming;decimal
floating-point
arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal},
doi = {10.1109/IEEESTD.2008.4610935},
isbn = {978-0-7381-5753-5},
address = {New York, NY},
month = aug,
dom = 29,
committee = {Alex Aiken and Matthew Applegate and David Bailey and
Steve Bass and Dileep Bhandarkar and Mahesh Bhat and David
Bindel and Sylvie Boldo and Stephen Canon and Steven R.
Carlough and Marius Cornea and Mike Cowlishaw (editor) and
John H. Crawford and Joseph D. Darcy and Marc Daumas and
Bob Davis and Mark Davis and Dick Delp and Jim Demmel and
Mark A. Erle and Hossam A. H. Fahmy and J.P. Fasano and
Richard Fateman and Eric Feng and Warren E. Ferguson and
Alex Fit-Florea and Laurent Fournier and Chip Freitag and
Ivan Godard and Roger A. Golliver and David Gustafson and
Michel Hack and John R. Harrison and John Hauser and Yozo
Hida and Chris N. Hinds and Graydon Hoare and David G.
Hough and Jerry Huck and Jim Hull and Michael Ingrassia and
David V. James and Rick James and William Kahan and John
Kapernick and Richard Karpinski and Jeff Kidder and Plamen
Koev and Ren-Cang Li and Zhishun Alex Liu and Raymond Mak
and Peter Markstein and David Matula and Guillaume
Melquiond and Nobuyoshi Mori and Ricardo Morin and Ned
Nedialkov and Craig Nelson and Stuart Oberman and Jon Okada
and Ian Ollmann and Michael Parks and Tom Pittman and Eric
Postpischil and Jason Riedy and Debjit Das Sarma and Eric
M. Schwarz and David Scott and Don Senzig and Ilya Sharapov
and Jim Shearer and Michael Siu and Ron Smith and Chuck
Stevens and Peter Tang and Pamela J. Taylor and James W.
Thomas and Brandon Thompson and Wendy Thrash and Neil Toda
and Son Dao Trong and Leonard Tsai and Charles Tsen and
Fred Tydeman and Liang Kai Wang and Scott Westbrook and
Steve Winkler and Anthony Wood and Umit Yalcinalp and Fred
Zemke and Paul Zimmermann and Dan Zuras (chair)}
}
@Misc{ ieee754-exceptions,
author = {David Bindel and E. Jason Riedy},
ejr-withauthor= {David Bindel},
title = {Exception Handling Interfaces, Implementations, and
Evaluation},
howpublished = {IEEE-754r revision meeting},
month = aug,
year = {2002},
url = {http://grouper.ieee.org/groups/754/meeting-materials/2002-08-22-pres.pdf},
role = {presentation},
tags = {ieee754; floating point},
file = {material/ieee754-2002-08-22-pres.pdf}
}
@Misc{ intel.graph.2011,
file = {material/intel-2011-08-09.pdf},
author = {Jason Riedy and David A. Bader and Henning Meyerhenke and
David Ediger and Timothy Mattson},
ejr-withauthor= {David A. Bader and Henning Meyerhenke and David Ediger and
Timothy Mattson},
title = {{STING}: Spatio-Temporal Interaction Networks and Graphs
for {Intel} Platforms},
howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
dom = 9,
month = aug,
year = 2011,
role = {presentation},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/GT-STING-for-Intel-beamer.pdf}
}
@Misc{ intel.graph.2012,
file = {material/intel-2012-07-12.pdf},
author = {Jason Riedy and David A. Bader and David Ediger and Rob
McColl and Timothy G. Mattson},
ejr-withauthor= {David A. Bader and David Ediger and Rob McColl and Timothy
G. Mattson},
title = {{STING}: Spatio-Temporal Interaction Networks and Graphs
for {Intel} Platforms},
howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
dom = 24,
month = jul,
year = 2012,
role = {presentation},
url = {http://www.slideshare.net/jasonriedy/gt-stingintelslides}
}
@Misc{ intel.graph.2014,
file = {material/intel-2014-01-17.pdf},
author = {Jason Riedy and David A. Bader and David Ediger and Rob
McColl and Timothy G. Mattson},
ejr-withauthor= {David A. Bader and David Ediger and Rob McColl and Timothy
G. Mattson},
title = {{STING}: Spatio-Temporal Interaction Networks and Graphs
for {Intel} Platforms},
howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
dom = 17,
month = jan,
year = 2014,
role = {presentation},
url = {http://www.slideshare.net/jasonriedy/intel-20140117}
}
@Misc{ lang-tools-ieee754,
author = {E. Jason Riedy},
title = {Modern Language Tools and {754R}},
howpublished = {{ARITH}'05},
month = jun,
dom = 28,
year = {2005},
optrole = {presentation},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf},
file = {material/arith17-slides.pdf},
tags = {ieee754; floating point},
note = {Invited presentation and panelist}
}
@Misc{ lapack-future,
author = {E. Jason Riedy and Yozo Hida and James W. Demmel},
ejr-withauthor= {Yozo Hida and James W. Demmel},
title = {The Future of {LAPACK} and {ScaLAPACK}},
howpublished = {Robert C. Thompson Matrix Meeting},
dom = {18},
month = nov,
year = {2005},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/future-of-scalapack.pdf},
file = {material/future-of-scalapack.pdf},
role = {presentation},
tags = {lapack; software engineering},
abstract = {We are planning new releases of the widely used LAPACK and
ScaLAPACK numerical linear algebra libraries. Based on an
on-going user survey (http://www.netlib.org/lapack-dev) and
research by many people, we are proposing the following
improvements: Faster algorithms (including better numerical
methods, memory hierarchy optimizations, parallelism, and
automatic performance tuning to accomodate new
architectures), more accurate algorithms (including better
numerical methods, and use of extra precision), expanded
functionality (including updating and downdating, new
eigenproblems, etc. and putting more of LAPACK into
ScaLAPACK), and improved ease of use (friendlier interfaces
in multiple languages). To accomplish these goals we are
also relying on better software engineering techniques and
contributions from collaborators at many institutions. This
is joint work with Jack Dongarra.}
}
@InProceedings{ lapack-prospectus,
author = {James W. Demmel and Jack Dongarra and Beresford Parlett
and W. Kahan and Ming Gu and David Bindel and Yozo Hida and
Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and
Christof V{\"o}mel and Julien Langou and Piotr Luszczek and
Jakub Kurzak and Alfredo Buttari and Julie Langou and
Stanimire Tomov},
ejr-withauthor= {James W. Demmel and Jack Dongarra and Beresford Parlett
and W. Kahan and Ming Gu and David Bindel and Yozo Hida and
Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and
Julien Langou and Piotr Luszczek and Jakub Kurzak and
Alfredo Buttari and Julie Langou and Stanimire Tomov},
title = {Prospectus for the Next {LAPACK} and {ScaLAPACK}
Libraries},
booktitle = {{PARA'06}: State-of-the-Art in Scientific and Parallel
Computing},
year = {2006},
address = {Ume{\aa}, Sweden},
month = jun,
organization = {High Performance Computing Center North ({HPC2N}) and the
Department of Computing Science, Ume{\aa} University},
publisher = {Springer},
role = {proceedings},
tags = {lapack},
url = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/para06-lapack.pdf},
abstract = {LAPACK and ScaLAPACK are widely used software libraries
for numerical linear algebra. There have been over 68M web
hits at www.netlib.org for the associated libraries LAPACK,
ScaLAPACK, CLAPACK and LAPACK95. LAPACK and ScaLAPACK are
used to solve leading edge science problems and they have
been adopted by many vendors and software providers as the
basis for their own libraries, including AMD, Apple (under
Mac OS X), Cray, Fujitsu, HP, IBM, Intel, NEC, SGI, several
Linux distributions (such as Debian), NAG, IMSL, the
MathWorks (producers of MATLAB), Interactive
Supercomputing, and PGI. Future improvements in these
libraries will therefore have a large impact on users.},
doi = {10.1007/978-3-540-75755-9\_2},
file = {material/lapack-prospectus.pdf}
}
@TechReport{ lapack-prospectus-lawn,
author = {James W. Demmel and Jack Dongarra and Beresford Parlett
and W. Kahan and Ming Gu and David Bindel and Yozo Hida and
Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and
Christof V{\"o}mel and Julien Langou and Piotr Luszczek and
Jakub Kurzak and Alfredo Buttari and Julie Langou and
Stanimire Tomov},
ejr-withauthor= {James W. Demmel and Jack Dongarra and Beresford Parlett
and W. Kahan and Ming Gu and David Bindel and Yozo Hida and
Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and
Julien Langou and Piotr Luszczek and Jakub Kurzak and
Alfredo Buttari and Julie Langou and Stanimire Tomov},
title = {Prospectus for the Next {LAPACK} and {ScaLAPACK}
Libraries},
institution = {Netlib},
year = {2007},
type = {LAPACK Working Note},
number = {181},
lawn = {181},
month = feb,
note = {Also issued as UT-CS-07-592},
role = {techreport},
tags = {lawn; lapack},
url = {http://www.netlib.org/lapack/lawnspdf/lawn181.pdf},
file = {material/lawn181.pdf}
}
@Unpublished{ lapack-style,
author = {Jack Dongarra and Julien Langou and E. Jason Riedy},
ejr-withauthor= {Jack Dongarra and Julien Langou},
title = {Sca/{LAPACK} Program Style},
month = aug,
year = {2006},
role = {unpublished},
tags = {lapack},
url = {http://www.netlib.org/lapack-dev/lapack-coding/program-style.html},
abstract = {The purpose of this document is to facilitate
contributions to LAPACK and ScaLAPACK by documenting their
design and implementation guidelines. The long-term goal is
to provide guidelines for both LAPACK and ScaLAPACK.
However, the parallel ScaLAPACK code has more open issues,
so this document primarily concerns LAPACK.}
}
@TechReport{ lawn188,
author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E.
Jason Riedy},
ejr-withauthor= {James W. Demmel and Yozo Hida and Xiaoye S. Li},
title = {Extra-precise iterative refinement for overdetermined
least squares problems},
type = {LAPACK Working Note},
institution = {Netlib},
year = 2007,
number = 188,
month = may,
dom = 31,
url = {http://www.netlib.org/lapack/lawnspdf/lawn188.pdf},
file = {material/lawn188.pdf},
other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
note = {Also issued as UCB/EECS-2007-77; version accepted for
TOMS.},
abstract = {We present the algorithm, error bounds, and numerical
results for extra-precise iterative refinement applied to
overdetermined linear least squares (LLS) problems. We
apply our linear system refinement algorithm to
Bj{\"o}rck’s augmented linear system formulation of an
LLS problem. Our algorithm reduces the forward normwise and
componentwise errors to $O(\varepsilon)$ unless the system
is too ill conditioned. In contrast to linear systems, we
provide two separate error bounds for the solution $x$ and
the residual $r$. The refinement algorithm requires only
limited use of extra precision and adds only $O(mn)$ work
to the $O(mn^2)$ cost of QR factorization for problems of
size m-by-n. The extra precision calculation is facilitated
by the new extended-precision BLAS standard in a portable
way, and the refinement algorithm will be included in a
future release of LAPACK and can be extended to the other
types of least squares problems.},
role = {techreport},
tags = {lawn; lapack; least squares; floating point}
}
@Article{ lsq-itref-toms,
author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E.
Jason Riedy},
ejr-withauthor= {James W. Demmel and Yozo Hida and Xiaoye S. Li},
title = {Extra-precise iterative refinement for overdetermined
least squares problems},
journal = "{ACM} Transactions on Mathematical Software",
volume = "35",
number = "4",
year = 2009,
month = feb,
issn = {0098-3500},
pages = {1--32},
doi = {10.1145/1462173.1462177},
accepted = "25 June 2008",
role = {refereed},
tags = {acm; toms; lapack; floating point; linear algebra},
abstract = {We present the algorithm, error bounds, and numerical
results for extra-precise iterative refinement applied to
overdetermined linear least squares (LLS) problems. We
apply our linear system refinement algorithm to
Bj{\"o}rck’s augmented linear system formulation of an
LLS problem. Our algorithm reduces the forward normwise and
componentwise errors to $O(\varepsilon)$ unless the system
is too ill conditioned. In contrast to linear systems, we
provide two separate error bounds for the solution $x$ and
the residual $r$. The refinement algorithm requires only
limited use of extra precision and adds only $O(mn)$ work
to the $O(mn^2)$ cost of QR factorization for problems of
size m-by-n. The extra precision calculation is facilitated
by the new extended-precision BLAS standard in a portable
way, and the refinement algorithm will be included in a
future release of LAPACK and can be extended to the other
types of least squares problems.},
gt-role = {Assisted in software development, experimental methods,
writing, and editing.},
file = {material/lsq_itrefx.pdf}
}
@InProceedings{ md11-graph,
author = {David A. Bader and David Ediger and E. Jason Riedy},
ejr-withauthor= {David A. Bader and David Ediger},
title = {Parallel Programming for Graph Analysis},
booktitle = {full day tutorial},
role = {tutorial},
tags = {parallel; graph},
year = 2011,
month = sep,
dom = 28,
address = {Columbia, MD}
}
@Misc{ mng-sdm15,
file = {material/riedy-network-challenge-sdm15.pdf},
author = {Jason Riedy},
title = {Network Challenge: Error and Sensitivity Analysis},
howpublished = {SDM-Networks 2015: The Second SDM Workshop on Mining
Networks and Graphs: A Big Data Analytic Challenge},
dom = 2,
month = may,
year = 2015,
role = {Invited panelist},
address = {Vancouver, BC},
tags = {graph analysis},
url = {http://www.slideshare.net/jasonriedy/network-challenge-error-and-sensitivity-analysis}
}
@InProceedings{ mtaap10,
author = {David Ediger and Karl Jiang and E. Jason Riedy and David
A. Bader},
ejr-withauthor= {David Ediger and Karl Jiang and David A. Bader},
title = {Massive Streaming Data Analytics: A Case Study with
Clustering Coefficients},
booktitle = {4th Workshop on Multithreaded Architectures and
Applications (MTAAP)},
role = {proceedings},
tags = {parallel; graph; streaming},
year = 2010,
address = {Atlanta, GA},
month = apr,
dom = 23,
acc-note = {(11/22 papers accepted, 50\% acceptance rate)},
doi = {10.1109/IPDPSW.2010.5470687},
file = {material/StreamingCC-MTAAP10.pdf},
abstract = {We present a new approach for parallel massive graph
analysis of streaming, temporal data with a dynamic and
extensible representation. Handling the constant stream of
new data from health care, security, business, and social
network applications requires new algorithms and data
structures. We examine data structure and algorithm
trade-offs that extract the parallelism necessary for
high-performance updating analysis of massive graphs.
Static analysis kernels often rely on storing input data in
a specific structure. Maintaining these structures for each
possible kernel with high data rates incurs a significant
performance cost. A case study computing clustering
coefficients on a general-purpose data structure
demonstrates incremental updates can be more efficient than
global recomputation. Within this kernel, we compare three
methods for dynamically updating local clustering
coefficients: a brute-force local recalculation, a sorting
algorithm, and our new approximation method using a Bloom
filter. On 32 processors of a \xmt{} with a synthetic
scale-free graph of $2^{24} \approx 16$ million vertices
and $2^{29} \approx 537$ million edges, the brute-force
method processes a mean of over 50\,000 updates per second
and our Bloom filter approaches 200\,000 updates per second.}
}
@InProceedings{ mtaap11,
author = {David Ediger and E. Jason Riedy and David A. Bader and
Henning Meyerhenke},
ejr-withauthor= {David Ediger and David A. Bader and Henning Meyerhenke},
title = {Tracking Structure of Streaming Social Networks},
booktitle = {5th Workshop on Multithreaded Architectures and
Applications (MTAAP)},
role = {proceedings},
tags = {parallel; graph; streaming},
year = 2011,
month = may,
abstract = {Current online social networks are massive and still
growing. For example, Facebook has over 500 million active
users sharing over 30 billion items per month. The scale
within these data streams has outstripped traditional graph
analysis methods. Monitoring requires dynamic analysis
rather than repeated static analysis. The massive state
behind multiple persistent queries requires shared data
structures and not problem-specific representations. We
present a framework based on the STINGER data structure
that can monitor a global property, connected components,
on a graph of 16 million vertices at rates of up to
240\,000 updates per second on a 32 processor Cray XMT. For
very large scale-free graphs, our implementation uses novel
batching techniques that exploit the scale-free nature of
the data and run over three times faster than prior
methods. Our framework handles, for the first time,
real-world data rates, opening the door to higher-level
analytics such as community and anomaly detection.},
acc-note = {(10/17 papers accepted, 59\% acceptance rate)},
doi = {10.1109/IPDPS.2011.326},
file = {material/TrackingComponents-MTAAP11.pdf}
}
@InProceedings{ mtaap12,
author = {E. Jason Riedy and David A. Bader and Henning Meyerhenke},
ejr-withauthor= {David A. Bader and Henning Meyerhenke},
title = {Scalable Multi-threaded Community Detection in Social
Networks},
booktitle = {6th Workshop on Multithreaded Architectures and
Applications (MTAAP)},
role = {proceedings},
tags = {parallel; graph; community detection},
year = 2012,
month = may,
dom = 25,
abstract = {The volume of existing graph-structured data requires
improved parallel tools and algorithms. Finding
communities, smaller subgraphs densely connected within the
subgraph than to the rest of the graph, plays a role both
in developing new parallel algorithms as well as opening
smaller portions of the data to current analysis tools. We
improve performance of our parallel community detection
algorithm by 20\% on the massively multithreaded Cray XMT,
evaluate its performance on the next-generation Cray XMT2,
and extend its reach to Intel-based platforms with OpenMP.
To our knowledge, not only is this the first massively
parallel community detection algorithm but also the only
such algorithm that achieves excellent performance and good
parallel scalability across all these platforms. Our
implementation analyzes a moderate sized graph with 105
million vertices and 3.3 billion edges in around 500
seconds on a four processor, 80-logical-core Intel-based
system and 1100 seconds on a 64-processor Cray XMT2.},
acc-note = {(9/15 papers accepted, 60\% acceptance)},
slide-url = {http://www.slideshare.net/jasonriedy/mtaap12-scalable-community-detection},
doi = {10.1109/IPDPSW.2012.203},
file = {material/mtaap12.pdf}
}
@InProceedings{ mtaap13,
file = {material/mtaap13-streaming-community-monitoring.pdf},
author = {E. Jason Riedy and David A. Bader},
ejr-withauthor= {David A. Bader},
title = {Multithreaded Community Monitoring for Massive Streaming
Graph Data},
booktitle = {7th Workshop on Multithreaded Architectures and
Applications (MTAAP)},
role = {proceedings},
tags = {parallel; graph; streaming; community detection},
year = 2013,
month = may,
dom = 24,
doi = {10.1109/IPDPSW.2013.229},
address = {Boston, MA},
acc-note = {(11/16 papers accepted, 69\% acceptance)},
abstract = {Analyzing static snapshots of massive, graph-structured
data cannot keep pace with the growth of social networks,
financial transactions, and other valuable data sources.
Current state-of-the-art industrial methods analyze these
streaming sources using only simple, aggregate metrics.
There are few existing scalable algorithms for monitoring
complex global quantities like decomposition into community
structure. Using our framework STING, we present the first
known parallel algorithm specifically for monitoring
communities in this massive, streaming, graph-structured
data. Our algorithm performs incremental re-agglomeration
rather than starting from scratch after each batch of
changes, reducing the problem's size to that of the change
rather than the entire graph. We analyze our initial
implementation's performance on multithreaded platforms for
execution time and latency. On an Intel-based multithreaded
platform, our algorithm handles up to 100 million updates
per second on social networks with one to 30 million edges,
providing a speed-up from 4$\times$ to 3700$\times$ over
statically recomputing the decomposition after each batch
of changes. Possibly because of our artificial graph
generator, resulting communities' modularity varies little
from the initial graph.}
}
@TechReport{ nonneg-house-lawn,
author = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida
and E. Jason Riedy},
ejr-withauthor= {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
title = {Non-Negative Diagonals and High Performance on Low-Profile
Matrices from {Householder} {$QR$}},
institution = {Netlib},
year = {2008},
type = {LAPACK Working Note},
number = {203},
lawn = {203},
month = may,
dom = 30,
note = {Also issued as UCB/EECS-2008-76; modified from SISC
version.},
role = {techreport},
tags = {lawn; lapack; householder; qr},
url = {http://www.netlib.org/lapack/lawnspdf/lawn203.pdf},
other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-76.html},
file = {material/lawn203.pdf},
abstract = {The Householder reflections used in LAPACK's $QR$
factorization leave positive and negative real entries
along $R$'s diagonal. This is sufficient for most
applications of $QR$ factorizations, but a few require that
$R$ have a nonnegative diagonal. This note describes a new
Householder generation routine to produce a nonnegative
diagonal. Additionally, we find that scanning for trailing
zeros in the generated reflections leads to large
performance improvements when applying reflections with
many trailing zeros. Factoring low-profile matrices, those
with nonzero entries mostly near the diagonal (e.g., band
matrices), now require far fewer operations. For example,
$QR$ factorization of matrices with profile width b that
are stored densely in an $n\times n$ matrix improves from
$O(n^3)$ to $O(n^2+nb^2)$. These routines are in LAPACK
3.2.}
}
@Article{ nonneg-house-lawn-sisc,
author = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida
and E. Jason Riedy},
ejr-withauthor= {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
title = {Non-Negative Diagonals and High Performance on Low-Profile
Matrices from {H}ouseholder {$QR$}},
publisher = {SIAM},
year = 2009,
month = jul,
dom = 3,
journal = {SIAM Journal on Scientific Computing},
volume = 31,
number = 4,
pages = {2832--2841},
keywords = {LAPACK; QR factorization; Householder reflection;
floating-point},
doi = {10.1137/080725763},
role = {refereed},
tags = {siam; sisc; lapack; householder; qr},
issn = {1064-8275},
mrclass = {65F30},
mrnumber = {2520301},
gt-role = {Principal author and developer.},
abstract = {The Householder reflections used in LAPACK's $QR$
factorization leave positive and negative real entries
along $R$'s diagonal. This is sufficient for most
applications of $QR$ factorizations, but a few require that
$R$ have a nonnegative diagonal. This note describes a new
Householder generation routine to produce a nonnegative
diagonal. Additionally, we find that scanning for trailing
zeros in the generated reflections leads to large
performance improvements when applying reflections with
many trailing zeros. Factoring low-profile matrices, those
with nonzero entries mostly near the diagonal (e.g., band
matrices), now require far fewer operations. For example,
$QR$ factorization of matrices with profile width b that
are stored densely in an $n\times n$ matrix improves from
$O(n^3)$ to $O(n^2+nb^2)$. These routines are in LAPACK
3.2.},
file = {material/lawn203.pdf}
}
@Unpublished{ nsf-accel-workshop,
ejr-withauthor= {workshop participants},
editor = {Viktor K. Prasanna and David A. Bader},
key = {Report on NSF Workshop on Center Scale Activities Related
to Accelerators for Data Intensive Applications},
title = {{Report on NSF Workshop on Center Scale Activities Related
to Accelerators for Data Intensive Applications}},
note = {This workshop is supported by NSF Grant Number 1051537, in
response to the Call for Exploratory Workshop Proposals for
Scientific Software Innovation Institutes (S2I2).},
dom = {31},
month = oct,
year = 2010
}
@InProceedings{ nsfaccelws10,
file = {material/nsf-workshop-socnet.pdf},
author = {Jason Riedy and David Bader and David Ediger},
ejr-withauthor= {David Bader and David Ediger},
title = {Applications in Social Networks},
booktitle = {NSF Workshop on Accelerators for Data-Intensive
Applications},
dom = {13},
year = {2010},
month = oct,
role = {presentation},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/nsf-workshop-socnet.pdf},
tags = {graph; NSF; streaming}
}
@Misc{ par-bipart-pp04,
author = {E. Jason Riedy},
title = {Parallel Weighted Bipartite Matching and Applications},
howpublished = {SIAM Parallel Processing for Scientific Computing},
dom = {27},
month = feb,
year = {2004},
role = {presentation},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/pp04.pdf},
file = {material/pp04.pdf},
tags = {siam; combinatorial optimization; parallel algorithms;
sparse matrix},
abstract = {Bipartite matching is one of graph theory's workhorses,
occuring in the solution or approximation of many problems.
Increasingly, applications' data spans multiple memory
spaces, but there is little recent experience with
distributed matching algorithms. We present a distributed,
parallel implementation for weighted bipartite matching
based on Bertsekas's auction algorithm. The bidding process
finds local matchings while summarizing updates for
occasional communication, leading to superlinear speed-ups
on some sparse problems and modest performance on others.}
}
@InProceedings{ pmma16-fpaddre,
author = {{Dukhan}, Marat and {Vuduc}, Richard and {Riedy}, Jason},
title = {Wanted: Floating-Point Add Round-off Error Instruction},
booktitle = {The 2nd International Workshop on Performance Modeling:
Methods and Applications ({PMMA16})},
year = 2016,
month = jun,
dom = 23,
address = {Frankfurt, Germany},
note = {(Workshop with ISC High Performance)},
abstract = {We propose a new instruction (FPADDRE) that computes the
round-off error in floating-point addition. We explain how
this instruction benefits high-precision arithmetic
operations in applications where double precision is not
sufficient. Performance estimates on Intel Haswell, Intel
Skylake, and AMD Steamroller processors, as well as Intel
Knights Corner co-processor, demonstrate that such an
instruction would improve the latency of double-double
addition by up to 55\% and increase double-double addition
throughput by up to 103\%, with smaller, but non-negligible
benefits for double-double multiplication. The new
instruction delivers up to 2x speedups on three benchmarks
that use high-precision floating-point arithmetic:
double-double matrix-matrix multiplication, compensated dot
product, and polynomial evaluation via the compensated
Horner scheme.},
url = {http://arxiv.org/abs/1603.00491}
}
@Unpublished{ power-control,
author = {E. Jason Riedy and Robert Szewczyk},
ejr-withauthor= {Robert Szewczyk},
title = {Power and Control in Networked Sensors},
note = {Cited},
month = may,
dom = 11,
year = {2000},
file = {material/power-and-control.pdf},
role = {unpublished},
tags = {sensor network},
abstract = {The fundamental constraint on a networked sensor is its
energy consumption, since it may be either impossible or
not feasible to replace its energy source. We analyze the
power dissipation implications of implementing the network
sensor with either a central processor switching between
I/O devices or a family of processors, each dedicated to a
single device. We present the energy measurements of the
current generations of networked sensors, and develop an
abstract description of tradeoffs between both designs.},
citeseer = {riedy00power.html}
}
@Misc{ pp12-community-ms,
author = {Henning Meyerhenke and E. Jason Riedy and David A. Bader},
ejr-withauthor= {Henning Meyerhenke and David A. Bader},
title = {Parallel Community Detection in Streaming Graphs},
howpublished = {SIAM Parallel Processing for Scientific Computing},
dom = {15},
month = feb,
year = {2012},
role = {presentation},
tags = {siam; streaming data; parallel algorithms},
address = {Savannah, GA}
}
@Misc{ pp12-graphct,
author = {David Ediger and E. Jason Riedy and Henning Meyerhenke and
David A. Bader},
eir-withauthor= {David Ediger and Henning Meyerhenke and David A. Bader},
title = {Analyzing Massive Networks with GraphCT},
howpublished = {SIAM Parallel Processing for Scientific Computing},
dom = {16},
month = feb,
year = {2012},
role = {poster},
tags = {siam; parallel algorithms},
address = {Savannah, GA}
}
@Misc{ pp12-sting,
file = {material/siam-pp12-stinger-poster.pdf},
author = {E. Jason Riedy and David Ediger and Henning Meyerhenke and
David A. Bader},
eir-withauthor= {David Ediger and Henning Meyerhenke and David A. Bader},
title = {{STING}: Software for Analysis of Spatio-Temporal
Interaction Networks and Graphs},
howpublished = {SIAM Parallel Processing for Scientific Computing},
dom = {16},
month = feb,
year = {2012},
role = {poster},
tags = {siam; parallel algorithms},
address = {Savannah, GA},
abstract = {Current tools for analyzing graph-structured data and
semantic networks focus on static graphs. Our STING package
tackles analysis of streaming graphs like today's social
networks and communication tools. STING maintains a massive
graph under changes while coordinating analysis kernels to
achieve analysis at real-world data rates. We show examples
of local metrics like clustering coefficients and global
metrics like connected components and agglomerative
clustering. STING supports parallel Intel architectures as
well as the Cray XMT.}
}
@Misc{ pp12-streaming-ms,
file = {material/siam-pp-2012.pdf},
author = {E. Jason Riedy and Henning Meyerhenke},
ejr-withauthor= {Henning Meyerhenke},
title = {Scalable Algorithms for Analysis of Massive, Streaming
Graphs},
howpublished = {SIAM Parallel Processing for Scientific Computing},
dom = {15},
month = feb,
year = {2012},
role = {presentation},
url = {http://www.slideshare.net/jasonriedy/siam-pp-2012-scalable-algorithms-for-analysis-of-massive-streaming-graphs},
tags = {siam; streaming data; parallel algorithms},
address = {Savannah, GA},
abstract = {Graph-structured data in social networks, finance, network
security, and others not only are massive but also under
continual change. These changes often are scattered across
the graph. Repeating complex global analyses on massive
snapshots to capture only what has changed is inefficient.
We discuss analysis algorithms for streaming graph data
that maintain both local and global metrics. We extract
parallelism from both analysis kernel and graph data to
scale performance to real-world sizes.}
}
@Misc{ pp16-streaming-ms,
author = {E. Jason Riedy and David A. Bader},
ejr-withauthor= {David A. Bader},
title = {Scalable Network Analysis: Tools, Algorithms,
Applications},
howpublished = {SIAM Parallel Processing for Scientific Computing},
dom = 15,
month = apr,
year = 2016,
role = {presentation},
url = {http://www.slideshare.net/jasonriedy/scalable-and-efficient-algorithms-for-analysis-of-massive-streaming-graphs-60975076},
tags = {siam; streaming data; parallel algorithms},
address = {Paris, France},
abstract = { Graph analysis provides tools for analyzing the irregular
data sets common in health informatics, computational
biology, climate science, sociology, security, finance, and
many other fields. These graphs possess different
structures than typical finite element meshes. Scaling
graph analysis to the scales of data being gathered and
created has spawned many directions of exciting new
research. This minisymposium includes talks on massive
graph generation for testing and evaluating parallel
algorithms, novel streaming techniques, and parallel graph
algorithms for new and existing problems. It also covers
existing parallel frameworks and interdisciplinary
applications, e.g. the analysis of climate networks.}
}
@InProceedings{ ppam11,
author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and
David A. Bader},
ejr-withauthor= {Henning Meyerhenke and David Ediger and David A. Bader},
title = {Parallel Community Detection for Massive Graphs},
booktitle = {9th International Conference on Parallel Processing and
Applied Mathematics (PPAM11)},
year = 2011,
month = sep,
publisher = {Springer},
role = {proceedings},
tags = {parallel; graph; community detection},
abstract = {Tackling the current volume of graph-structured data
requires parallel tools. We extend our work on analyzing
such massive graph data with the first massively parallel
algorithm for community detection that scales to current
data sizes, scaling to graphs of over 122 million vertices
and nearly 2 billion edges in under 7300 seconds on a
massively multithreaded Cray XMT. Our algorithm achieves
moderate parallel scalability without sacrificing
sequential operational complexity. Community detection
partitions a graph into subgraphs more densely connected
within the subgraph than to the rest of the graph. We take
an agglomerative approach similar to Clauset, Newman, and
Moore's sequential algorithm, merging pairs of connected
intermediate subgraphs to optimize different graph
properties. Working in parallel opens new approaches to
high performance. On smaller data sets, we find the
output's modularity compares well with the standard
sequential algorithms.},
acc-note = {(134/243 papers accepted, 55\% acceptance rate)},
doi = {10.1007/978-3-642-31464-3\_29},
file = {material/ppam11-community-detection.pdf}
}
@InProceedings{ ppopp11-graph,
author = {David A. Bader and David Ediger and E. Jason Riedy},
ejr-withauthor= {David A. Bader and David Ediger},
title = {Parallel Programming for Graph Analysis},
booktitle = {16th ACM SIGPLAN Annual Symposium on Principles and
Practice of Parallel Programming (PPoPP)},
role = {tutorial},
tags = {parallel; graph},
year = 2011,
month = feb,
dom = 12,
address = {San Antonio, TX},
url = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2011.html},
abstract = {An increasingly fast-paced, digital world has produced an
ever-growing volume of petabyte-sized datasets. At the same
time, terabytes of new, unstructured data arrive daily. As
the desire to ask more detailed questions about these
massive streams has grown, parallel software and hardware
have only recently begun to enable complex analytics in
this non-scientific space.
In this tutorial, we will discuss the open problems facing
us with analyzing this "data deluge". We will present
algorithms and data structures capable of analyzing
spatio-temporal data at massive scale on parallel systems.
We will try to understand the difficulties and bottlenecks
in parallel graph algorithm design on current systems and
will show how multithreaded and hybrid systems can overcome
these challenges. We will demonstrate how parallel graph
algorithms can be implemented on a variety of architectures
using different programming models.
The goal of this tutorial is to provide a comprehensive
introduction to the field of parallel graph analysis to an
audience with computing background, interested in
participating in research and/or commercial applications of
this field. Moreover, we will cover leading-edge technical
and algorithmic developments in the field and discuss open
problems and potential solutions.}
}
@InProceedings{ ppopp12-graph,
author = {David Ediger and Jason Riedy and Rob McColl and David A.
Bader},
ejr-withauthor= {David Ediger and Rob McColl and David A. Bader},
title = {Parallel Programming for Graph Analysis},
booktitle = {17th ACM SIGPLAN Annual Symposium on Principles and
Practice of Parallel Programming (PPoPP)},
role = {tutorial},
tags = {parallel; graph},
year = 2012,
month = feb,
dom = 26,
address = {New Orleans, LA},
url = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2012.html},
abstract = {An increasingly fast-paced, digital world has produced an
ever-growing volume of petabyte-sized datasets. At the same
time, terabytes of new, unstructured data arrive daily. As
the desire to ask more detailed questions about these
massive streams has grown, parallel software and hardware
have only recently begun to enable complex analytics in
this non-scientific space.
In this tutorial, we will discuss the open problems facing
us with analyzing this "data deluge". We will present
algorithms and data structures capable of analyzing
spatio-temporal data at massive scale on parallel systems.
We will try to understand the difficulties and bottlenecks
in parallel graph algorithm design on current systems and
will show how multithreaded and hybrid systems can overcome
these challenges. We will demonstrate how parallel graph
algorithms can be implemented on a variety of architectures
using different programming models.
The goal of this tutorial is to provide a comprehensive
introduction to the field of parallel graph analysis to an
audience with computing background, interested in
participating in research and/or commercial applications of
this field. Moreover, we will cover leading-edge technical
and algorithmic developments in the field and discuss open
problems and potential solutions.}
}
@InCollection{ rwp10,
author = {E. Jason Riedy},
editor = {Dana Martin Guthrie},
booktitle = {Read Write Poem NaPoWriMo Anthology},
title = {Here, on the farthest point of the peninsula},
publisher = {\href{http://issuu.com}{issuu.com}},
year = {2010},
month = sep,
dom = {15},
pages = {86},
url = {http://issuu.com/readwritepoem/docs/read_write_poem_napowrimo_anthology},
myurl = {http://lovesgoodfood.com/jason/posts/post-0099/},
role = {poetry},
tags = {beach; napowrimo; poetry; rwp}
}
@Unpublished{ s2i2-acmbcb-2013,
author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and
Manish Parashar and David Bader and Jason Riedy and Richard
Vuduc},
title = {Report on ``Workshop on Challenges in accelerating
Next-Gen Sequencing ({NGS}) bioinformatics''},
address = {Washington, DC},
dom = 25,
month = sep,
year = 2013,
note = {in conjunction with ACM-BCB 2013},
url = {http://future-compute.usc.edu/index.php/NGS_Bioinformatics_Workshop}
}
@Unpublished{ s2i2-ipdps-2013,
author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and
Manish Parashar and David Bader and Jason Riedy and Richard
Vuduc},
title = {Report on ``Workshop on Accelerating Bioinformatics
Applications Enabled by NextGen-Sequencing''},
address = {Boston, MA},
dom = 19,
month = may,
year = 2013,
note = {Co-located with IPDPS 2013},
url = {http://future-compute.usc.edu/index.php/NGS_Workshop}
}
@TechReport{ seed-set-tr,
author = {Riedy, Jason and Bader, David A. and Jiang, Karl and
Pande, Pushkar and Sharma, Richa},
ejr-withauthor= {Bader, David A. and Jiang, Karl and Pande, Pushkar and
Sharma, Richa},
title = {Detecting Communities from Given Seeds in Social
Networks},
institution = {Georgia Institute of Technology},
year = 2011,
number = {GT-CSE-11-01},
month = feb,
dom = 22,
file = {material/GT-CSE-11-01.pdf},
url = {http://hdl.handle.net/1853/36980},
role = {techreport},
abstract = {Analyzing massive social networks challenges both
high-performance computers and human understanding. These
massive networks cannot be visualized easily, and their
scale makes applying complex analysis methods
computationally expensive. We present a region-growing
method for finding a smaller, more tractable subgraph, a
community, given a few example seed vertices. Unlike
existing work, we focus on a small number of seed vertices,
from two to a few dozen. We also present the first
comparison between five algorithms for expanding a small
seed set into a community. Our comparison applies these
algorithms to an R-MAT generated graph component with 240
thousand vertices and 32 million edges and evaluates the
community size, modularity, Kullback-Leibler divergence,
conductance, and clustering coefficient. We find that our
new algorithm with a local modularity maximizing heuristic
based on Clauset, Newman, and Moore performs very well when
the output is limited to 100 or 1000 vertices. When run
without a vertex size limit, a heuristic from McCloskey and
Bader generates communities containing around 60\% of the
graph's vertices and having a small conductance and
modularity appropriate to the result size. A personalized
PageRank algorithm based on Andersen, Lang, and Chung also
performs well with respect to our metrics.},
tags = {graph; social network}
}
@Misc{ siam-am03,
author = {E. Jason Riedy},
title = {Practical Alternatives for Parallel Pivoting},
howpublished = {SIAM Annual Meeting},
month = jun,
year = {2003},
role = {presentation},
tags = {siam; sparse matrix; linear algebra},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/siam-am03.pdf},
file = {material/siam-am03.pdf},
abstract = {Traditional pivoting during parallel, unsymmetric $LU$
factorization introduces heavy communication and
restructuring costs. Possible alternatives include
pre-pivoting to place heavy elements along the diagonal and
limited pivoting that maintains the factors' structures.
Each alternative comes with trade-offs that affect accuracy
and performance.}
}
@Misc{ siam-cse03,
author = {E. Jason Riedy},
title = {Parallel Bipartite Matching for Sparse Matrix
Computations},
howpublished = {SIAM Conference on Computational Science and Engineering},
month = feb,
year = {2003},
role = {poster},
tags = {siam; parallel algorithms; combinatorial optimization;
sparse matrix},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/siam-cse03-poster.pdf},
file = {material/siam-cse03-poster.pdf},
abstract = {Practical and efficient methods exist for parallelizing
the numerical work in sparse matrix calculations. The
initial symbolic analysis is now becoming a sequential
bottleneck, limiting problems' sizes. One such analysis is
the weighted bipartite matching used to achieve scalable,
unsymmetric $LU$ factorization in Super\textsc{lu}.
Applying a mathematical optimization algorithm produces a
distributed-memory implementation with explicit trade-offs
between speed and matching quality. We present accuracy and
performance results for this phase alone and in the context
of Super\textsc{lu}.}
}
@Misc{ siamcse13-largescalegraph,
author = {David A. Bader and Henning Meyerhenke and Jason Riedy},
ejr-withauthor= {David A. Bader and Henning Meyerhenke},
title = {Applications and Challenges in Large-scale Graph
Analysis},
howpublished = {SIAM Conference on Computational Science and Engineering},
month = feb,
year = 2013,
address = {Boston, MA},
role = {presentation},
tags = {siam; parallel algorithms},
url = {http://www.graphanalysis.org/SIAM-CSE13/01_Bader.pdf},
abstract = {Emerging real-world graph problems include detecting
community structure in large social networks, improving the
resilience of the electric power grid, and detecting and
preventing disease in human populations. We discuss the
opportunities and challenges in massive data-intensive
computing for applications in social network analysis,
genomics, and security. The explosion of real-world graph
data poses substantial challenges for software, hardware,
algorithms, and application experts.}
}
@Misc{ siamcse13-streaminggraph,
file = {material/cse2013-streaming.pdf},
author = {Robert C. McColl and David Ediger and David A. Bader and
Jason Riedy},
ejr-withauthor= {Robert C. McColl and David Ediger and David A. Bader},
title = {Analyzing Graph Structure in Streaming Data with
{STINGER}},
howpublished = {SIAM Conference on Computational Science and Engineering},
month = feb,
year = 2013,
address = {Boston, MA},
role = {presentation},
tags = {siam; streaming data; parallel algorithms},
abstract = {Analyzing static snapshots of massive, graph-structured
data cannot keep pace with the growth of social networks,
financial transactions, and other valuable data sources.
Our software framework, STING (Spatio-Temporal Interaction
Networks and Graphs), uses a scalable, high-performance
graph data structure to enable these applications. STING
supports fast insertions, deletions, and updates on graphs
with semantic information and skewed degree distributions.
STING achieves large speed-ups over parallel, static
recomputation on both common multicore and specialized
multithreaded platforms.}
}
@InProceedings{ siamns17-ejr,
author = {E. Jason Riedy and Chunxing Yin and David A. Bader},
title = {A New Algorithm Model for Massive-Scale Streaming Graph
Analysiso},
booktitle = {SIAM Workshop on Network Science},
year = 2017,
month = jul,
dom = 14,
address = {Pittsburgh, PA},
url = {https://www.slideshare.net/jasonriedy/a-new-algorithm-model-for-massivescale-streaming-graph-analysis}
}
@InCollection{ smallstone10,
author = {Jason Riedy},
editor = {Fiona Robyn and Kaspalita},
booktitle = {pay attention: a river of stones},
title = {The storm's coming when the chickens spread out},
publisher = {\href{http://lulu.com}{lulu.com}},
year = {2011},
month = mar,
dom = {2},
pages = 77,
myurl = {http://lovesgoodfood.com/jason/posts/river-of-stones-7/},
url = {http://www.lulu.com/product/file-download/pay-attention-a-river-of-stones/15057101},
role = {poetry},
tags = {poetry; aros; riverofstones}
}
@Misc{ sparse-ds-csc04,
author = {E. Jason Riedy},
title = {Sparse Data Structures for Weighted Bipartite Matching},
howpublished = {SIAM Workshop on Combinatorial Scientific Computing},
dom = {28},
month = feb,
year = {2004},
role = {presentation},
tags = {siam; combinatorial optimization; sparse matrix},
url = {http://purl.oclc.org/NET/jason-riedy/resume/material/csc04.pdf},
file = {material/csc04.pdf}
}
@InProceedings{ stinger-gabb2014,
file = {material/gabb-2014-pres.pdf},
author = {Jason Riedy and David A. Bader},
title = {{STINGER}: Multi-threaded Graph Streaming},
dom = 19,
booktitle = {{Graph} {Algorithms} {Building} {Blocks} ({GABB} 2014)},
year = 2014,
month = may,
address = {Phoeniz, AZ},
note = {Invited presentation and panelist. (Workshop with IPDPS
2014)},
optrole = {presentation},
tags = {parallel; graph; streaming},
url = {http://www.slideshare.net/jasonriedy/stinger-multithreaded-graph-streaming}
}
@InProceedings{ stinger-hpec12,
author = {David Ediger and Robert McColl and Jason Riedy and David
A. Bader},
ejr-withauthor= {David Ediger and Robert McColl and David A. Bader},
title = {{STINGER}: High Performance Data Structure for Streaming
Graphs},
tags = {parallel; graph; streaming},
booktitle = {The IEEE High Performance Extreme Computing Conference
(HPEC)},
year = 2012,
month = sep,
address = {Waltham, MA},
note = {Best paper award finalist},
dom = 12,
role = {proceedings},
doi = {10.1109/HPEC.2012.6408680},
file = {material/hpec12-stinger.pdf},
abstract = {The current research focus on ``big data'' problems
highlights the scale and complexity of analytics required
and the high rate at which data may be changing. In this
paper, we present our high performance, scalable and
portable software, Spatio-Temporal Interaction Networks and
Graphs Extensible Representation (STINGER), that includes a
graph data structure that enables these applications. Key
attributes of STINGER are fast insertions, deletions, and
updates on semantic graphs with skewed degree
distributions. We demonstrate a process of algorithmic and
architectural optimizations that enable high performance on
the Cray XMT family and Intel multicore servers. Our
implementation of STINGER on the Cray XMT processes over 3
million updates per second on a scale-free graph with 537
million edges.}
}
@Unpublished{ tera-ubench,
author = {E. Jason Riedy and Rich Vuduc},
ejr-withauthor= {Rich Vuduc},
file = {material/Tera.pdf},
title = {Microbenchmarking the {Tera} {MTA}},
note = {Cited},
other-url = {http://purl.oclc.org/NET/jason-riedy/resume/material/Tera-presentation.pdf},
dom = 21,
month = may,
year = {1999},
abstract = {The Tera Multithreaded Architecture, or MTA, addresses
scalable shared memory system design with a difierent
approach; it tolerates latency through providing fast
access to multiple threads of execution. The MTA employs a
number of radical design ideas: creation of hardware
threads (streams) with frequent context switching;
full-empty bits for each memory word; a flat memory
hierarchy; and deep pipelines. Recent evaluations of the
MTA have taken a top-down approach: port applications and
application benchmarks, and compare the absolute
performance with conventional systems. While useful, these
studies do not reveal the effect of the Tera MTA's unique
hardware features on an application. We present a bottom-up
approach to the evaluation of the MTA via a suite of
microbenchmarks to examine in detail the underlying
hardware mechanisms and the cost of runtime system support
for multithreading. In particular, we measure memory,
network, and instruction latencies; memory bandwidth; the
cost of low-level synchronization via full-empty bits;
overhead for stream management; and the effects of software
pipelining. These data should provide a foundation for
performance modeling on the MTA. We also present results
for list ranking on the MTA, an application which has
traditionally been difficult to scale on conventional
parallel systems.},
role = {unpublished},
tags = {parallel programming; parallel algorithms; multithreaded;
computer architecture; cray}
}
@TechReport{ tridiag-lawn,
author = {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
ejr-withauthor= {Osni A. Marques and Christof V{\"o}mel},
title = {Benefits of {IEEE-754} Features in Modern Symmetric
Tridiagonal Eigensolvers},
type = {LAPACK Working Note},
number = {172},
lawn = {172},
institution = {Netlib},
month = sep,
year = {2005},
dom = 30,
note = {Also issued as UCB//CSD-05-1414; expanded from SISC
version},
url = {http://www.netlib.org/lapack/lawnspdf/lawn172.pdf},
other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2005/6514.html},
role = {techreport},
tags = {lawn; lapack; floating point; ieee754; eigenvalue},
file = {material/lawn172.pdf},
abstract = {Bisection is one of the most common methods used to
compute the eigenvalues of symmetric tridiagonal matrices.
Bisection relies on the Sturm count: For a given shift
sigma, the number of negative pivots in the factorization
$T - \sigma I = LDL^T$ equals the number of eigenvalues of
T that are smaller than sigma. In IEEE-754 arithmetic, the
value $\infty$ permits the computation to continue past a
zero pivot, producing a correct Sturm count when $T$ is
unreduced. Demmel and Li showed [IEEE Trans. Comput., 43
(1994), pp. 983–992] that using $\infty$ rather than
testing for zero pivots within the loop could significantly
improve performance on certain architectures. When
eigenvalues are to be computed to high relative accuracy,
it is often preferable to work with $LDL^T$ factorizations
instead of the original tridiagonal $T$. One important
example is the MRRR algorithm. When bisection is applied to
the factored matrix, the Sturm count is computed from
$LDL^T$ which makes differential stationary and progressive
qds algorithms the methods of choice. While it seems
trivial to replace $T$ by $LDL^T$, in reality these
algorithms are more complicated: In IEEE-754 arithmetic, a
zero pivot produces an overflow followed by an invalid
exception (NaN, or ``Not a Number'') that renders the Sturm
count incorrect. We present alternative, safe formulations
that are guaranteed to produce the correct result.
Benchmarking these algorithms on a variety of platforms
shows that the original formulation without tests is always
faster provided that no exception occurs. The transforms
see speed-ups of up to 2.6x over the careful formulations.
Tests on industrial matrices show that encountering
exceptions in practice is rare. This leads to the following
design: First, compute the Sturm count by the fast but
unsafe algorithm. Then, if an exception occurs, recompute
the count by a safe, slower alternative. The new Sturm
count algorithms improve the speed of bisection by up to 2x
on our test matrices. Furthermore, unlike the traditional
tiny-pivot substitution, proper use of IEEE-754 features
provides a careful formulation that imposes no input range
restrictions.}
}
@Article{ tridiag-sisc,
author = {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
ejr-withauthor= {Osni A. Marques and Christof V{\"o}mel},
title = {Benefits of {IEEE-754} Features in Modern Symmetric
Tridiagonal Eigensolvers},
journal = {SIAM Journal on Scientific Computing},
year = {2006},
month = sep,
dom = 28,
volume = {28},
number = {5},
pages = {1613--1633},
role = {refereed},
tags = {siam; sisc; floating point; eigenvalue; ieee754},
doi = {10.1137/050641624},
issn = {1064-8275},
mrclass = {65F15},
mrnumber = {2272181},
abstract = {Bisection is one of the most common methods used to
compute the eigenvalues of symmetric tridiagonal matrices.
Bisection relies on the Sturm count: For a given shift
sigma, the number of negative pivots in the factorization
$T - \sigma I = LDL^T$ equals the number of eigenvalues of
T that are smaller than sigma. In IEEE-754 arithmetic, the
value $\infty$ permits the computation to continue past a
zero pivot, producing a correct Sturm count when $T$ is
unreduced. Demmel and Li showed [IEEE Trans. Comput., 43
(1994), pp. 983–992] that using $\infty$ rather than
testing for zero pivots within the loop could significantly
improve performance on certain architectures. When
eigenvalues are to be computed to high relative accuracy,
it is often preferable to work with $LDL^T$ factorizations
instead of the original tridiagonal $T$. One important
example is the MRRR algorithm. When bisection is applied to
the factored matrix, the Sturm count is computed from
$LDL^T$ which makes differential stationary and progressive
qds algorithms the methods of choice. While it seems
trivial to replace $T$ by $LDL^T$, in reality these
algorithms are more complicated: In IEEE-754 arithmetic, a
zero pivot produces an overflow followed by an invalid
exception (NaN, or ``Not a Number'') that renders the Sturm
count incorrect. We present alternative, safe formulations
that are guaranteed to produce the correct result.
Benchmarking these algorithms on a variety of platforms
shows that the original formulation without tests is always
faster provided that no exception occurs. The transforms
see speed-ups of up to 2.6x over the careful formulations.
Tests on industrial matrices show that encountering
exceptions in practice is rare. This leads to the following
design: First, compute the Sturm count by the fast but
unsafe algorithm. Then, if an exception occurs, recompute
the count by a safe, slower alternative. The new Sturm
count algorithms improve the speed of bisection by up to 2x
on our test matrices. Furthermore, unlike the traditional
tiny-pivot substitution, proper use of IEEE-754 features
provides a careful formulation that imposes no input range
restrictions.},
gt-role = {Principal author and developer.},
file = {material/ieee754-tridiag.pdf}
}
@InProceedings{ wssspe1,
author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and
Manish Parashar and Jason Riedy and David Bader and Richard
Vuduc},
ejr-withauthor= {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and
Manish Parashar and David Bader and Richard Vuduc},
title = {Sustainable Software Development for Next-Gen Sequencing
(NGS) Bioinformatics on Emerging Platforms},
booktitle = {First Workshop on Sustainable Software for Science:
Practice and Experiences (WSSSPE1)},
year = 2013,
month = nov,
dom = 17,
address = {Denver, CO},
note = {held in conjunction with SC13, published electronically
(\url{http://wssspe.researchcomputing.org.uk/})},
url = {http://arxiv.org/abs/1309.1828},
file = {material/wssspe13.pdf},
abstract = {DNA sequence analysis is fundamental to life science
research. The rapid development of next generation
sequencing (NGS) technologies, and the richness and
diversity of applications it makes feasible, have created
an enormous gulf between the potential of this technology
and the development of computational methods to realize
this potential. Bridging this gap holds possibilities for
broad impacts toward multiple grand challenges and offers
unprecedented opportunities for software innovation and
research. We argue that NGS-enabled applications need a
critical mass of sustainable software to benefit from
emerging computing platforms' transformative potential.
Accumulating the necessary critical mass will require
leaders in computational biology, bioinformatics, computer
science, and computer engineering work together to identify
core opportunity areas, critical software infrastructure,
and software sustainability challenges. Furthermore, due to
the quickly changing nature of both bioinformatics software
and accelerator technology, we conclude that creating
sustainable accelerated bioinformatics software means
constructing a sustainable bridge between the two fields.
In particular, sustained collaboration between domain
developers and technology experts is needed to develop the
accelerated kernels, libraries, frameworks and middleware
that could provide the needed flexible link from NGS
bioinformatics applications to emerging platforms.}
}