@STRING{jan = {January}}
@STRING{feb = "February"}
@STRING{mar = "March"}
@STRING{apr = "April"}
@STRING{may = "May"}
@STRING{jun = "June"}
@STRING{jul = "July"}
@STRING{aug = "August"}
@STRING{sep = "September"}
@STRING{oct = "October"}
@STRING{nov = "November"}
@STRING{dec = "December"}

@article{10.1145/3418077,
  author =       {Hein, Eric R. and Eswar, Srinivas and Ya\c{s}ar,
                  Abdurrahman and Li, Jiajia and Young, Jeffrey S. and
                  Conte, Thomas M. and \c{C}ataly\"{u}rek, \"{U}mit
                  V. and Vuduc, Richard and Riedy, Jason and U\c{c}ar,
                  Bora},
  title =        {Programming Strategies for Irregular Algorithms on
                  the Emu Chick},
  year =         {2020},
  issue_date =   {October 2020},
  publisher =    {Association for Computing Machinery},
  address =      {New York, NY, USA},
  volume =       {7},
  number =       {4},
  issn =         {2329-4949},
  OPTurl =          {https://doi.org/10.1145/3418077},
  doi =          {10.1145/3418077},
  abstract =     {The Emu Chick prototype implements migratory
                  memory-side processing in a novel hardware
                  system. Rather than transferring large amounts of
                  data across the system interconnect, the Emu Chick
                  moves lightweight thread contexts to near-memory
                  cores before the beginning of each remote memory
                  read. Previous work has characterized the
                  performance of the Chick prototype in terms of
                  memory bandwidth and programming differences from
                  more typical, non-migratory platforms, but there has
                  not yet been an analysis of algorithms on this
                  system.This work evaluates irregular algorithms that
                  could benefit from the lightweight, memory-side
                  processing of the Chick and demonstrates techniques
                  and optimization strategies for achieving
                  performance in sparse matrix-vector multiply
                  operation (SpMV), breadth-first search (BFS), and
                  graph alignment across up to eight distributed nodes
                  encompassing 64 nodelets in the Chick system. We
                  also define and justify relative metrics to compare
                  prototype FPGA-based hardware with established ASIC
                  architectures. The Chick currently supports up to
                  68x scaling for graph alignment, 80 MTEPS for BFS on
                  balanced graphs, and 50\% of measured STREAM
                  bandwidth for SpMV.},
  journal =      {ACM Trans. Parallel Comput.},
  month =        oct,
  articleno =    {25},
  numpages =     {25},
  keywords =     {EMU architecture}
}

@Article{parco19-emu,
  author =       {Jeffrey Young and Eric Hein and Srinivas Eswar and
                  Patrick Lavin and Jiajia Li and Jason Riedy and
                  Richard Vuduc and Thomas M. Conte},
  title =        {A Microbenchmark Characterization of the {Emu}
                  {Chick}},
  month =        sep,
  year =         2019,
  doi =          {10.1016/j.parco.2019.04.012},
  journal =      {Parallel Computing},
  abstract =     {The Emu Chick is a prototype system designed around
                  the concept of migratory memory-side
                  processing. Rather than transferring large amounts
                  of data across power-hungry, high-latency
                  interconnects, the Emu Chick moves lightweight
                  thread contexts to near-memory cores before the
                  beginning of each memory read. The current prototype
                  hardware uses FPGAs to implement cache-less
                  ``Gossamer'' cores for doing computational work and
                  a stationary core to run basic operating system
                  functions and migrate threads between nodes. In this
                  multi-node characterization of the Emu Chick, we
                  extend an earlier single-node investigation of the
                  the memory bandwidth characteristics of the system
                  through benchmarks like STREAM, pointer chasing, and
                  sparse matrix-vector multiplication. We compare the
                  Emu Chick hardware to architectural simulation and
                  an Intel Xeon-based platform. Our results
                  demonstrate that for many basic operations the Emu
                  Chick can use available memory bandwidth more
                  efficiently than a more traditional, cache-based
                  architecture although bandwidth usage suffers for
                  computationally intensive workloads like
                  SpMV. Moreover, the Emu Chick provides stable,
                  predictable performance with up to 65\% of the peak
                  bandwidth utilization on a random-access pointer
                  chasing benchmark with weak locality.},
}

@Article{localcomm-2017,
  author =       {Eisha Nathan and Anita Zakrzewska and Jason Riedy and David A. Bader},
  title =        {Local Community Detection in Dynamic Graphs Using Personalized Centrality},
  journal =      {Algorithms},
  year =         2017,
  month =        {August},
  abstract =     {Analyzing massive graphs poses challenges due to the vast amount of data available.  Extracting smaller relevant subgraphs allows for further visualization and analysis that would otherwise be too computationally intensive.  Furthermore, many real data sets are constantly changing, and require algorithms to update as the graph evolves.  This work addresses the topic of local community detection, or seed set expansion, using personalized centrality measures, specifically PageRank and Katz centrality.  We present a method to efficiently update local communities in dynamic graphs. By updating the personalized ranking vectors, we can incrementally update the corresponding local community.  Applying our methods on real-world graphs, we are able to obtain speedups of up to 60$\times$ compared to static recomputation while maintaining an average recall of 0.94 of the highly ranked vertices returned.  Next, we investigate how approximations of a centrality vector affect the resulting local community. Specifically, our method that guarantees that the vertices returned in the community are the highly ranked vertices from a personalized centrality metric.},
  ISSN =         {1999-4893},
  doi =          {10.3390/a10030102},
  volume =       10,
  number =       3,
  article_number =102,
  projtag =      {grateful, hpda},
  ejr-proj = {high-performance-data-analysis, graph-analysis},
  ejr-grant = {hpda, grateful, xscala},
}

@Article{graphct-tpds-2012,
  author =       {David Ediger and Karl Jiang and Jason Riedy and David A. Bader},
  ejr-withauthor ={David Ediger and Karl Jiang and David A. Bader},
  title =        {{GraphCT}: Multithreaded Algorithms for Massive Graph Analysis},
  journal =      {{IEEE} Transactions in Parallel and Distributed Systems},
  year =         2013,
  pages =        {2220 -- 2229},
  month =        {September},
  doi =          {10.1109/TPDS.2012.323},
  ISSN =         {1045-9219},
  url =          {http://dx.doi.org/10.1109/TPDS.2012.323},
  role =         {refereed},
  gt-role =      {Assisted in software development, experimental methods, writing, and editing.},
  abstract =     {The digital world has given rise to massive quantities of data that include rich semantic and complex networks. A social graph, for example, containing hundreds of millions of actors and tens of billions of relationships is not uncommon. Analyzing these large data sets, even to answer simple analytic queries, often pushes the limits of algorithms and machine architectures. We present GraphCT, a scalable framework for graph analysis using parallel and multithreaded algorithms on shared memory platforms. Utilizing the unique characteristics of the Cray XMT, GraphCT enables fast network analysis at unprecedented scales on a variety of input data sets. On a synthetic power law graph with 2 billion vertices and 17 billion edges, we can find the connected components in 2 minutes. We can estimate the betweenness centrality of a similar graph with 537 million vertices and over 8 billion edges in under 1 hour. GraphCT is built for portability and performance.},
  file =         {material/GraphCT-IEEE.pdf},
  projtag =      {cassmt},
  ejr-proj =     {high-performance-data-analysis, graph-analysis},
  ejr-grant =    {cassmt},
}

@Article{nonneg-house-lawn-sisc,
  author =       {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida and E. Jason Riedy},
  ejr-withauthor ={James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
  title =        {Non-Negative Diagonals and High Performance on Low-Profile Matrices from {H}ouseholder {$QR$}},
  publisher =    {SIAM},
  year =         2009,
  month =        {July},
  dom =          3,
  journal =      {SIAM Journal on Scientific Computing},
  volume =       31,
  number =       4,
  pages =        {2832--2841},
  keywords =     {LAPACK; QR factorization; Householder reflection; floating-point},
  doi =          {10.1137/080725763},
  role =         {refereed},
  OPTtags =         {siam; sisc; lapack; householder; qr},
  ISSN =         {1064-8275},
  MRCLASS =      {65F30},
  MRNUMBER =     2520301,
  gt-role =      {Principal author and developer.},
  abstract =     {The Householder reflections used in LAPACK's $QR$ factorization leave positive and negative real entries along $R$'s diagonal. This is sufficient for most applications of $QR$ factorizations, but a few require that $R$ have a nonnegative diagonal. This note describes a new Householder generation routine to produce a nonnegative diagonal. Additionally, we find that scanning for trailing zeros in the generated reflections leads to large performance improvements when applying reflections with many trailing zeros. Factoring low-profile matrices, those with nonzero entries mostly near the diagonal (e.g., band matrices), now require far fewer operations. For example, $QR$ factorization of matrices with profile width b that are stored densely in an $n\times n$ matrix improves from $O(n^3)$ to $O(n^2+nb^2)$. These routines are in LAPACK 3.2.},
  file =         {material/lawn203.pdf},
  projtag =      {lapack},
  keywords = {lapack, linear algebra},
  ejr-proj = {linear-algebra}
}

@Article{lsq-itref-toms,
  author =       {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy},
  ejr-withauthor ={James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title =        {Extra-precise iterative refinement for overdetermined least squares problems},
  journal =      "{ACM} Transactions on Mathematical Software",
  volume =       35,
  number =       4,
  year =         2009,
  month =        {February},
  issn =         {0098-3500},
  pages =        {1--32},
  doi =          {10.1145/1462173.1462177},
  accepted =     "25 June 2008",
  role =         {refereed},
  OPTtags =         {acm; toms; lapack; floating point; linear algebra},
  abstract =     {We present the algorithm, error bounds, and numerical results for extra-precise iterative refinement applied to overdetermined linear least squares (LLS) problems. We apply our linear system refinement algorithm to Bj{\"o}rck’s augmented linear system formulation of an LLS problem. Our algorithm reduces the forward normwise and componentwise errors to $O(\varepsilon)$ unless the system is too ill conditioned. In contrast to linear systems, we provide two separate error bounds for the solution $x$ and the residual $r$. The refinement algorithm requires only limited use of extra precision and adds only $O(mn)$ work to the $O(mn^2)$ cost of QR factorization for problems of size m-by-n. The extra precision calculation is facilitated by the new extended-precision BLAS standard in a portable way, and the refinement algorithm will be included in a future release of LAPACK and can be extended to the other types of least squares problems.},
  gt-role =      {Assisted in software development, experimental methods, writing, and editing.},
  file =         {material/lsq_itrefx.pdf},
  projtag =      {lapack, ieee754},
  keywords = {lapack, ieee754, floating point, linear algebra},
  ejr-proj = {linear-algebra, floating-point}
}

@Article{tridiag-sisc,
  author =       {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
  ejr-withauthor ={Osni A. Marques and Christof V{\"o}mel},
  title =        {Benefits of {IEEE-754} Features in Modern Symmetric Tridiagonal Eigensolvers},
  journal =      {SIAM Journal on Scientific Computing},
  year =         2006,
  month =        {September},
  dom =          28,
  volume =       28,
  number =       5,
  pages =        {1613--1633},
  role =         {refereed},
  OPTtags =         {siam; sisc; floating point; eigenvalue; ieee754},
  doi =          {10.1137/050641624},
  ISSN =         {1064-8275},
  MRCLASS =      {65F15},
  MRNUMBER =     2272181,
  abstract =     {Bisection is one of the most common methods used to compute the eigenvalues of symmetric tridiagonal matrices. Bisection relies on the Sturm count: For a given shift sigma, the number of negative pivots in the factorization $T - \sigma I = LDL^T$ equals the number of eigenvalues of T that are smaller than sigma. In IEEE-754 arithmetic, the value $\infty$ permits the computation to continue past a zero pivot, producing a correct Sturm count when $T$ is unreduced. Demmel and Li showed [IEEE Trans. Comput., 43 (1994), pp. 983–992] that using $\infty$ rather than testing for zero pivots within the loop could significantly improve performance on certain architectures. When eigenvalues are to be computed to high relative accuracy, it is often preferable to work with $LDL^T$ factorizations instead of the original tridiagonal $T$. One important example is the MRRR algorithm. When bisection is applied to the factored matrix, the Sturm count is computed from $LDL^T$ which makes differential stationary and progressive qds algorithms the methods of choice. While it seems trivial to replace $T$ by $LDL^T$, in reality these algorithms are more complicated: In IEEE-754 arithmetic, a zero pivot produces an overflow followed by an invalid exception (NaN, or ``Not a Number'') that renders the Sturm count incorrect. We present alternative, safe formulations that are guaranteed to produce the correct result. Benchmarking these algorithms on a variety of platforms shows that the original formulation without tests is always faster provided that no exception occurs. The transforms see speed-ups of up to 2.6x over the careful formulations. Tests on industrial matrices show that encountering exceptions in practice is rare. This leads to the following design: First, compute the Sturm count by the fast but unsafe algorithm. Then, if an exception occurs, recompute the count by a safe, slower alternative. The new Sturm count algorithms improve the speed of bisection by up to 2x on our test matrices. Furthermore, unlike the traditional tiny-pivot substitution, proper use of IEEE-754 features provides a careful formulation that imposes no input range restrictions.},
  gt-role =      {Principal author and developer.},
  file =         {material/ieee754-tridiag.pdf},
  projtag =      {lapack},
  keywords = {lapack, ieee754, floating point, linear algebra},
  ejr-proj = {linear-algebra, floating-point}
}

@Article{axb-itref-toms,
  author =       {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor ={James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee},
  title =        {Error bounds from extra-precise iterative refinement},
  journal =      {{ACM} Transactions on Mathematical Software},
  year =         2006,
  volume =       32,
  number =       2,
  pages =        {325--351},
  month =        {June},
  role =         {refereed},
  OPTtags =         {acm; toms; lapack; floating point; linear algebra},
  doi =          {10.1145/1141885.1141894},
  ISSN =         {0098-3500},
  MRCLASS =      {65F10},
  MRNUMBER =     2272365,
  abstract =     {We present the design and testing of an algorithm for iterative refinement of the solution of linear equations where the residual is computed with extra precision. This algorithm was originally proposed in 1948 and analyzed in the 1960s as a means to compute very accurate solutions to all but the most ill-conditioned linear systems. However, two obstacles have until now prevented its adoption in standard subroutine libraries like LAPACK: (1) There was no standard way to access the higher precision arithmetic needed to compute residuals, and (2) it was unclear how to compute a reliable error bound for the computed solution. The completion of the new BLAS Technical Forum Standard has essentially removed the first obstacle. To overcome the second obstacle, we show how the application of iterative refinement can be used to compute an error bound in any norm at small cost and use this to compute both an error bound in the usual infinity norm, and a componentwise relative error bound.},
  gt-role =      {Assisted in software development, experimental methods, writing, and editing.},
  file =         {material/gesvxx.pdf},
  projtag =      {lapack, ieee754},
  keywords = {lapack, ieee754, floating point, linear algebra},
  ejr-proj = {linear-algebra, floating-point}
}