@STRING{jan = "January"}
@STRING{feb = "February"}
@STRING{mar = "March"}
@STRING{apr = "April"}
@STRING{may = "May"}
@STRING{jun = "June"}
@STRING{jul = "July"}
@STRING{aug = "August"}
@STRING{sep = "September"}
@STRING{oct = "October"}
@STRING{nov = "November"}
@STRING{dec = "December"}

@misc{abdelfattah2024interfacesparselinearalgebra,
  title =        {Interface for Sparse Linear Algebra Operations},
  author =       {Ahmad Abdelfattah and Willow Ahrens and Hartwig Anzt and Chris
                  Armstrong and Ben Brock and Aydin Buluc and Federico Busato
                  and Terry Cojean and Tim Davis and Jim Demmel and Grace Dinh
                  and David Gardener and Jan Fiala and Mark Gates and Azzam
                  Haider and Toshiyuki Imamura and Pedro Valero Lara and Jose
                  Moreira and Sherry Li and Piotr Luszczek and Max Melichenko
                  and Jose Moeira and Yvan Mokwinski and Riley Murray and
                  Spencer Patty and Slaven Peles and Tobias Ribizel and Jason
                  Riedy and Siva Rajamanickam and Piyush Sao and Manu Shantharam
                  and Keita Teranishi and Stan Tomov and Yu-Hsiang Tsai and
                  Heiko Weichelt},
  year =         2024,
  eprint =       {2411.13259},
  archivePrefix ={arXiv},
  primaryClass = {cs.MS},
  url =          {https://arxiv.org/abs/2411.13259},
  abstract =     {The standardization of an interface for dense linear algebra
                  operations in the BLAS standard has enabled interoperability
                  between different linear algebra libraries, thereby boosting
                  the success of scientific computing, in particular in
                  scientific HPC. Despite numerous efforts in the past, the
                  community has not yet agreed on a standardization for sparse
                  linear algebra operations due to numerous reasons. One is the
                  fact that sparse linear algebra objects allow for many
                  different storage formats, and different hardware may favor
                  different storage formats. This makes the definition of a
                  FORTRAN-style all-circumventing interface extremely
                  challenging. Another reason is that opposed to dense linear
                  algebra functionality, in sparse linear algebra, the size of
                  the sparse data structure for the operation result is not
                  always known prior to the information. Furthermore, as opposed
                  to the standardization effort for dense linear algebra, we are
                  late in the technology readiness cycle, and many
                  production-ready software libraries using sparse linear
                  algebra routines have implemented and committed to their own
                  sparse BLAS interface. At the same time, there exists a demand
                  for standardization that would improve interoperability, and
                  sustainability, and allow for easier integration of building
                  blocks. In an inclusive, cross-institutional effort involving
                  numerous academic institutions, US National Labs, and
                  industry, we spent two years designing a hardware-portable
                  interface for basic sparse linear algebra functionality that
                  serves the user needs and is compatible with the different
                  interfaces currently used by different vendors. In this paper,
                  we present a C++ API for sparse linear algebra functionality,
                  discuss the design choices, and detail how software developers
                  preserve a lot of freedom in terms of how to implement
                  functionality behind this API.}
}

@article{smith-2022-concur-graph,
  author =       {Smith, Emory and Kuntz, Shannon and Riedy,
                  Jason and Deneroff, Martin},
  title =        {Concurrent Graph Queries on the Lucata
                  Pathfinder},
  journal =      {CoRR},
  year =         2022,
  url =          {http://arxiv.org/abs/2209.11889v1},
  abstract =     {High-performance analysis of unstructured
                  data like graphs now is critical for
                  applications ranging from business
                  intelligence to genome analysis. Towards
                  this, data centers hold large graphs in
                  memory to serve multiple concurrent queries
                  from different users. Even a single analysis
                  often explores multiple options. Current
                  computing architectures often are not the
                  most time- or energy-efficient solutions.
                  The novel Lucata Pathfinder architecture
                  tackles this problem, combining migratory
                  threads for low-latency reading with
                  memory-side processing for high-performance
                  accumulation. One hundred to 750 concurrent
                  breadth-first searches (BFS) all achieve
                  end-to-end speed-ups of 81 \% to 97 \% over
                  one-at-a-time queries on a graph with 522M
                  edges. Comparing to RedisGraph running on a
                  large Intel-based server, the Pathfinder
                  achieves a 19$\times$ speed-up running 128
                  BFS queries concurrently. The Pathfinder
                  also efficiently supports a mix of
                  concurrent analyses, demonstrated with
                  connected components and BFS.},
  archivePrefix ={arXiv},
  eprint =       {2209.11889},
  primaryClass = {cs.DC},
}

@article{demmel-2022-propos-consis,
  author =       {Demmel, James and Dongarra, Jack and Gates,
                  Mark and Henry, Greg and Langou, Julien and
                  Li, Xiaoye and Luszczek, Piotr and Pereira,
                  Weslley and Riedy, Jason and
                  Rubio-González, Cindy},
  title =        {Proposed Consistent Exception Handling for
                  the {BLAS} and {LAPACK}},
  journal =      {CoRR},
  year =         2022,
  url =          {http://arxiv.org/abs/2207.09281v1},
  abstract =     {Numerical exceptions, which may be caused by
                  overflow, operations like division by 0 or
                  sqrt(-1), or convergence failures, are
                  unavoidable in many cases, in particular
                  when software is used on unforeseen and
                  difficult inputs. As more aspects of society
                  become automated, e.g., self-driving cars,
                  health monitors, and cyber-physical systems
                  more generally, it is becoming increasingly
                  important to design software that is
                  resilient to exceptions, and that responds
                  to them in a consistent way. Consistency is
                  needed to allow users to build higher-level
                  software that is also resilient and
                  consistent (and so on recursively). In this
                  paper we explore the design space of
                  consistent exception handling for the widely
                  used BLAS and LAPACK linear algebra
                  libraries, pointing out a variety of
                  instances of inconsistent exception handling
                  in the current versions, and propose a new
                  design that balances consistency,
                  complexity, ease of use, and performance.
                  Some compromises are needed, because there
                  are preexisting inconsistencies that are
                  outside our control, including in or between
                  existing vendor BLAS implementations,
                  different programming languages, and even
                  compilers for the same programming language.
                  And user requests from our surveys are quite
                  diverse. We also propose our design as a
                  possible model for other numerical software,
                  and welcome comments on our design choices.},
  archivePrefix ={arXiv},
  eprint =       {2207.09281},
  primaryClass = {cs.MS},
}

@TechReport{ieee754-2019,
  note =         {(committee member and contributor)},
  author = {{IEEE 754 Committee}},
  key =          {IEEE Std 754-2019},
  journal =      {IEEE Std 754-2019},
  type =         {IEEE Std},
  number =       {754-2019},
  title =        {{IEEE} Standard for Floating-Point Arithmetic},
  year =         2019,
  pages =        {1 -- 83},
  institution =  {Microprocessor Standards Committee of the IEEE Computer Society},
  OPTabstract =     {This standard specifies interchange and arithmetic formats and methods for binary and decimal floating-point arithmetic in computer programming environments. This standard specifies exception conditions and their default handling. An implementation of a floating-point system conforming to this standard may be realized entirely in software, entirely in hardware, or in any combination of software and hardware. For operations specified in the normative part of this standard, numerical results and exceptions are uniquely determined by the values of the input data, sequence of operations, and destination formats, all under user control.},
  keywords =     {IEEE standards;floating point arithmetic;programming;IEEE standard;arithmetic formats;computer programming;decimal floating-point arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal},
  OPTdoi =          {10.1109/IEEESTD.2008.4610935},
  isbn =         {978-1-5044-5897-9},
  address =      {New York, NY},
  OPTmonth =        aug,
  OPTdom =          29,
  projtag =      {ieee754},
  url = { https://ieeexplore.ieee.org/servlet/opac?punumber=8739148},
}

@article{DBLP:journals/corr/abs-1901-02775,
  author    = {Eric R. Hein and
               Srinivas Eswar and
               Abdurrahman Yasar and
               Jiajia Li and
               Jeffrey S. Young and
               Thomas M. Conte and
               {\"{U}}mit V. {\c{C}}ataly{\"{u}}rek and
               Rich Vuduc and
               E. Jason Riedy and
               Bora U{\c{c}}ar},
  title     = {Programming Strategies for Irregular Algorithms on the {Emu} {Chick}},
  journal   = {CoRR},
  volume    = {abs/1901.02775},
  year      = {2019},
  url       = {http://arxiv.org/abs/1901.02775},
  archivePrefix = {arXiv},
  OPTeprint    = {1901.02775},
  timestamp = {Sat, 02 Feb 2019 12:19:05 +0100},
  abstract =     {The Emu Chick prototype implements migratory
                  memory-side processing in a novel hardware
                  system. Rather than transferring large amounts of
                  data across the system interconnect, the Emu Chick
                  moves lightweight thread contexts to near-memory
                  cores before the beginning of each remote memory
                  read. Previous work has characterized the
                  performance of the Chick prototype in terms of
                  memory bandwidth and programming differences from
                  more typical, non-migratory platforms, but there has
                  not yet been an analysis of algorithms on this
                  system.  This work evaluates irregular algorithms
                  that could benefit from the lightweight, memory-side
                  processing of the Chick and demonstrates techniques
                  and optimization strategies for achieving
                  performance in sparse matrix-vector multiply
                  operation (SpMV), breadth-first search (BFS), and
                  graph alignment across up to eight distributed nodes
                  encompassing 64 nodelets in the Chick system. We
                  also define and justify relative metrics to compare
                  prototype FPGA-based hardware with established ASIC
                  architectures. The Chick currently supports up to
                  68x scaling for graph alignment, 80 MTEPS for BFS on
                  balanced graphs, and 50\ \% of measured STREAM
                  bandwidth for SpMV.},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1901-02775},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}


@article{DBLP:journals/corr/abs-1811-03743,
  author    = {Patrick Lavin and
               E. Jason Riedy and
               Rich Vuduc and
               Jeffrey Young},
  title     = {Spatter: {A} Benchmark Suite for Evaluating Sparse Access Patterns},
  journal   = {CoRR},
  volume    = {abs/1811.03743},
  year      = {2018},
  url       = {http://arxiv.org/abs/1811.03743},
  archivePrefix = {arXiv},
  OPTeprint    = {1811.03743},
  timestamp = {Fri, 23 Nov 2018 12:43:51 +0100},
  abstract =     {Recent characterizations of data movement performance have evaluated optimizations for dense and blocked accesses used by accelerators like GPUs and Xeon Phi, but sparse access patterns like scatter and gather are still not well understood across current and emerging architectures. We propose a tunable benchmark suite, Spatter, that allows users to characterize scatter, gather, and related sparse access patterns at a low level across multiple backends, including CUDA, OpenCL, and OpenMP. Spatter also allows users to vary the block size and amount of data that is moved to create a more comprehensive picture of sparse access patterns and to model patterns that are found in real applications.  With Spatter we aim to characterize the performance of memory systems in a novel way by evaluating how the density of accesses compares against real-world effective memory bandwidths (measured by STREAM) and how it can be compared across widely varying architectures including GPUs and x86, ARM, and Power CPUs. We demonstrate how Spatter can be used to generate analysis plots comparing different architectures and show that current GPU systems achieve up to 65\% of STREAM bandwidth for sparse accesses and are more energy efficient in doing so for several different sparsity patterns. Our future plans for the spatter benchmark are to use these results to predict the impact of new memory access primitives on various architectures, develop backends for novel hardware like FPGAs and the Emu Chick, and automate testing so that users can perform their own sparse access studies.},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1811-03743},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}


@article{DBLP:journals/corr/abs-1809-07696,
  author    = {Jeffrey Young and
               Eric R. Hein and
               Srinivas Eswar and
               Patrick Lavin and
               Jiajia Li and
               E. Jason Riedy and
               Richard W. Vuduc and
               Tom Conte},
  title     = {A Microbenchmark Characterization of the {Emu} {Chick}},
  journal   = {CoRR},
  volume    = {abs/1809.07696},
  year      = {2018},
  url       = {http://arxiv.org/abs/1809.07696},
  archivePrefix = {arXiv},
  OPTeprint    = {1809.07696},
  timestamp = {Sat, 02 Feb 2019 12:19:08 +0100},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1809-07696},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-1808-06334,
  author    = {Will Powell and
               E. Jason Riedy and
               Jeffrey S. Young and
               Thomas M. Conte},
  title     = {Wrangling Rogues: Managing Experimental Post-Moore Architectures},
  journal   = {CoRR},
  volume    = {abs/1808.06334},
  year      = {2018},
  url       = {http://arxiv.org/abs/1808.06334},
  archivePrefix = {arXiv},
  OPTeprint    = {1808.06334},
  timestamp = {Sun, 02 Sep 2018 15:01:54 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1808-06334},
  abstract =     {The Rogues Gallery is a new experimental testbed that is focused on tackling "rogue" architectures for the Post-Moore era of computing. While some of these devices have roots in the embedded and high-performance computing spaces, managing current and emerging technologies provides a challenge for system administration that are not always foreseen in traditional data center environments.  We present an overview of the motivations and design of the initial Rogues Gallery testbed and cover some of the unique challenges that we have seen and foresee with upcoming hardware prototypes for future post-Moore research. Specifically, we cover the networking, identity management, scheduling of resources, and tools and sensor access aspects of the Rogues Gallery and techniques we have developed to manage these new platforms.},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/DukhanVR16,
  author    = {Marat Dukhan and
               Richard W. Vuduc and
               E. Jason Riedy},
  title     = {Wanted: Floating-Point Add Round-off Error instruction},
  journal   = {CoRR},
  volume    = {abs/1603.00491},
  year      = {2016},
  url       = {http://arxiv.org/abs/1603.00491},
  archivePrefix = {arXiv},
  eprint    = {1603.00491},
  timestamp = {Mon, 13 Aug 2018 16:48:45 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/DukhanVR16},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{Holder:2016:CFC:2980765.2980770,
  author =       {Holder, Lawrence B. and Caceres, Rajmonda and Gleich, David F. and Riedy, Jason and Khan, Maleq and Chawla, Nitesh V. and Kumar, Ravi and Wu, Yinghui and Klymko, Christine and Eliassi-Rad, Tina and Prakash, Aditya},
  title =        {Current and Future Challenges in Mining Large Networks: Report on the Second SDM Workshop on Mining Networks and Graphs},
  journal =      {SIGKDD Explorations Newsletter},
  issue_date =   {June 2016},
  volume =       18,
  number =       1,
  month =        aug,
  year =         2016,
  issn =         {1931-0145},
  pages =        {39--45},
  numpages =     7,
  doi =          {10.1145/2980765.2980770},
  acmid =        2980770,
  publisher =    {ACM},
  address =      {New York, NY, USA},
  keywords =     {Network mining, big data, challenges, graph mining},
  projtag =      {grateful, hpda}
}


@article{DBLP:journals/corr/SwensonSPPRBV13,
  author    = {Shel Swenson and
               Yogesh Simmhan and
               Viktor K. Prasanna and
               Manish Parashar and
               E. Jason Riedy and
               David A. Bader and
               Richard W. Vuduc},
  title     = {Sustainable Software Development for Next-Gen Sequencing {(NGS)} Bioinformatics
               on Emerging Platforms},
  journal   = {CoRR},
  volume    = {abs/1309.1828},
  year      = {2013},
  url       = {http://arxiv.org/abs/1309.1828},
  archivePrefix = {arXiv},
  OPTeprint    = {1309.1828},
  timestamp = {Mon, 13 Aug 2018 16:49:01 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/SwensonSPPRBV13},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@TechReport{seed-set-tr,
  author =       {Riedy, Jason and Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa},
  ejr-withauthor ={Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa},
  title =        {Detecting Communities from Given Seeds in Social Networks},
  institution =  {Georgia Institute of Technology},
  year =         2011,
  number =       {GT-CSE-11-01},
  month =        feb,
  dom =          22,
  file =         {material/GT-CSE-11-01.pdf},
  url =          {http://hdl.handle.net/1853/36980},
  role =         {techreport},
  abstract =     {Analyzing massive social networks challenges both high-performance computers and human understanding. These massive networks cannot be visualized easily, and their scale makes applying complex analysis methods computationally expensive. We present a region-growing method for finding a smaller, more tractable subgraph, a community, given a few example seed vertices. Unlike existing work, we focus on a small number of seed vertices, from two to a few dozen. We also present the first comparison between five algorithms for expanding a small seed set into a community. Our comparison applies these algorithms to an R-MAT generated graph component with 240 thousand vertices and 32 million edges and evaluates the community size, modularity, Kullback-Leibler divergence, conductance, and clustering coefficient. We find that our new algorithm with a local modularity maximizing heuristic based on Clauset, Newman, and Moore performs very well when the output is limited to 100 or 1000 vertices. When run without a vertex size limit, a heuristic from McCloskey and Bader generates communities containing around 60\% of the graph's vertices and having a small conductance and modularity appropriate to the result size. A personalized PageRank algorithm based on Andersen, Lang, and Chung also performs well with respect to our metrics.},
  OPTtags =         {graph; social network},
  projtag =      {cassmt}
}

@TechReport{ieee754-2008,
  note =         {(committee member and contributor)},
  author = {{IEEE 754 Committee}},
  key =          {IEEE Std 754-2008},
  journal =      {IEEE Std 754-2008},
  type =         {IEEE Std},
  number =       {754-2008},
  title =        {{IEEE} Standard for Floating-Point Arithmetic},
  year =         2008,
  pages =        {1 -- 70},
  institution =  {Microprocessor Standards Committee of the IEEE Computer Society},
  abstract =     {This standard specifies interchange and arithmetic formats and methods for binary and decimal floating-point arithmetic in computer programming environments. This standard specifies exception conditions and their default handling. An implementation of a floating-point system conforming to this standard may be realized entirely in software, entirely in hardware, or in any combination of software and hardware. For operations specified in the normative part of this standard, numerical results and exceptions are uniquely determined by the values of the input data, sequence of operations, and destination formats, all under user control.},
  keywords =     {IEEE standards;floating point arithmetic;programming;IEEE standard;arithmetic formats;computer programming;decimal floating-point arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal},
  doi =          {10.1109/IEEESTD.2008.4610935},
  isbn =         {978-0-7381-5753-5},
  address =      {New York, NY},
  month =        aug,
  dom =          29,
  committee =    {Alex Aiken and Matthew Applegate and David Bailey and Steve Bass and Dileep Bhandarkar and Mahesh Bhat and David Bindel and Sylvie Boldo and Stephen Canon and Steven R. Carlough and Marius Cornea and Mike Cowlishaw (editor) and John H. Crawford and Joseph D. Darcy and Marc Daumas and Bob Davis and Mark Davis and Dick Delp and Jim Demmel and Mark A. Erle and Hossam A. H. Fahmy and J.P. Fasano and Richard Fateman and Eric Feng and Warren E. Ferguson and Alex Fit-Florea and Laurent Fournier and Chip Freitag and Ivan Godard and Roger A. Golliver and David Gustafson and Michel Hack and John R. Harrison and John Hauser and Yozo Hida and Chris N. Hinds and Graydon Hoare and David G. Hough and Jerry Huck and Jim Hull and Michael Ingrassia and David V. James and Rick James and William Kahan and John Kapernick and Richard Karpinski and Jeff Kidder and Plamen Koev and Ren-Cang Li and Zhishun Alex Liu and Raymond Mak and Peter Markstein and David Matula and Guillaume Melquiond and Nobuyoshi Mori and Ricardo Morin and Ned Nedialkov and Craig Nelson and Stuart Oberman and Jon Okada and Ian Ollmann and Michael Parks and Tom Pittman and Eric Postpischil and Jason Riedy and Debjit Das Sarma and Eric M. Schwarz and David Scott and Don Senzig and Ilya Sharapov and Jim Shearer and Michael Siu and Ron Smith and Chuck Stevens and Peter Tang and Pamela J. Taylor and James W. Thomas and Brandon Thompson and Wendy Thrash and Neil Toda and Son Dao Trong and Leonard Tsai and Charles Tsen and Fred Tydeman and Liang Kai Wang and Scott Westbrook and Steve Winkler and Anthony Wood and Umit Yalcinalp and Fred Zemke and Paul Zimmermann and Dan Zuras (chair)},
  projtag =      {ieee754},
}

@TechReport{nonneg-house-lawn,
  author =       {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida and E. Jason Riedy},
  ejr-withauthor ={James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
  title =        {Non-Negative Diagonals and High Performance on Low-Profile Matrices from {Householder} {$QR$}},
  institution =  {Netlib},
  year =         2008,
  type =         {LAPACK Working Note},
  number =       203,
  lawn =         203,
  month =        may,
  dom =          30,
  note =         {Also issued as UCB/EECS-2008-76; modified from SISC version.},
  role =         {techreport},
  OPTtags =         {lawn; lapack; householder; qr},
  url =          {http://www.netlib.org/lapack/lawnspdf/lawn203.pdf},
  other-url =    {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-76.html},
  file =         {material/lawn203.pdf},
  abstract =     {The Householder reflections used in LAPACK's $QR$ factorization leave positive and negative real entries along $R$'s diagonal. This is sufficient for most applications of $QR$ factorizations, but a few require that $R$ have a nonnegative diagonal. This note describes a new Householder generation routine to produce a nonnegative diagonal. Additionally, we find that scanning for trailing zeros in the generated reflections leads to large performance improvements when applying reflections with many trailing zeros. Factoring low-profile matrices, those with nonzero entries mostly near the diagonal (e.g., band matrices), now require far fewer operations. For example, $QR$ factorization of matrices with profile width b that are stored densely in an $n\times n$ matrix improves from $O(n^3)$ to $O(n^2+nb^2)$. These routines are in LAPACK 3.2.},
  projtag =      {lapack}
}

@TechReport{lawn188,
  author =       {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy},
  ejr-withauthor ={James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title =        {Extra-precise iterative refinement for overdetermined least squares problems},
  type =         {LAPACK Working Note},
  institution =  {Netlib},
  year =         2007,
  number =       188,
  month =        may,
  dom =          31,
  url =          {http://www.netlib.org/lapack/lawnspdf/lawn188.pdf},
  file =         {material/lawn188.pdf},
  other-url =    {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  note =         {Also issued as UCB/EECS-2007-77; version accepted for TOMS.},
  abstract =     {We present the algorithm, error bounds, and numerical results for extra-precise iterative refinement applied to overdetermined linear least squares (LLS) problems. We apply our linear system refinement algorithm to Bj{\"o}rck’s augmented linear system formulation of an LLS problem. Our algorithm reduces the forward normwise and componentwise errors to $O(\varepsilon)$ unless the system is too ill conditioned. In contrast to linear systems, we provide two separate error bounds for the solution $x$ and the residual $r$. The refinement algorithm requires only limited use of extra precision and adds only $O(mn)$ work to the $O(mn^2)$ cost of QR factorization for problems of size m-by-n. The extra precision calculation is facilitated by the new extended-precision BLAS standard in a portable way, and the refinement algorithm will be included in a future release of LAPACK and can be extended to the other types of least squares problems.},
  role =         {techreport},
  OPTtags =         {lawn; lapack; least squares; floating point},
  projtag =      {lapack, ieee754}
}

@TechReport{lapack-prospectus-lawn,
  author =       {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  ejr-withauthor ={James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  title =        {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries},
  institution =  {Netlib},
  year =         2007,
  type =         {LAPACK Working Note},
  number =       181,
  lawn =         181,
  month =        feb,
  note =         {Also issued as UT-CS-07-592},
  role =         {techreport},
  OPTtags =         {lawn; lapack},
  url =          {http://www.netlib.org/lapack/lawnspdf/lawn181.pdf},
  file =         {material/lawn181.pdf},
  projtag =      {lapack}
}

@TechReport{tridiag-lawn,
  author =       {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
  ejr-withauthor ={Osni A. Marques and Christof V{\"o}mel},
  title =        {Benefits of {IEEE-754} Features in Modern Symmetric Tridiagonal Eigensolvers},
  type =         {LAPACK Working Note},
  number =       172,
  lawn =         172,
  institution =  {Netlib},
  month =        sep,
  year =         2005,
  dom =          30,
  note =         {Also issued as UCB//CSD-05-1414; expanded from SISC version},
  URL =          {http://www.netlib.org/lapack/lawnspdf/lawn172.pdf},
  other-url =    {http://www.eecs.berkeley.edu/Pubs/TechRpts/2005/6514.html},
  role =         {techreport},
  OPTtags =         {lawn; lapack; floating point; ieee754; eigenvalue},
  file =         {material/lawn172.pdf},
  abstract =     {Bisection is one of the most common methods used to compute the eigenvalues of symmetric tridiagonal matrices. Bisection relies on the Sturm count: For a given shift sigma, the number of negative pivots in the factorization $T - \sigma I = LDL^T$ equals the number of eigenvalues of T that are smaller than sigma. In IEEE-754 arithmetic, the value $\infty$ permits the computation to continue past a zero pivot, producing a correct Sturm count when $T$ is unreduced. Demmel and Li showed [IEEE Trans. Comput., 43 (1994), pp. 983–992] that using $\infty$ rather than testing for zero pivots within the loop could significantly improve performance on certain architectures. When eigenvalues are to be computed to high relative accuracy, it is often preferable to work with $LDL^T$ factorizations instead of the original tridiagonal $T$. One important example is the MRRR algorithm. When bisection is applied to the factored matrix, the Sturm count is computed from $LDL^T$ which makes differential stationary and progressive qds algorithms the methods of choice. While it seems trivial to replace $T$ by $LDL^T$, in reality these algorithms are more complicated: In IEEE-754 arithmetic, a zero pivot produces an overflow followed by an invalid exception (NaN, or ``Not a Number'') that renders the Sturm count incorrect. We present alternative, safe formulations that are guaranteed to produce the correct result. Benchmarking these algorithms on a variety of platforms shows that the original formulation without tests is always faster provided that no exception occurs. The transforms see speed-ups of up to 2.6x over the careful formulations. Tests on industrial matrices show that encountering exceptions in practice is rare. This leads to the following design: First, compute the Sturm count by the fast but unsafe algorithm. Then, if an exception occurs, recompute the count by a safe, slower alternative. The new Sturm count algorithms improve the speed of bisection by up to 2x on our test matrices. Furthermore, unlike the traditional tiny-pivot substitution, proper use of IEEE-754 features provides a careful formulation that imposes no input range restrictions.},
  projtag =      {lapack, ieee754}
}

@TechReport{axb-itref-lawn,
  author =       {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor ={James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee},
  title =        {Error bounds from extra-precise iterative refinement},
  type =         {LAPACK Working Note},
  institution =  {Netlib},
  year =         2005,
  number =       165,
  lawn =         165,
  month =        feb,
  dom =          3,
  role =         {techreport},
  OPTtags =         {lawn; lapack; linear algebra; floating point},
  other-url =    {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  note =         {Also issued as UCB//CSD-05-1414, UT-CS-05-547, and LBNL-56965; expanded from TOMS version},
  url =          {http://www.netlib.org/lapack/lawnspdf/lawn165.pdf},
  file =         {material/lawn165.pdf},
  abstract =     {We present the design and testing of an algorithm for iterative refinement of the solution of linear equations, where the residual is computed with extra precision. This algorithm was originally proposed in the 1960s [6, 22] as a means to compute very accurate solutions to all but the most ill-conditioned linear systems of equations. However two obstacles have until now prevented its adoption in standard subroutine libraries like LAPACK: (1) There was no standard way to access the higher precision arithmetic needed to compute residuals, and (2) it was unclear how to compute a reliable error bound for the computed solution. The completion of the new BLAS Technical Forum Standard [5] has recently removed the first obstacle. To overcome the second obstacle, we show how a single application of iterative refinement can be used to compute an error bound in any norm at small cost, and use this to compute both an error bound in the usual infinity norm, and a componentwise relative error bound.  We report extensive test results on over 6.2 million matrices of dimension 5, 10, 100, and 1000.  As long as a normwise (resp. componentwise) condition number computed by the algorithm is less than $1 / \operatorname{max}\{10, \sqrt{n}\}\varepsilon_w$ , the computed normwise (resp. componentwise) error bound is at most $2 \operatorname{max}\{10, \sqrt{n}\} \cdot \varepsilon_w$ , and indeed bounds the true error. Here, $n$ is the matrix dimension and $\varepsilon_w$ is single precision roundoff error. For worse conditioned problems, we get similarly small correct error bounds in over 89.4\% of cases.},
  projtag =      {lapack, ieee754}
}