@STRING{jan = "January"} @STRING{feb = "February"} @STRING{mar = "March"} @STRING{apr = "April"} @STRING{may = "May"} @STRING{jun = "June"} @STRING{jul = "July"} @STRING{aug = "August"} @STRING{sep = "September"} @STRING{oct = "October"} @STRING{nov = "November"} @STRING{dec = "December"} @misc{abdelfattah2024interfacesparselinearalgebra, title = {Interface for Sparse Linear Algebra Operations}, author = {Ahmad Abdelfattah and Willow Ahrens and Hartwig Anzt and Chris Armstrong and Ben Brock and Aydin Buluc and Federico Busato and Terry Cojean and Tim Davis and Jim Demmel and Grace Dinh and David Gardener and Jan Fiala and Mark Gates and Azzam Haider and Toshiyuki Imamura and Pedro Valero Lara and Jose Moreira and Sherry Li and Piotr Luszczek and Max Melichenko and Jose Moeira and Yvan Mokwinski and Riley Murray and Spencer Patty and Slaven Peles and Tobias Ribizel and Jason Riedy and Siva Rajamanickam and Piyush Sao and Manu Shantharam and Keita Teranishi and Stan Tomov and Yu-Hsiang Tsai and Heiko Weichelt}, year = 2024, eprint = {2411.13259}, archivePrefix ={arXiv}, primaryClass = {cs.MS}, url = {https://arxiv.org/abs/2411.13259}, abstract = {The standardization of an interface for dense linear algebra operations in the BLAS standard has enabled interoperability between different linear algebra libraries, thereby boosting the success of scientific computing, in particular in scientific HPC. Despite numerous efforts in the past, the community has not yet agreed on a standardization for sparse linear algebra operations due to numerous reasons. One is the fact that sparse linear algebra objects allow for many different storage formats, and different hardware may favor different storage formats. This makes the definition of a FORTRAN-style all-circumventing interface extremely challenging. Another reason is that opposed to dense linear algebra functionality, in sparse linear algebra, the size of the sparse data structure for the operation result is not always known prior to the information. Furthermore, as opposed to the standardization effort for dense linear algebra, we are late in the technology readiness cycle, and many production-ready software libraries using sparse linear algebra routines have implemented and committed to their own sparse BLAS interface. At the same time, there exists a demand for standardization that would improve interoperability, and sustainability, and allow for easier integration of building blocks. In an inclusive, cross-institutional effort involving numerous academic institutions, US National Labs, and industry, we spent two years designing a hardware-portable interface for basic sparse linear algebra functionality that serves the user needs and is compatible with the different interfaces currently used by different vendors. In this paper, we present a C++ API for sparse linear algebra functionality, discuss the design choices, and detail how software developers preserve a lot of freedom in terms of how to implement functionality behind this API.} } @article{smith-2022-concur-graph, author = {Smith, Emory and Kuntz, Shannon and Riedy, Jason and Deneroff, Martin}, title = {Concurrent Graph Queries on the Lucata Pathfinder}, journal = {CoRR}, year = 2022, url = {http://arxiv.org/abs/2209.11889v1}, abstract = {High-performance analysis of unstructured data like graphs now is critical for applications ranging from business intelligence to genome analysis. Towards this, data centers hold large graphs in memory to serve multiple concurrent queries from different users. Even a single analysis often explores multiple options. Current computing architectures often are not the most time- or energy-efficient solutions. The novel Lucata Pathfinder architecture tackles this problem, combining migratory threads for low-latency reading with memory-side processing for high-performance accumulation. One hundred to 750 concurrent breadth-first searches (BFS) all achieve end-to-end speed-ups of 81 \% to 97 \% over one-at-a-time queries on a graph with 522M edges. Comparing to RedisGraph running on a large Intel-based server, the Pathfinder achieves a 19$\times$ speed-up running 128 BFS queries concurrently. The Pathfinder also efficiently supports a mix of concurrent analyses, demonstrated with connected components and BFS.}, archivePrefix ={arXiv}, eprint = {2209.11889}, primaryClass = {cs.DC}, } @article{demmel-2022-propos-consis, author = {Demmel, James and Dongarra, Jack and Gates, Mark and Henry, Greg and Langou, Julien and Li, Xiaoye and Luszczek, Piotr and Pereira, Weslley and Riedy, Jason and Rubio-González, Cindy}, title = {Proposed Consistent Exception Handling for the {BLAS} and {LAPACK}}, journal = {CoRR}, year = 2022, url = {http://arxiv.org/abs/2207.09281v1}, abstract = {Numerical exceptions, which may be caused by overflow, operations like division by 0 or sqrt(-1), or convergence failures, are unavoidable in many cases, in particular when software is used on unforeseen and difficult inputs. As more aspects of society become automated, e.g., self-driving cars, health monitors, and cyber-physical systems more generally, it is becoming increasingly important to design software that is resilient to exceptions, and that responds to them in a consistent way. Consistency is needed to allow users to build higher-level software that is also resilient and consistent (and so on recursively). In this paper we explore the design space of consistent exception handling for the widely used BLAS and LAPACK linear algebra libraries, pointing out a variety of instances of inconsistent exception handling in the current versions, and propose a new design that balances consistency, complexity, ease of use, and performance. Some compromises are needed, because there are preexisting inconsistencies that are outside our control, including in or between existing vendor BLAS implementations, different programming languages, and even compilers for the same programming language. And user requests from our surveys are quite diverse. We also propose our design as a possible model for other numerical software, and welcome comments on our design choices.}, archivePrefix ={arXiv}, eprint = {2207.09281}, primaryClass = {cs.MS}, } @TechReport{ieee754-2019, note = {(committee member and contributor)}, author = {{IEEE 754 Committee}}, key = {IEEE Std 754-2019}, journal = {IEEE Std 754-2019}, type = {IEEE Std}, number = {754-2019}, title = {{IEEE} Standard for Floating-Point Arithmetic}, year = 2019, pages = {1 -- 83}, institution = {Microprocessor Standards Committee of the IEEE Computer Society}, OPTabstract = {This standard specifies interchange and arithmetic formats and methods for binary and decimal floating-point arithmetic in computer programming environments. This standard specifies exception conditions and their default handling. An implementation of a floating-point system conforming to this standard may be realized entirely in software, entirely in hardware, or in any combination of software and hardware. For operations specified in the normative part of this standard, numerical results and exceptions are uniquely determined by the values of the input data, sequence of operations, and destination formats, all under user control.}, keywords = {IEEE standards;floating point arithmetic;programming;IEEE standard;arithmetic formats;computer programming;decimal floating-point arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal}, OPTdoi = {10.1109/IEEESTD.2008.4610935}, isbn = {978-1-5044-5897-9}, address = {New York, NY}, OPTmonth = aug, OPTdom = 29, projtag = {ieee754}, url = { https://ieeexplore.ieee.org/servlet/opac?punumber=8739148}, } @article{DBLP:journals/corr/abs-1901-02775, author = {Eric R. Hein and Srinivas Eswar and Abdurrahman Yasar and Jiajia Li and Jeffrey S. Young and Thomas M. Conte and {\"{U}}mit V. {\c{C}}ataly{\"{u}}rek and Rich Vuduc and E. Jason Riedy and Bora U{\c{c}}ar}, title = {Programming Strategies for Irregular Algorithms on the {Emu} {Chick}}, journal = {CoRR}, volume = {abs/1901.02775}, year = {2019}, url = {http://arxiv.org/abs/1901.02775}, archivePrefix = {arXiv}, OPTeprint = {1901.02775}, timestamp = {Sat, 02 Feb 2019 12:19:05 +0100}, abstract = {The Emu Chick prototype implements migratory memory-side processing in a novel hardware system. Rather than transferring large amounts of data across the system interconnect, the Emu Chick moves lightweight thread contexts to near-memory cores before the beginning of each remote memory read. Previous work has characterized the performance of the Chick prototype in terms of memory bandwidth and programming differences from more typical, non-migratory platforms, but there has not yet been an analysis of algorithms on this system. This work evaluates irregular algorithms that could benefit from the lightweight, memory-side processing of the Chick and demonstrates techniques and optimization strategies for achieving performance in sparse matrix-vector multiply operation (SpMV), breadth-first search (BFS), and graph alignment across up to eight distributed nodes encompassing 64 nodelets in the Chick system. We also define and justify relative metrics to compare prototype FPGA-based hardware with established ASIC architectures. The Chick currently supports up to 68x scaling for graph alignment, 80 MTEPS for BFS on balanced graphs, and 50\ \% of measured STREAM bandwidth for SpMV.}, biburl = {https://dblp.org/rec/bib/journals/corr/abs-1901-02775}, bibsource = {dblp computer science bibliography, https://dblp.org} } @article{DBLP:journals/corr/abs-1811-03743, author = {Patrick Lavin and E. Jason Riedy and Rich Vuduc and Jeffrey Young}, title = {Spatter: {A} Benchmark Suite for Evaluating Sparse Access Patterns}, journal = {CoRR}, volume = {abs/1811.03743}, year = {2018}, url = {http://arxiv.org/abs/1811.03743}, archivePrefix = {arXiv}, OPTeprint = {1811.03743}, timestamp = {Fri, 23 Nov 2018 12:43:51 +0100}, abstract = {Recent characterizations of data movement performance have evaluated optimizations for dense and blocked accesses used by accelerators like GPUs and Xeon Phi, but sparse access patterns like scatter and gather are still not well understood across current and emerging architectures. We propose a tunable benchmark suite, Spatter, that allows users to characterize scatter, gather, and related sparse access patterns at a low level across multiple backends, including CUDA, OpenCL, and OpenMP. Spatter also allows users to vary the block size and amount of data that is moved to create a more comprehensive picture of sparse access patterns and to model patterns that are found in real applications. With Spatter we aim to characterize the performance of memory systems in a novel way by evaluating how the density of accesses compares against real-world effective memory bandwidths (measured by STREAM) and how it can be compared across widely varying architectures including GPUs and x86, ARM, and Power CPUs. We demonstrate how Spatter can be used to generate analysis plots comparing different architectures and show that current GPU systems achieve up to 65\% of STREAM bandwidth for sparse accesses and are more energy efficient in doing so for several different sparsity patterns. Our future plans for the spatter benchmark are to use these results to predict the impact of new memory access primitives on various architectures, develop backends for novel hardware like FPGAs and the Emu Chick, and automate testing so that users can perform their own sparse access studies.}, biburl = {https://dblp.org/rec/bib/journals/corr/abs-1811-03743}, bibsource = {dblp computer science bibliography, https://dblp.org} } @article{DBLP:journals/corr/abs-1809-07696, author = {Jeffrey Young and Eric R. Hein and Srinivas Eswar and Patrick Lavin and Jiajia Li and E. Jason Riedy and Richard W. Vuduc and Tom Conte}, title = {A Microbenchmark Characterization of the {Emu} {Chick}}, journal = {CoRR}, volume = {abs/1809.07696}, year = {2018}, url = {http://arxiv.org/abs/1809.07696}, archivePrefix = {arXiv}, OPTeprint = {1809.07696}, timestamp = {Sat, 02 Feb 2019 12:19:08 +0100}, biburl = {https://dblp.org/rec/bib/journals/corr/abs-1809-07696}, bibsource = {dblp computer science bibliography, https://dblp.org} } @article{DBLP:journals/corr/abs-1808-06334, author = {Will Powell and E. Jason Riedy and Jeffrey S. Young and Thomas M. Conte}, title = {Wrangling Rogues: Managing Experimental Post-Moore Architectures}, journal = {CoRR}, volume = {abs/1808.06334}, year = {2018}, url = {http://arxiv.org/abs/1808.06334}, archivePrefix = {arXiv}, OPTeprint = {1808.06334}, timestamp = {Sun, 02 Sep 2018 15:01:54 +0200}, biburl = {https://dblp.org/rec/bib/journals/corr/abs-1808-06334}, abstract = {The Rogues Gallery is a new experimental testbed that is focused on tackling "rogue" architectures for the Post-Moore era of computing. While some of these devices have roots in the embedded and high-performance computing spaces, managing current and emerging technologies provides a challenge for system administration that are not always foreseen in traditional data center environments. We present an overview of the motivations and design of the initial Rogues Gallery testbed and cover some of the unique challenges that we have seen and foresee with upcoming hardware prototypes for future post-Moore research. Specifically, we cover the networking, identity management, scheduling of resources, and tools and sensor access aspects of the Rogues Gallery and techniques we have developed to manage these new platforms.}, bibsource = {dblp computer science bibliography, https://dblp.org} } @article{DBLP:journals/corr/DukhanVR16, author = {Marat Dukhan and Richard W. Vuduc and E. Jason Riedy}, title = {Wanted: Floating-Point Add Round-off Error instruction}, journal = {CoRR}, volume = {abs/1603.00491}, year = {2016}, url = {http://arxiv.org/abs/1603.00491}, archivePrefix = {arXiv}, eprint = {1603.00491}, timestamp = {Mon, 13 Aug 2018 16:48:45 +0200}, biburl = {https://dblp.org/rec/bib/journals/corr/DukhanVR16}, bibsource = {dblp computer science bibliography, https://dblp.org} } @article{Holder:2016:CFC:2980765.2980770, author = {Holder, Lawrence B. and Caceres, Rajmonda and Gleich, David F. and Riedy, Jason and Khan, Maleq and Chawla, Nitesh V. and Kumar, Ravi and Wu, Yinghui and Klymko, Christine and Eliassi-Rad, Tina and Prakash, Aditya}, title = {Current and Future Challenges in Mining Large Networks: Report on the Second SDM Workshop on Mining Networks and Graphs}, journal = {SIGKDD Explorations Newsletter}, issue_date = {June 2016}, volume = 18, number = 1, month = aug, year = 2016, issn = {1931-0145}, pages = {39--45}, numpages = 7, doi = {10.1145/2980765.2980770}, acmid = 2980770, publisher = {ACM}, address = {New York, NY, USA}, keywords = {Network mining, big data, challenges, graph mining}, projtag = {grateful, hpda} } @article{DBLP:journals/corr/SwensonSPPRBV13, author = {Shel Swenson and Yogesh Simmhan and Viktor K. Prasanna and Manish Parashar and E. Jason Riedy and David A. Bader and Richard W. Vuduc}, title = {Sustainable Software Development for Next-Gen Sequencing {(NGS)} Bioinformatics on Emerging Platforms}, journal = {CoRR}, volume = {abs/1309.1828}, year = {2013}, url = {http://arxiv.org/abs/1309.1828}, archivePrefix = {arXiv}, OPTeprint = {1309.1828}, timestamp = {Mon, 13 Aug 2018 16:49:01 +0200}, biburl = {https://dblp.org/rec/bib/journals/corr/SwensonSPPRBV13}, bibsource = {dblp computer science bibliography, https://dblp.org} } @TechReport{seed-set-tr, author = {Riedy, Jason and Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa}, ejr-withauthor ={Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa}, title = {Detecting Communities from Given Seeds in Social Networks}, institution = {Georgia Institute of Technology}, year = 2011, number = {GT-CSE-11-01}, month = feb, dom = 22, file = {material/GT-CSE-11-01.pdf}, url = {http://hdl.handle.net/1853/36980}, role = {techreport}, abstract = {Analyzing massive social networks challenges both high-performance computers and human understanding. These massive networks cannot be visualized easily, and their scale makes applying complex analysis methods computationally expensive. We present a region-growing method for finding a smaller, more tractable subgraph, a community, given a few example seed vertices. Unlike existing work, we focus on a small number of seed vertices, from two to a few dozen. We also present the first comparison between five algorithms for expanding a small seed set into a community. Our comparison applies these algorithms to an R-MAT generated graph component with 240 thousand vertices and 32 million edges and evaluates the community size, modularity, Kullback-Leibler divergence, conductance, and clustering coefficient. We find that our new algorithm with a local modularity maximizing heuristic based on Clauset, Newman, and Moore performs very well when the output is limited to 100 or 1000 vertices. When run without a vertex size limit, a heuristic from McCloskey and Bader generates communities containing around 60\% of the graph's vertices and having a small conductance and modularity appropriate to the result size. A personalized PageRank algorithm based on Andersen, Lang, and Chung also performs well with respect to our metrics.}, OPTtags = {graph; social network}, projtag = {cassmt} } @TechReport{ieee754-2008, note = {(committee member and contributor)}, author = {{IEEE 754 Committee}}, key = {IEEE Std 754-2008}, journal = {IEEE Std 754-2008}, type = {IEEE Std}, number = {754-2008}, title = {{IEEE} Standard for Floating-Point Arithmetic}, year = 2008, pages = {1 -- 70}, institution = {Microprocessor Standards Committee of the IEEE Computer Society}, abstract = {This standard specifies interchange and arithmetic formats and methods for binary and decimal floating-point arithmetic in computer programming environments. This standard specifies exception conditions and their default handling. An implementation of a floating-point system conforming to this standard may be realized entirely in software, entirely in hardware, or in any combination of software and hardware. For operations specified in the normative part of this standard, numerical results and exceptions are uniquely determined by the values of the input data, sequence of operations, and destination formats, all under user control.}, keywords = {IEEE standards;floating point arithmetic;programming;IEEE standard;arithmetic formats;computer programming;decimal floating-point arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal}, doi = {10.1109/IEEESTD.2008.4610935}, isbn = {978-0-7381-5753-5}, address = {New York, NY}, month = aug, dom = 29, committee = {Alex Aiken and Matthew Applegate and David Bailey and Steve Bass and Dileep Bhandarkar and Mahesh Bhat and David Bindel and Sylvie Boldo and Stephen Canon and Steven R. Carlough and Marius Cornea and Mike Cowlishaw (editor) and John H. Crawford and Joseph D. Darcy and Marc Daumas and Bob Davis and Mark Davis and Dick Delp and Jim Demmel and Mark A. Erle and Hossam A. H. Fahmy and J.P. Fasano and Richard Fateman and Eric Feng and Warren E. Ferguson and Alex Fit-Florea and Laurent Fournier and Chip Freitag and Ivan Godard and Roger A. Golliver and David Gustafson and Michel Hack and John R. Harrison and John Hauser and Yozo Hida and Chris N. Hinds and Graydon Hoare and David G. Hough and Jerry Huck and Jim Hull and Michael Ingrassia and David V. James and Rick James and William Kahan and John Kapernick and Richard Karpinski and Jeff Kidder and Plamen Koev and Ren-Cang Li and Zhishun Alex Liu and Raymond Mak and Peter Markstein and David Matula and Guillaume Melquiond and Nobuyoshi Mori and Ricardo Morin and Ned Nedialkov and Craig Nelson and Stuart Oberman and Jon Okada and Ian Ollmann and Michael Parks and Tom Pittman and Eric Postpischil and Jason Riedy and Debjit Das Sarma and Eric M. Schwarz and David Scott and Don Senzig and Ilya Sharapov and Jim Shearer and Michael Siu and Ron Smith and Chuck Stevens and Peter Tang and Pamela J. Taylor and James W. Thomas and Brandon Thompson and Wendy Thrash and Neil Toda and Son Dao Trong and Leonard Tsai and Charles Tsen and Fred Tydeman and Liang Kai Wang and Scott Westbrook and Steve Winkler and Anthony Wood and Umit Yalcinalp and Fred Zemke and Paul Zimmermann and Dan Zuras (chair)}, projtag = {ieee754}, } @TechReport{nonneg-house-lawn, author = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida and E. Jason Riedy}, ejr-withauthor ={James W. Demmel and Mark Frederick Hoemmen and Yozo Hida}, title = {Non-Negative Diagonals and High Performance on Low-Profile Matrices from {Householder} {$QR$}}, institution = {Netlib}, year = 2008, type = {LAPACK Working Note}, number = 203, lawn = 203, month = may, dom = 30, note = {Also issued as UCB/EECS-2008-76; modified from SISC version.}, role = {techreport}, OPTtags = {lawn; lapack; householder; qr}, url = {http://www.netlib.org/lapack/lawnspdf/lawn203.pdf}, other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-76.html}, file = {material/lawn203.pdf}, abstract = {The Householder reflections used in LAPACK's $QR$ factorization leave positive and negative real entries along $R$'s diagonal. This is sufficient for most applications of $QR$ factorizations, but a few require that $R$ have a nonnegative diagonal. This note describes a new Householder generation routine to produce a nonnegative diagonal. Additionally, we find that scanning for trailing zeros in the generated reflections leads to large performance improvements when applying reflections with many trailing zeros. Factoring low-profile matrices, those with nonzero entries mostly near the diagonal (e.g., band matrices), now require far fewer operations. For example, $QR$ factorization of matrices with profile width b that are stored densely in an $n\times n$ matrix improves from $O(n^3)$ to $O(n^2+nb^2)$. These routines are in LAPACK 3.2.}, projtag = {lapack} } @TechReport{lawn188, author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy}, ejr-withauthor ={James W. Demmel and Yozo Hida and Xiaoye S. Li}, title = {Extra-precise iterative refinement for overdetermined least squares problems}, type = {LAPACK Working Note}, institution = {Netlib}, year = 2007, number = 188, month = may, dom = 31, url = {http://www.netlib.org/lapack/lawnspdf/lawn188.pdf}, file = {material/lawn188.pdf}, other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html}, note = {Also issued as UCB/EECS-2007-77; version accepted for TOMS.}, abstract = {We present the algorithm, error bounds, and numerical results for extra-precise iterative refinement applied to overdetermined linear least squares (LLS) problems. We apply our linear system refinement algorithm to Bj{\"o}rck’s augmented linear system formulation of an LLS problem. Our algorithm reduces the forward normwise and componentwise errors to $O(\varepsilon)$ unless the system is too ill conditioned. In contrast to linear systems, we provide two separate error bounds for the solution $x$ and the residual $r$. The refinement algorithm requires only limited use of extra precision and adds only $O(mn)$ work to the $O(mn^2)$ cost of QR factorization for problems of size m-by-n. The extra precision calculation is facilitated by the new extended-precision BLAS standard in a portable way, and the refinement algorithm will be included in a future release of LAPACK and can be extended to the other types of least squares problems.}, role = {techreport}, OPTtags = {lawn; lapack; least squares; floating point}, projtag = {lapack, ieee754} } @TechReport{lapack-prospectus-lawn, author = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov}, ejr-withauthor ={James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov}, title = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries}, institution = {Netlib}, year = 2007, type = {LAPACK Working Note}, number = 181, lawn = 181, month = feb, note = {Also issued as UT-CS-07-592}, role = {techreport}, OPTtags = {lawn; lapack}, url = {http://www.netlib.org/lapack/lawnspdf/lawn181.pdf}, file = {material/lawn181.pdf}, projtag = {lapack} } @TechReport{tridiag-lawn, author = {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel}, ejr-withauthor ={Osni A. Marques and Christof V{\"o}mel}, title = {Benefits of {IEEE-754} Features in Modern Symmetric Tridiagonal Eigensolvers}, type = {LAPACK Working Note}, number = 172, lawn = 172, institution = {Netlib}, month = sep, year = 2005, dom = 30, note = {Also issued as UCB//CSD-05-1414; expanded from SISC version}, URL = {http://www.netlib.org/lapack/lawnspdf/lawn172.pdf}, other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2005/6514.html}, role = {techreport}, OPTtags = {lawn; lapack; floating point; ieee754; eigenvalue}, file = {material/lawn172.pdf}, abstract = {Bisection is one of the most common methods used to compute the eigenvalues of symmetric tridiagonal matrices. Bisection relies on the Sturm count: For a given shift sigma, the number of negative pivots in the factorization $T - \sigma I = LDL^T$ equals the number of eigenvalues of T that are smaller than sigma. In IEEE-754 arithmetic, the value $\infty$ permits the computation to continue past a zero pivot, producing a correct Sturm count when $T$ is unreduced. Demmel and Li showed [IEEE Trans. Comput., 43 (1994), pp. 983–992] that using $\infty$ rather than testing for zero pivots within the loop could significantly improve performance on certain architectures. When eigenvalues are to be computed to high relative accuracy, it is often preferable to work with $LDL^T$ factorizations instead of the original tridiagonal $T$. One important example is the MRRR algorithm. When bisection is applied to the factored matrix, the Sturm count is computed from $LDL^T$ which makes differential stationary and progressive qds algorithms the methods of choice. While it seems trivial to replace $T$ by $LDL^T$, in reality these algorithms are more complicated: In IEEE-754 arithmetic, a zero pivot produces an overflow followed by an invalid exception (NaN, or ``Not a Number'') that renders the Sturm count incorrect. We present alternative, safe formulations that are guaranteed to produce the correct result. Benchmarking these algorithms on a variety of platforms shows that the original formulation without tests is always faster provided that no exception occurs. The transforms see speed-ups of up to 2.6x over the careful formulations. Tests on industrial matrices show that encountering exceptions in practice is rare. This leads to the following design: First, compute the Sturm count by the fast but unsafe algorithm. Then, if an exception occurs, recompute the count by a safe, slower alternative. The new Sturm count algorithms improve the speed of bisection by up to 2x on our test matrices. Furthermore, unlike the traditional tiny-pivot substitution, proper use of IEEE-754 features provides a careful formulation that imposes no input range restrictions.}, projtag = {lapack, ieee754} } @TechReport{axb-itref-lawn, author = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy}, ejr-withauthor ={James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee}, title = {Error bounds from extra-precise iterative refinement}, type = {LAPACK Working Note}, institution = {Netlib}, year = 2005, number = 165, lawn = 165, month = feb, dom = 3, role = {techreport}, OPTtags = {lawn; lapack; linear algebra; floating point}, other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html}, note = {Also issued as UCB//CSD-05-1414, UT-CS-05-547, and LBNL-56965; expanded from TOMS version}, url = {http://www.netlib.org/lapack/lawnspdf/lawn165.pdf}, file = {material/lawn165.pdf}, abstract = {We present the design and testing of an algorithm for iterative refinement of the solution of linear equations, where the residual is computed with extra precision. This algorithm was originally proposed in the 1960s [6, 22] as a means to compute very accurate solutions to all but the most ill-conditioned linear systems of equations. However two obstacles have until now prevented its adoption in standard subroutine libraries like LAPACK: (1) There was no standard way to access the higher precision arithmetic needed to compute residuals, and (2) it was unclear how to compute a reliable error bound for the computed solution. The completion of the new BLAS Technical Forum Standard [5] has recently removed the first obstacle. To overcome the second obstacle, we show how a single application of iterative refinement can be used to compute an error bound in any norm at small cost, and use this to compute both an error bound in the usual infinity norm, and a componentwise relative error bound. We report extensive test results on over 6.2 million matrices of dimension 5, 10, 100, and 1000. As long as a normwise (resp. componentwise) condition number computed by the algorithm is less than $1 / \operatorname{max}\{10, \sqrt{n}\}\varepsilon_w$ , the computed normwise (resp. componentwise) error bound is at most $2 \operatorname{max}\{10, \sqrt{n}\} \cdot \varepsilon_w$ , and indeed bounds the true error. Here, $n$ is the matrix dimension and $\varepsilon_w$ is single precision roundoff error. For worse conditioned problems, we get similarly small correct error bounds in over 89.4\% of cases.}, projtag = {lapack, ieee754} }