@misc{an12-streaming-ms,
  author         = {David A. Bader and David Ediger and Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title          = {Streaming Graph Analytics for Massive Graphs},
  howpublished   = {SIAM Annual Meeting},
  address        = {Minneapolis, MN},
  dom            = {10},
  month          = jul,
  year           = {2012},
  url            = {http://www.slideshare.net/jasonriedy/streaming-graph-analytics-for-massive-graphs},
  role           = {presentation},
  tags           = {siam; streaming data; parallel algorithms},
  abstract       = {Emerging real-world graph problems include detecting community structure in large social networks, improving the resilience of the electric power grid, and detecting and preventing disease in human populations. The volume and richness of data combined with its rate of change renders monitoring properties at scale by static recomputation infeasible. We approach these problems with massive, fine-grained parallelism across different shared memory architectures both to compute solutions and to explore the sensitivity of these solutions to natural bias and omissions within the data.}
}

@inproceedings{arith-lang,
  author         = {David Hough and Bill Hay and Jeff Kidder and E. Jason Riedy and Steele, Jr., Guy L. and Jim Thomas},
  ejr-withauthor = {David Hough and Bill Hay and Jeff Kidder and Guy L. {Steele Jr.} and Jim Thomas},
  title          = {Arithmetic Interactions: From Hardware to Applications},
  booktitle      = {17th {IEEE} Symposium on Computer Arithmetic ({ARITH}'05)},
  dom            = {28},
  month          = jun,
  year           = {2005},
  doi            = {10.1109/ARITH.2005.10},
  isbn           = {0-7695-2366-8},
  note           = {See \href{http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf}{related presentation}},
  role           = {proceedings; panel},
  tags           = {ieee754; floating point},
  abstract       = {The entire process of creating and executing applications that solve interesting problems with acceptable cost and accuracy involves a complex interaction among hardware, system software, programming environments, mathematical software libraries, and applications software, all mediated by standards for arithmetic, operating systems, and programming environments. This panel will discuss various issues arising among these various contending points of view, sometimes from the point of view of issues raised during the current IEEE 754R standards revision effort.}
}

@techreport{axb-itref-lawn,
  author         = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee},
  title          = {Error bounds from extra-precise iterative refinement},
  type           = {LAPACK Working Note},
  institution    = {Netlib},
  number         = {165},
  lawn           = {165},
  dom            = {3},
  month          = feb,
  year           = {2005},
  url            = {http://www.netlib.org/lapack/lawnspdf/lawn165.pdf},
  other-url      = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  note           = {Also issued as UCB//CSD-05-1414, UT-CS-05-547, and LBNL-56965; expanded from TOMS version},
  role           = {techreport},
  tags           = {lawn; lapack; linear algebra; floating point}
}

@article{axb-itref-toms,
  author         = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and W. Kahan and Xiaoye S. Li and Sonil Mukherjee},
  title          = {Error bounds from extra-precise iterative refinement},
  journal        = {{ACM} Transactions on Mathematical Software},
  volume         = {32},
  number         = {2},
  pages          = {325--351},
  month          = jun,
  year           = {2006},
  doi            = {10.1145/1141885.1141894},
  issn           = {0098-3500},
  mrclass        = {65F10},
  mrnumber       = {2272365},
  role           = {refereed},
  tags           = {acm; toms; lapack; floating point; linear algebra},
  abstract       = {We present the design and testing of an algorithm for iterative refinement of the solution of linear equations where the residual is computed with extra precision. This algorithm was originally proposed in 1948 and analyzed in the 1960s as a means to compute very accurate solutions to all but the most ill-conditioned linear systems. However, two obstacles have until now prevented its adoption in standard subroutine libraries like LAPACK: (1) There was no standard way to access the higher precision arithmetic needed to compute residuals, and (2) it was unclear how to compute a reliable error bound for the computed solution. The completion of the new BLAS Technical Forum Standard has essentially removed the first obstacle. To overcome the second obstacle, we show how the application of iterative refinement can be used to compute an error bound in any norm at small cost and use this to compute both an error bound in the usual infinity norm, and a componentwise relative error bound.},
  gt-role        = {Assisted in software development, experimental methods, writing, and editing.}
}

@misc{bascd2002-poster,
  author         = {E. Jason Riedy},
  title          = {Parallel Bipartite Matching for Sparse Matrix Computation},
  howpublished   = {Third Bay Area Scientific Computing Day},
  address        = {Livermore, CA},
  month          = mar,
  year           = {2002},
  role           = {poster},
  tags           = {bascd; sparse matrix; combinatorial optimization; parallel algorithms}
}

@misc{bascd2006-poster,
  author         = {E. Jason Riedy},
  title          = {Making Static Pivoting Dependable},
  howpublished   = {Seventh Bay Area Scientific Computing Day},
  address        = {Livermore, CA},
  month          = mar,
  year           = {2006},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2006-poster.pdf},
  role           = {poster},
  tags           = {bascd; sparse matrix; linear algebra},
  abstract       = {For sparse LU factorization, dynamic pivoting tightly couples symbolic and numerical computation. Dynamic structural changes limit parallel scalability. Demmel and Li use static pivoting in distributed SuperLU for performance, but intentionally perturbing the input may lead silently to erroneous results. Are there experimentally stable static pivoting heuristics that lead to a dependable direct solver? The answer is currently a qualified yes. Current heuristics fail on a few systems, but all failures are detectable. }
}

@misc{bascd2007-poster,
  author         = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy and Meghana Vishvanath and David Vu},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li and Meghana Vishvanath and David Vu},
  title          = {Precise Solutions for Overdetermined Least Squares Problems},
  howpublished   = {Stanford 50 -- Eighth Bay Area Scientific Computing Day},
  address        = {Stanford, CA},
  month          = mar,
  year           = {2007},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2007-poster.pdf},
  role           = {poster},
  tags           = {bascd; least squares},
  abstract       = {Linear least squares (LLS) fitting is the most widely used data modeling technique and is included in almost every data analysis system (e.g. spreadsheets). These software systems often give no feedback on the conditioning of the LLS problem or the floating-point calculation errors present in the solution. With limited use of extra precision, we can eliminate these concerns for all but the most ill-conditioned LLS problems. Our algorithm provides either a solution and residual with relatively tiny error or a notice that the LLS problem is too ill-conditioned.}
}

@misc{cerfacs08,
  author         = {E. Jason Riedy},
  title          = {Auctions for Distributed (and Possibly Parallel) Matchings},
  howpublished   = {Visit to \href{http://www.cerfacs.fr/}{CERFACS} courtesy of the Franco-Berkeley Fund},
  dom            = {17},
  month          = dec,
  year           = {2008},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/cerfacs08.pdf},
  role           = {presentation},
  tags           = {cerfacs; combinatorial optimization; sparse matrix}
}

@inproceedings{dblp:conf/dimacs/riedymeb12,
  author         = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David A. Bader},
  title          = {Parallel community detection for massive graphs},
  booktitle      = {Graph Partitioning and Graph Clustering},
  opttitle       = {Graph Partitioning and Graph Clustering - 10th DIMACS Implementation Challenge Workshop, Georgia Institute of Technology, Atlanta, GA, USA, February 13-14, 2012. Proceedings},
  editor         = {David A. Bader and Henning Meyerhenke and Peter Sanders and Dorothea Wagner},
  publisher      = {American Mathematical Society},
  series         = {Contemporary Mathematics},
  volume         = {588},
  pages          = {207--222},
  year           = {2013},
  doi            = {10.1090/conm/588/11703},
  isbn           = {978-0-8218-9038-7, 978-0-8218-9869-7},
  ee             = {http://www.ams.org/books/conm/588/11703},
  bibsource      = {DBLP, http://dblp.uni-trier.de}
}

@incollection{dimacs10-workshop,
  author         = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David A. Bader},
  title          = {Parallel Community Detection for Massive Graphs},
  booktitle      = {10th DIMACS Implementation Challenge Workshop - Graph Partitioning and Graph Clustering},
  publisher      = {(workshop paper)},
  address        = {Atlanta, Georgia},
  dom            = {14},
  month          = feb,
  year           = {2012},
  url            = {http://www.cc.gatech.edu/dimacs10/papers/[15]-dimacs10-community-detection.pdf},
  note           = {Won first place in the Mix Challenge and Mix Pareto Challenge},
  tags           = {parallel; graph; community detection}
}

@unpublished{fp-type-project,
  author         = {E. Jason Riedy},
  title          = {Type System Support for Floating-Point Computation},
  dom            = {25},
  month          = may,
  year           = {2001},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/type-support-for-fp.pdf},
  role           = {unpublished},
  tags           = {programming language; floating point; ieee754},
  abstract       = {Floating-point arithmetic is often seen as untrustworthy. We show how manipulating precisions according to the following rules of thumb enhances the reliability of and removes surprises from calculations: Store data narrowly, compute intermediates widely, and derive properties widely. Further, we describe a typing system for floating point that both supports and is supported by these rules. A single type is established for all intermediate computations. The type describes a precision at least as wide as all inputs to and results from the computation. Picking a single type provides benefits to users, compilers, and interpreters. The type system also extends cleanly to encompass intervals and higher precisions.}
}

@unpublished{graph500-1.1,
  author         = {David A. Bader and Jonathan Berry and Simon Kahan and Richard Murphy and E. Jason Riedy and Jeremiah Willcock},
  ejr-withauthor = {David A. Bader and Jonathan Berry and Simon Kahan and Richard Murphy and Jeremiah Willcock},
  title          = {Graph 500 Benchmark 1 (``Search'')},
  month          = oct,
  year           = {2010},
  url            = {http://www.graph500.org/Specifications.html},
  note           = {Version 1.1}
}

@article{graphct-tpds-2012,
  author         = {David Ediger and Karl Jiang and Jason Riedy and David A. Bader},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader},
  title          = {{GraphCT}: Multithreaded Algorithms for Massive Graph Analysis},
  journal        = {{IEEE} Transactions on Parallel and Distributed Systems},
  year           = {2013},
  doi            = {10.1109/TPDS.2012.323},
  issn           = {1045-9219},
  url            = {http://dx.doi.org/10.1109/TPDS.2012.323},
  role           = {refereed},
  gt-role        = {Assisted in software development, experimental methods, writing, and editing.}
}

@incollection{graphct-wiley-chap,
  author         = {David Ediger and Jason Riedy and David A. Bader and Henning Meyerhenke},
  ejr-withauthor = {David Ediger and David A. Bader and Henning Meyerhenke},
  title          = {Computational Graph Analytics for Massive Streaming Data},
  booktitle      = {Large Scale Network-Centric Computing Systems},
  editor         = {Hamid Sarbazi-azad and Albert Zomaya},
  publisher      = {Wiley},
  series         = {Parallel and Distributed Computing},
  chapter        = {25},
  dom            = {30},
  month          = jul,
  year           = {2013},
  isbn           = {978-0470936887},
  role           = {chapter},
  tags           = {parallel; graph; streaming}
}

@misc{gt09,
  author         = {E. Jason Riedy},
  title          = {Dependable direct solutions for linear systems using a little extra precision},
  howpublished   = {\href{http://cse.gatech.edu/}{CSE} Seminar at Georgia Institute of Technology},
  dom            = {21},
  month          = aug,
  year           = {2009},
  url            = {http://hdl.handle.net/1853/29795},
  role           = {presentation},
  tags           = {linear algebra; floating point; lapack},
  abstract       = {Solving a square linear system $Ax=b$ often is considered a black box. It's supposed to "just work," and failures often are blamed on the original data or subtleties of floating-point. Now that we have an abundance of cheap computations, however, we can do much better. A little extra precision in just the right places produces accurate solutions cheaply or demonstrates when problems are too hard to solve without significant cost. This talk will outline the method, iterative refinement with a new twist; the benefits, small backward and forward errors; and the trade-offs and unexpected benefits.}
}

@inproceedings{ia-cost,
  author         = {Joseph N. Wilson and E. Jason Riedy},
  ejr-withauthor = {Joseph N. Wilson},
  title          = {Efficient {SIMD} evaluation of image processing programs},
  booktitle      = {Parallel and Distributed Methods for Image Processing},
  editor         = {Hongchi Shi and Patrick C. Coffield},
  organization   = {SPIE},
  volume         = {3166},
  pages          = {199--210},
  address        = {San Diego, CA},
  dom            = {28},
  month          = jul,
  year           = {1997},
  doi            = {10.1117/12.279618},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/ia-cost.pdf},
  role           = {proceedings},
  tags           = {spie; image algebra; parallel algorithms},
  abstract       = {SIMD parallel systems have been employed for image processing and computer vision applications since their inception. This paper describes a system in which parallel programs are implemented using a machine-independent, retargetable object library that provides SIMD execution on the Lockheed Martin PAL-I SIMD parallel processor. Programs' performance on this machine is improved through on-the-fly execution analysis and scheduling. We describe the relevant elements of the system structure, the general scheme for execution analysis, and the current cost model for scheduling.}
}

@incollection{ia-simd-chapter,
  author         = {Joseph N. Wilson and E. Jason Riedy and Gerhard X. Ritter and Hongchi Shi},
  ejr-withauthor = {Joseph N. Wilson and Gerhard X. Ritter and Hongchi Shi},
  title          = {An {Image} {Algebra} Based {SIMD} Image Processing Environment},
  booktitle      = {Visual Information Representation, Communication, and Image Processing},
  editor         = {C. W. Chen and Y. Q. Zhang},
  publisher      = {Marcel Dekker},
  address        = {New York},
  pages          = {523--542},
  year           = {1999},
  isbn           = {082471928X},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/ia-simd-chap.pdf},
  citeseer       = {wilson97image.html},
  role           = {chapter},
  tags           = {image algebra; parallel algorithms},
  abstract       = {SIMD parallel computers have been employed for image related applications since their inception. They have been leading the way in improving processing speed for those applications. However, current parallel programming technologies have not kept pace with the performance growth and cost decline of parallel hardware. A highly usable parallel software development environment is needed. This chapter presents a computing environment that integrates a SIMD mesh architecture with image algebra for high-performance image processing applications. The environment describes parallel programs through a machine-independent, retargetable image algebra object library that supports SIMD execution on the Lockheed Martin PAL-I parallel computer. Program performance on this machine is improved through on-the-fly execution analysis and scheduling. We describe the relevant elements of the system structure, outline the scheme for execution analysis, and provide examples of the current cost model and scheduling system.},
  icon           = {ia-simd-chap.wordle.png}
}

@inproceedings{icassp2012-stinger,
  author         = {Jason Riedy and Henning Meyerhenke and David A. Bader and David Ediger and Timothy G. Mattson},
  ejr-withauthor = {Henning Meyerhenke and David A. Bader and David Ediger and Timothy G. Mattson},
  title          = {Analysis of Streaming Social Networks and Graphs on Multicore Architectures},
  booktitle      = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  address        = {Kyoto, Japan},
  dom            = {29},
  month          = mar,
  year           = {2012},
  url            = {http://www.slideshare.net/jasonriedy/icassp-2012-analysis-of-streaming-social-networks-and-graphs-on-multicore-architectures},
  abstract       = {Analyzing static snapshots of massive, graph-structured data cannot keep pace with the growth of social networks, financial transactions, and other valuable data sources. We introduce a framework, STING (Spatio-Temporal Interaction Networks and Graphs), and evaluate its performance on multicore, multisocket Intel(R)-based platforms. STING achieves rates of around 100\,000 edge updates per second on large, dynamic graphs with a single, general data structure. We achieve speed-ups of up to 1000$\times$ over parallel static computation, improve monitoring a dynamic graph's connected components, and show an exact algorithm for maintaining local clustering coefficients performs better on Intel-based platforms than our earlier approximate algorithm.}
}

@inproceedings{icpp10,
  author         = {David Ediger and Karl Jiang and E. Jason Riedy and David A. Bader and Courtney Corley and Rob Farber and William N. Reynolds},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader and Courtney Corley and Rob Farber and William N. Reynolds},
  title          = {Massive Social Network Analysis: Mining Twitter for Social Good},
  booktitle      = {39th International Conference on Parallel Processing ({ICPP})},
  address        = {San Diego, CA},
  dom            = {16},
  month          = sep,
  year           = {2010},
  url            = {http://www.cc.gatech.edu/~bader/papers/MassiveTwitter.html},
  role           = {proceedings},
  tags           = {parallel; graph},
  acc-note       = {(70/225 papers accepted: 31.1\% acceptance rate)}
}

@techreport{ieee754-2008,
  key            = {IEEE Std 754-2008},
  title          = {{IEEE} Standard for Floating-Point Arithmetic},
  type           = {IEEE Std},
  number         = {754-2008},
  institution    = {Microprocessor Standards Committee of the IEEE Computer Society},
  journal        = {IEEE Std 754-2008},
  pages          = {1--70},
  address        = {New York, NY},
  dom            = {29},
  month          = aug,
  year           = {2008},
  doi            = {10.1109/IEEESTD.2008.4610935},
  isbn           = {978-0-7381-5753-5},
  note           = {(committee member and contributor)},
  keywords       = {IEEE standards;floating point arithmetic;programming;IEEE standard;arithmetic formats;computer programming;decimal floating-point arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal},
  abstract       = {This standard specifies interchange and arithmetic formats and methods for binary and decimal floating-point arithmetic in computer programming environments. This standard specifies exception conditions and their default handling. An implementation of a floating-point system conforming to this standard may be realized entirely in software, entirely in hardware, or in any combination of software and hardware. For operations specified in the normative part of this standard, numerical results and exceptions are uniquely determined by the values of the input data, sequence of operations, and destination formats, all under user control.},
  committee      = {Alex Aiken and Matthew Applegate and David Bailey and Steve Bass and Dileep Bhandarkar and Mahesh Bhat and David Bindel and Sylvie Boldo and Stephen Canon and Steven R. Carlough and Marius Cornea and Mike Cowlishaw (editor) and John H. Crawford and Joseph D. Darcy and Marc Daumas and Bob Davis and Mark Davis and Dick Delp and Jim Demmel and Mark A. Erle and Hossam A. H. Fahmy and J.P. Fasano and Richard Fateman and Eric Feng and Warren E. Ferguson and Alex Fit-Florea and Laurent Fournier and Chip Freitag and Ivan Godard and Roger A. Golliver and David Gustafson and Michel Hack and John R. Harrison and John Hauser and Yozo Hida and Chris N. Hinds and Graydon Hoare and David G. Hough and Jerry Huck and Jim Hull and Michael Ingrassia and David V. James and Rick James and William Kahan and John Kapernick and Richard Karpinski and Jeff Kidder and Plamen Koev and Ren-Cang Li and Zhishun Alex Liu and Raymond Mak and Peter Markstein and David Matula and Guillaume Melquiond and Nobuyoshi Mori and Ricardo Morin and Ned Nedialkov and Craig Nelson and Stuart Oberman and Jon Okada and Ian Ollmann and Michael Parks and Tom Pittman and Eric Postpischil and Jason Riedy and Debjit Das Sarma and Eric M. Schwarz and David Scott and Don Senzig and Ilya Sharapov and Jim Shearer and Michael Siu and Ron Smith and Chuck Stevens and Peter Tang and Pamela J. Taylor and James W. Thomas and Brandon Thompson and Wendy Thrash and Neil Toda and Son Dao Trong and Leonard Tsai and Charles Tsen and Fred Tydeman and Liang Kai Wang and Scott Westbrook and Steve Winkler and Anthony Wood and Umit Yalcinalp and Fred Zemke and Paul Zimmermann and Dan Zuras (chair)}
}

@misc{ieee754-exceptions,
  author         = {David Bindel and E. Jason Riedy},
  ejr-withauthor = {David Bindel},
  title          = {Exception Handling Interfaces, Implementations, and Evaluation},
  howpublished   = {IEEE-754r revision meeting},
  month          = aug,
  year           = {2002},
  url            = {http://grouper.ieee.org/groups/754/meeting-materials/2002-08-22-pres.pdf},
  role           = {presentation},
  tags           = {ieee754; floating point}
}

@misc{intel.graph.2011,
  author         = {Jason Riedy and David A. Bader and Henning Meyerhenke and David Ediger and Timothy Mattson},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke and David Ediger and Timothy Mattson},
  title          = {{STING}: Spatio-Temporal Interaction Networks and Graphs for {Intel} Platforms},
  howpublished   = {Presentation at Intel Corporation, Santa Clara, CA},
  dom            = {9},
  month          = aug,
  year           = {2011},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/GT-STING-for-Intel-beamer.pdf},
  role           = {presentation}
}

@misc{intel.graph.2012,
  author         = {Jason Riedy and David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  ejr-withauthor = {David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  title          = {{STING}: Spatio-Temporal Interaction Networks and Graphs for {Intel} Platforms},
  howpublished   = {Presentation at Intel Corporation, Santa Clara, CA},
  dom            = {24},
  month          = jul,
  year           = {2012},
  url            = {http://www.slideshare.net/jasonriedy/gt-stingintelslides},
  role           = {presentation}
}

@misc{intel.graph.2014,
  author         = {Jason Riedy and David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  ejr-withauthor = {David A. Bader and David Ediger and Rob McColl and Timothy G. Mattson},
  title          = {{STING}: Spatio-Temporal Interaction Networks and Graphs for {Intel} Platforms},
  howpublished   = {Presentation at Intel Corporation, Santa Clara, CA},
  dom            = {17},
  month          = jan,
  year           = {2014},
  url            = {http://www.slideshare.net/jasonriedy/intel-20140117},
  role           = {presentation}
}

@misc{lapack-future,
  author         = {E. Jason Riedy and Yozo Hida and James W. Demmel},
  ejr-withauthor = {Yozo Hida and James W. Demmel},
  title          = {The Future of {LAPACK} and {ScaLAPACK}},
  howpublished   = {Robert C. Thompson Matrix Meeting},
  dom            = {18},
  month          = nov,
  year           = {2005},
  url            = {http://purl.oclc.org/NET/jason-riedy/resume/material/future-of-scalapack.pdf},
  role           = {presentation},
  tags           = {lapack; software engineering},
  abstract       = {We are planning new releases of the widely used LAPACK and ScaLAPACK numerical linear algebra libraries. Based on an on-going user survey (http://www.netlib.org/lapack-dev) and research by many people, we are proposing the following improvements: Faster algorithms (including better numerical methods, memory hierarchy optimizations, parallelism, and automatic performance tuning to accommodate new architectures), more accurate algorithms (including better numerical methods, and use of extra precision), expanded functionality (including updating and downdating, new eigenproblems, etc. and putting more of LAPACK into ScaLAPACK), and improved ease of use (friendlier interfaces in multiple languages). To accomplish these goals we are also relying on better software engineering techniques and contributions from collaborators at many institutions. This is joint work with Jack Dongarra.}
}

@inproceedings{lapack-prospectus,
  author         = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  ejr-withauthor = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  title          = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries},
  booktitle      = {{PARA'06}: State-of-the-Art in Scientific and Parallel Computing},
  organization   = {High Performance Computing Center North ({HPC2N}) and the Department of Computing Science, Ume{\aa} University},
  publisher      = {Springer},
  address        = {Ume{\aa}, Sweden},
  month          = jun,
  year           = {2006},
  url            = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/para06-lapack.pdf},
  role           = {proceedings},
  tags           = {lapack},
  abstract       = {LAPACK and ScaLAPACK are widely used software libraries for numerical linear algebra. There have been over 68M web hits at www.netlib.org for the associated libraries LAPACK, ScaLAPACK, CLAPACK and LAPACK95. LAPACK and ScaLAPACK are used to solve leading edge science problems and they have been adopted by many vendors and software providers as the basis for their own libraries, including AMD, Apple (under Mac OS X), Cray, Fujitsu, HP, IBM, Intel, NEC, SGI, several Linux distributions (such as Debian), NAG, IMSL, the MathWorks (producers of MATLAB), Interactive Supercomputing, and PGI. Future improvements in these libraries will therefore have a large impact on users.}
}

@techreport{lapack-prospectus-lawn,
  author         = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  ejr-withauthor = {James W. Demmel and Jack Dongarra and Beresford Parlett and W. Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye S. Li and Osni A. Marques and Christof V{\"o}mel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov},
  title          = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries},
  type           = {LAPACK Working Note},
  institution    = {Netlib},
  number         = {181},
  lawn           = {181},
  month          = feb,
  year           = {2007},
  url            = {http://www.netlib.org/lapack/lawnspdf/lawn181.pdf},
  note           = {Also issued as UT-CS-07-592},
  role           = {techreport},
  tags           = {lawn; lapack}
}

@unpublished{lapack-style,
  author         = {Jack Dongarra and Julien Langou and E. Jason Riedy},
  ejr-withauthor = {Jack Dongarra and Julien Langou},
  title          = {Sca/{LAPACK} Program Style},
  month          = aug,
  year           = {2006},
  url            = {http://www.netlib.org/lapack-dev/lapack-coding/program-style.html},
  role           = {unpublished},
  tags           = {lapack},
  abstract       = {The purpose of this document is to facilitate contributions to LAPACK and ScaLAPACK by documenting their design and implementation guidelines. The long-term goal is to provide guidelines for both LAPACK and ScaLAPACK. However, the parallel ScaLAPACK code has more open issues, so this document primarily concerns LAPACK.}
}

@techreport{lawn188,
  author         = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title          = {Extra-precise iterative refinement for overdetermined least squares problems},
  type           = {LAPACK Working Note},
  institution    = {Netlib},
  number         = {188},
  dom            = {31},
  month          = may,
  year           = {2007},
  url            = {http://www.netlib.org/lapack/lawnspdf/lawn188.pdf},
  other-url      = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  note           = {Also issued as UCB/EECS-2007-77; version accepted for TOMS.},
  role           = {techreport},
  tags           = {lawn; lapack; least squares; floating point},
  abstract       = {We present the algorithm, error bounds, and numerical results for extra-precise iterative refinement applied to overdetermined linear least squares (LLS) problems. We apply our linear system refinement algorithm to Bj{\"o}rck's augmented linear system formulation of an LLS problem. Our algorithm reduces the forward normwise and componentwise errors to $O(\varepsilon)$ unless the system is too ill conditioned. In contrast to linear systems, we provide two separate error bounds for the solution $x$ and the residual $r$. The refinement algorithm requires only limited use of extra precision and adds only $O(mn)$ work to the $O(mn^2)$ cost of QR factorization for problems of size m-by-n. The extra precision calculation is facilitated by the new extended-precision BLAS standard in a portable way, and the refinement algorithm will be included in a future release of LAPACK and can be extended to the other types of least squares problems.}
}

@article{lsq-itref-toms,
  author         = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title          = {Extra-precise iterative refinement for overdetermined least squares problems},
  journal        = {{ACM} Transactions on Mathematical Software},
  volume         = {35},
  number         = {4},
  pages          = {1--32},
  month          = feb,
  year           = {2009},
  doi            = {10.1145/1462173.1462177},
  issn           = {0098-3500},
  accepted       = {25 June 2008},
  role           = {refereed},
  tags           = {acm; toms; lapack; floating point; linear algebra},
  abstract       = {We present the algorithm, error bounds, and numerical results for extra-precise iterative refinement applied to overdetermined linear least squares (LLS) problems. We apply our linear system refinement algorithm to Bj{\"o}rck's augmented linear system formulation of an LLS problem. Our algorithm reduces the forward normwise and componentwise errors to $O(\varepsilon)$ unless the system is too ill conditioned. In contrast to linear systems, we provide two separate error bounds for the solution $x$ and the residual $r$. The refinement algorithm requires only limited use of extra precision and adds only $O(mn)$ work to the $O(mn^2)$ cost of QR factorization for problems of size m-by-n. The extra precision calculation is facilitated by the new extended-precision BLAS standard in a portable way, and the refinement algorithm will be included in a future release of LAPACK and can be extended to the other types of least squares problems.},
  gt-role        = {Assisted in software development, experimental methods, writing, and editing.}
}

@inproceedings{md11-graph,
  author         = {David A. Bader and David Ediger and E. Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title          = {Parallel Programming for Graph Analysis},
  booktitle      = {full day tutorial},
  address        = {Columbia, MD},
  dom            = {28},
  month          = sep,
  year           = {2011},
  role           = {tutorial},
  tags           = {parallel; graph}
}

@inproceedings{mtaap10,
  author         = {David Ediger and Karl Jiang and E. Jason Riedy and David A. Bader},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader},
  title          = {Massive Streaming Data Analytics: A Case Study with Clustering Coefficients},
  booktitle      = {4th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  address        = {Atlanta, GA},
  dom            = {23},
  month          = apr,
  year           = {2010},
  url            = {http://www.cc.gatech.edu/~bader/papers/StreamingCC.html},
  role           = {proceedings},
  tags           = {parallel; graph; streaming},
  acc-note       = {(11/22 papers accepted, 50\% acceptance rate)}
}

@inproceedings{mtaap11,
  author = {David Ediger and E. Jason Riedy and David A. Bader and Henning Meyerhenke},
  ejr-withauthor = {David Ediger and David A. Bader and Henning Meyerhenke},
  title = {Tracking Structure of Streaming Social Networks},
  booktitle = {5th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  role = {proceedings},
  tags = {parallel; graph; streaming},
  year = 2011,
  month = may,
  abstract = {Current online social networks are massive and still growing. For example, Facebook has over 500 million active users sharing over 30 billion items per month. The scale within these data streams has outstripped traditional graph analysis methods. Monitoring requires dynamic analysis rather than repeated static analysis. The massive state behind multiple persistent queries requires shared data structures and not problem-specific representations. We present a framework based on the STINGER data structure that can monitor a global property, connected components, on a graph of 16 million vertices at rates of up to 240\,000 updates per second on a 32 processor Cray XMT. For very large scale-free graphs, our implementation uses novel batching techniques that exploit the scale-free nature of the data and run over three times faster than prior methods. Our framework handles, for the first time, real-world data rates, opening the door to higher-level analytics such as community and anomaly detection.},
  acc-note = {(10/17 papers accepted, 59\% acceptance rate)}
}

@inproceedings{mtaap12,
  author = {E. Jason Riedy and David A. Bader and Henning Meyerhenke},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke},
  title = {Scalable Multi-threaded Community Detection in Social Networks},
  booktitle = {6th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  role = {proceedings},
  tags = {parallel; graph; community detection},
  year = 2012,
  month = may,
  dom = 25,
  abstract = {The volume of existing graph-structured data requires improved parallel tools and algorithms. Finding communities, smaller subgraphs densely connected within the subgraph than to the rest of the graph, plays a role both in developing new parallel algorithms as well as opening smaller portions of the data to current analysis tools. We improve performance of our parallel community detection algorithm by 20\% on the massively multithreaded Cray XMT, evaluate its performance on the next-generation Cray XMT2, and extend its reach to Intel-based platforms with OpenMP. To our knowledge, not only is this the first massively parallel community detection algorithm but also the only such algorithm that achieves excellent performance and good parallel scalability across all these platforms. Our implementation analyzes a moderate sized graph with 105 million vertices and 3.3 billion edges in around 500 seconds on a four processor, 80-logical-core Intel-based system and 1100 seconds on a 64-processor Cray XMT2.},
  acc-note = {(9/15 papers accepted, 60\% acceptance)},
  url = {http://www.slideshare.net/jasonriedy/mtaap12-scalable-community-detection}
}

@inproceedings{mtaap13,
  author = {E. Jason Riedy and David A. Bader},
  ejr-withauthor = {David A. Bader},
  title = {Multithreaded Community Monitoring for Massive Streaming Graph Data},
  booktitle = {7th Workshop on Multithreaded Architectures and Applications (MTAAP)},
  role = {proceedings},
  tags = {parallel; graph; streaming; community detection},
  year = 2013,
  month = may,
  dom = 24,
  address = {Boston, MA},
  acc-note = {(11/16 papers accepted, 69\% acceptance)}
}

@techreport{nonneg-house-lawn,
  author = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
  title = {Non-Negative Diagonals and High Performance on Low-Profile Matrices from {Householder} {$QR$}},
  institution = {Netlib},
  year = {2008},
  type = {LAPACK Working Note},
  number = {203},
  lawn = {203},
  month = may,
  dom = 30,
  note = {Also issued as UCB/EECS-2008-76; modified from SISC version.},
  role = {techreport},
  tags = {lawn; lapack; householder; qr},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn203.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-76.html}
}

@article{nonneg-house-lawn-sisc,
  author = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Mark Frederick Hoemmen and Yozo Hida},
  title = {Non-Negative Diagonals and High Performance on Low-Profile Matrices from {Householder} {$QR$}},
  publisher = {SIAM},
  year = 2009,
  month = jul,
  dom = 3,
  journal = {SIAM Journal on Scientific Computing},
  volume = 31,
  number = 4,
  pages = {2832--2841},
  keywords = {LAPACK; QR factorization; Householder reflection; floating-point},
  doi = {10.1137/080725763},
  role = {refereed},
  tags = {siam; sisc; lapack; householder; qr},
  issn = {1064-8275},
  mrclass = {65F30},
  mrnumber = {2520301},
  gt-role = {Principal author and developer.}
}

@unpublished{nsf-accel-workshop,
  ejr-withauthor = {workshop participants},
  editor = {Viktor K. Prasanna and David A. Bader},
  key = {Report on NSF Workshop on Center Scale Activities Related to Accelerators for Data Intensive Applications},
  title = {Report on {NSF} Workshop on Center Scale Activities Related to Accelerators for Data Intensive Applications},
  note = {This workshop is supported by NSF Grant Number 1051537, in response to the Call for Exploratory Workshop Proposals for Scientific Software Innovation Institutes (S2I2).},
  dom = {31},
  month = oct,
  year = 2010
}

@inproceedings{nsfaccelws10,
  author = {Jason Riedy and David Bader and David Ediger},
  ejr-withauthor = {David Bader and David Ediger},
  title = {Applications in Social Networks},
  booktitle = {NSF Workshop on Accelerators for Data-Intensive Applications},
  dom = {13},
  year = {2010},
  month = oct,
  role = {presentation},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/nsf-workshop-socnet.pdf},
  tags = {graph; NSF; streaming}
}

@unpublished{power-control,
  author = {E. Jason Riedy and Robert Szewczyk},
  ejr-withauthor = {Robert Szewczyk},
  title = {Power and Control in Networked Sensors},
  note = {Cited},
  month = may,
  dom = 11,
  year = {2000},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/power-and-control.pdf},
  role = {unpublished},
  tags = {sensor network},
  abstract = {The fundamental constraint on a networked sensor is its energy consumption, since it may be either impossible or not feasible to replace its energy source. We analyze the power dissipation implications of implementing the network sensor with either a central processor switching between I/O devices or a family of processors, each dedicated to a single device. We present the energy measurements of the current generations of networked sensors, and develop an abstract description of tradeoffs between both designs.},
  citeseer = {riedy00power.html}
}

@misc{pp12-community-ms,
  author = {Henning Meyerhenke and E. Jason Riedy and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David A. Bader},
  title = {Parallel Community Detection in Streaming Graphs},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {15},
  month = feb,
  year = {2012},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  address = {Savannah, GA}
}

@misc{pp12-graphct,
  author = {David Ediger and E. Jason Riedy and Henning Meyerhenke and David A. Bader},
  ejr-withauthor = {David Ediger and Henning Meyerhenke and David A. Bader},
  title = {Analyzing Massive Networks with {GraphCT}},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {16},
  month = feb,
  year = {2012},
  role = {poster},
  tags = {siam; parallel algorithms},
  address = {Savannah, GA}
}

@misc{pp12-sting,
  author = {E. Jason Riedy and David Ediger and Henning Meyerhenke and David A. Bader},
  ejr-withauthor = {David Ediger and Henning Meyerhenke and David A. Bader},
  title = {{STING}: Software for Analysis of Spatio-Temporal Interaction Networks and Graphs},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {16},
  month = feb,
  year = {2012},
  role = {poster},
  tags = {siam; parallel algorithms},
  address = {Savannah, GA}
}

@misc{pp12-streaming-ms,
  author = {E. Jason Riedy and Henning Meyerhenke},
  ejr-withauthor = {Henning Meyerhenke},
  title = {Scalable Algorithms for Analysis of Massive, Streaming Graphs},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {15},
  month = feb,
  year = {2012},
  role = {presentation},
  url = {http://www.slideshare.net/jasonriedy/siam-pp-2012-scalable-algorithms-for-analysis-of-massive-streaming-graphs},
  tags = {siam; streaming data; parallel algorithms},
  address = {Savannah, GA}
}

@inproceedings{ppam11,
  author = {E. Jason Riedy and Henning Meyerhenke and David Ediger and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David A. Bader},
  title = {Parallel Community Detection for Massive Graphs},
  booktitle = {9th International Conference on Parallel Processing and Applied Mathematics (PPAM11)},
  year = 2011,
  month = sep,
  publisher = {Springer},
  role = {proceedings},
  tags = {parallel; graph; community detection},
  abstract = {Tackling the current volume of graph-structured data requires parallel tools. We extend our work on analyzing such massive graph data with the first massively parallel algorithm for community detection that scales to current data sizes, scaling to graphs of over 122 million vertices and nearly 2 billion edges in under 7300 seconds on a massively multithreaded Cray XMT. Our algorithm achieves moderate parallel scalability without sacrificing sequential operational complexity. Community detection partitions a graph into subgraphs more densely connected within the subgraph than to the rest of the graph. We take an agglomerative approach similar to Clauset, Newman, and Moore's sequential algorithm, merging pairs of connected intermediate subgraphs to optimize different graph properties. Working in parallel opens new approaches to high performance. On smaller data sets, we find the output's modularity compares well with the standard sequential algorithms.},
  acc-note = {(134/243 papers accepted, 55\% acceptance rate)}
}

@inproceedings{ppopp11-graph,
  author = {David A. Bader and David Ediger and E. Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title = {Parallel Programming for Graph Analysis},
  booktitle = {16th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming (PPoPP)},
  role = {tutorial},
  tags = {parallel; graph},
  year = 2011,
  month = feb,
  dom = 12,
  address = {San Antonio, TX},
  url = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2011.html},
  abstract = {An increasingly fast-paced, digital world has produced an ever-growing volume of petabyte-sized datasets. At the same time, terabytes of new, unstructured data arrive daily. As the desire to ask more detailed questions about these massive streams has grown, parallel software and hardware have only recently begun to enable complex analytics in this non-scientific space. In this tutorial, we will discuss the open problems facing us with analyzing this ``data deluge''. We will present algorithms and data structures capable of analyzing spatio-temporal data at massive scale on parallel systems. We will try to understand the difficulties and bottlenecks in parallel graph algorithm design on current systems and will show how multithreaded and hybrid systems can overcome these challenges. We will demonstrate how parallel graph algorithms can be implemented on a variety of architectures using different programming models. The goal of this tutorial is to provide a comprehensive introduction to the field of parallel graph analysis to an audience with computing background, interested in participating in research and/or commercial applications of this field. Moreover, we will cover leading-edge technical and algorithmic developments in the field and discuss open problems and potential solutions.}
}

@inproceedings{ppopp12-graph,
  author = {David Ediger and Jason Riedy and Rob McColl and David A. Bader},
  ejr-withauthor = {David Ediger and Rob McColl and David A. Bader},
  title = {Parallel Programming for Graph Analysis},
  booktitle = {17th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming (PPoPP)},
  role = {tutorial},
  tags = {parallel; graph},
  year = 2012,
  month = feb,
  dom = 26,
  address = {New Orleans, LA},
  url = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2012.html},
  abstract = {An increasingly fast-paced, digital world has produced an ever-growing volume of petabyte-sized datasets. At the same time, terabytes of new, unstructured data arrive daily. As the desire to ask more detailed questions about these massive streams has grown, parallel software and hardware have only recently begun to enable complex analytics in this non-scientific space. In this tutorial, we will discuss the open problems facing us with analyzing this ``data deluge''. We will present algorithms and data structures capable of analyzing spatio-temporal data at massive scale on parallel systems. We will try to understand the difficulties and bottlenecks in parallel graph algorithm design on current systems and will show how multithreaded and hybrid systems can overcome these challenges. We will demonstrate how parallel graph algorithms can be implemented on a variety of architectures using different programming models. The goal of this tutorial is to provide a comprehensive introduction to the field of parallel graph analysis to an audience with computing background, interested in participating in research and/or commercial applications of this field. Moreover, we will cover leading-edge technical and algorithmic developments in the field and discuss open problems and potential solutions.}
}

@incollection{rwp10,
  author = {E. Jason Riedy},
  editor = {Dana Martin Guthrie},
  booktitle = {Read Write Poem NaPoWriMo Anthology},
  title = {Here, on the farthest point of the peninsula},
  publisher = {\href{http://issuu.com}{issuu.com}},
  year = {2010},
  month = sep,
  dom = {15},
  pages = {86},
  url = {http://issuu.com/readwritepoem/docs/read_write_poem_napowrimo_anthology},
  myurl = {http://lovesgoodfood.com/jason/posts/post-0099/},
  role = {poetry},
  tags = {beach; napowrimo; poetry; rwp}
}

@techreport{seed-set-tr,
  author = {Riedy, Jason and Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa},
  ejr-withauthor = {Bader, David A. and Jiang, Karl and Pande, Pushkar and Sharma, Richa},
  title = {Detecting Communities from Given Seeds in Social Networks},
  institution = {Georgia Institute of Technology},
  year = 2011,
  number = {GT-CSE-11-01},
  month = feb,
  dom = 22,
  url = {http://hdl.handle.net/1853/36980},
  role = {techreport},
  abstract = {Analyzing massive social networks challenges both high-performance computers and human understanding. These massive networks cannot be visualized easily, and their scale makes applying complex analysis methods computationally expensive. We present a region-growing method for finding a smaller, more tractable subgraph, a community, given a few example seed vertices. Unlike existing work, we focus on a small number of seed vertices, from two to a few dozen. We also present the first comparison between five algorithms for expanding a small seed set into a community. Our comparison applies these algorithms to an R-MAT generated graph component with 240 thousand vertices and 32 million edges and evaluates the community size, modularity, Kullback-Leibler divergence, conductance, and clustering coefficient. We find that our new algorithm with a local modularity maximizing heuristic based on Clauset, Newman, and Moore performs very well when the output is limited to 100 or 1000 vertices. When run without a vertex size limit, a heuristic from McCloskey and Bader generates communities containing around 60\% of the graph's vertices and having a small conductance and modularity appropriate to the result size. A personalized PageRank algorithm based on Andersen, Lang, and Chung also performs well with respect to our metrics.},
  tags = {graph; social network}
}

@misc{siam-cse03,
  author = {E. Jason Riedy},
  title = {Parallel Bipartite Matching for Sparse Matrix Computations},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  month = feb,
  year = {2003},
  role = {poster},
  tags = {siam; parallel algorithms; combinatorial optimization; sparse matrix},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/siam-cse03-poster.pdf}
}

@misc{siamcse13-largescalegraph,
  author = {David A. Bader and Henning Meyerhenke and Jason Riedy},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke},
  title = {Applications and Challenges in Large-scale Graph Analysis},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  month = feb,
  year = 2013,
  address = {Boston, MA},
  role = {presentation},
  tags = {siam; parallel algorithms}
}

@misc{siamcse13-streaminggraph,
  author = {Robert C. McColl and David Ediger and David A. Bader and Jason Riedy},
  ejr-withauthor = {Robert C. McColl and David Ediger and David A. Bader},
  title = {Analyzing Graph Structure in Streaming Data with {STINGER}},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  month = feb,
  year = 2013,
  address = {Boston, MA},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms}
}

@incollection{smallstone10,
  author = {Jason Riedy},
  editor = {Fiona Robyn and Kaspalita},
  booktitle = {pay attention: a river of stones},
  title = {The storm's coming when the chickens spread out},
  publisher = {\href{http://lulu.com}{lulu.com}},
  year = {2011},
  month = mar,
  dom = {2},
  pages = 77,
  myurl = {http://lovesgoodfood.com/jason/posts/river-of-stones-7/},
  url = {http://www.lulu.com/product/file-download/pay-attention-a-river-of-stones/15057101},
  role = {poetry},
  tags = {poetry; aros; riverofstones}
}

@misc{sparse-ds-csc04,
  author = {E. Jason Riedy},
  title = {Sparse Data Structures for Weighted Bipartite Matching},
  howpublished = {SIAM Workshop on Combinatorial Scientific Computing},
  dom = {28},
  month = feb,
  year = {2004},
  role = {presentation},
  tags = {siam; combinatorial optimization; sparse matrix},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/csc04.pdf}
}

@inproceedings{stinger-hpec12,
  author = {David Ediger and Robert McColl and Jason Riedy and David A. Bader},
  ejr-withauthor = {David Ediger and Robert McColl and David A. Bader},
  title = {{STINGER}: High Performance Data Structure for Streaming Graphs},
  tags = {parallel; graph; streaming},
  booktitle = {The IEEE High Performance Extreme Computing Conference (HPEC)},
  year = 2012,
  month = sep,
  address = {Waltham, MA},
  note = {Best paper award},
  dom = 12,
  role = {proceedings}
}

@unpublished{tera-ubench,
  author = {E. Jason Riedy and Rich Vuduc},
  ejr-withauthor = {Rich Vuduc},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/Tera.pdf},
  title = {Microbenchmarking the {Tera} {MTA}},
  note = {Cited, \href{http://purl.oclc.org/NET/jason-riedy/resume/material/Tera-presentation.pdf}{presentation version} available},
  other-url = {http://purl.oclc.org/NET/jason-riedy/resume/material/Tera-presentation.pdf},
  dom = 21,
  month = may,
  year = {1999},
  abstract = {The Tera Multithreaded Architecture, or MTA, addresses scalable shared memory system design with a different approach; it tolerates latency through providing fast access to multiple threads of execution. The MTA employs a number of radical design ideas: creation of hardware threads (streams) with frequent context switching; full-empty bits for each memory word; a flat memory hierarchy; and deep pipelines. Recent evaluations of the MTA have taken a top-down approach: port applications and application benchmarks, and compare the absolute performance with conventional systems. While useful, these studies do not reveal the effect of the Tera MTA's unique hardware features on an application. We present a bottom-up approach to the evaluation of the MTA via a suite of microbenchmarks to examine in detail the underlying hardware mechanisms and the cost of runtime system support for multithreading. In particular, we measure memory, network, and instruction latencies; memory bandwidth; the cost of low-level synchronization via full-empty bits; overhead for stream management; and the effects of software pipelining. These data should provide a foundation for performance modeling on the MTA. We also present results for list ranking on the MTA, an application which has traditionally been difficult to scale on conventional parallel systems.},
  role = {unpublished},
  tags = {parallel programming; parallel algorithms; multithreaded; computer architecture; cray}
}

@techreport{tridiag-lawn,
  author = {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
  ejr-withauthor = {Osni A. Marques and Christof V{\"o}mel},
  title = {Benefits of {IEEE-754} Features in Modern Symmetric Tridiagonal Eigensolvers},
  type = {LAPACK Working Note},
  number = {172},
  lawn = {172},
  institution = {Netlib},
  month = sep,
  year = {2005},
  dom = 30,
  note = {Also issued as UCB//CSD-05-1414; expanded from SISC version},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn172.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2005/6514.html},
  role = {techreport},
  tags = {lawn; lapack; floating point; ieee754; eigenvalue}
}

@article{tridiag-sisc,
  author = {Osni A. Marques and E. Jason Riedy and Christof V{\"o}mel},
  ejr-withauthor = {Osni A. Marques and Christof V{\"o}mel},
  title = {Benefits of {IEEE-754} Features in Modern Symmetric Tridiagonal Eigensolvers},
  journal = {SIAM Journal on Scientific Computing},
  year = {2006},
  month = sep,
  dom = 28,
  volume = {28},
  number = {5},
  pages = {1613--1633},
  role = {refereed},
  tags = {siam; sisc; floating point; eigenvalue; ieee754},
  doi = {10.1137/050641624},
  issn = {1064-8275},
  mrclass = {65F15},
  mrnumber = {2272181},
  abstract = {Bisection is one of the most common methods used to compute the eigenvalues of symmetric tridiagonal matrices. Bisection relies on the Sturm count: For a given shift sigma, the number of negative pivots in the factorization $T - \sigma I = LDL^T$ equals the number of eigenvalues of T that are smaller than sigma. In IEEE-754 arithmetic, the value $\infty$ permits the computation to continue past a zero pivot, producing a correct Sturm count when $T$ is unreduced. Demmel and Li showed [IEEE Trans. Comput., 43 (1994), pp. 983--992] that using $\infty$ rather than testing for zero pivots within the loop could significantly improve performance on certain architectures. When eigenvalues are to be computed to high relative accuracy, it is often preferable to work with $LDL^T$ factorizations instead of the original tridiagonal $T$. One important example is the MRRR algorithm. When bisection is applied to the factored matrix, the Sturm count is computed from $LDL^T$ which makes differential stationary and progressive qds algorithms the methods of choice. While it seems trivial to replace $T$ by $LDL^T$, in reality these algorithms are more complicated: In IEEE-754 arithmetic, a zero pivot produces an overflow followed by an invalid exception (NaN, or ``Not a Number'') that renders the Sturm count incorrect. We present alternative, safe formulations that are guaranteed to produce the correct result.
Benchmarking these algorithms on a variety of platforms shows that the original formulation without tests is always faster provided that no exception occurs. The transforms see speed-ups of up to 2.6x over the careful formulations. Tests on industrial matrices show that encountering exceptions in practice is rare. This leads to the following design: First, compute the Sturm count by the fast but unsafe algorithm. Then, if an exception occurs, recompute the count by a safe, slower alternative. The new Sturm count algorithms improve the speed of bisection by up to 2x on our test matrices. Furthermore, unlike the traditional tiny-pivot substitution, proper use of IEEE-754 features provides a careful formulation that imposes no input range restrictions.},
  gt-role = {Principal author and developer.}
}

@inproceedings{wssspe1,
  author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and Manish Parashar and Jason Riedy and David Bader and Richard Vuduc},
  ejr-withauthor = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna and Manish Parashar and David Bader and Richard Vuduc},
  title = {Sustainable Software Development for Next-Gen Sequencing (NGS) Bioinformatics on Emerging Platforms},
  booktitle = {First Workshop on Sustainable Software for Science: Practice and Experiences (WSSSPE1)},
  year = 2013,
  month = nov,
  dom = 17,
  address = {Denver, CO},
  note = {held in conjunction with SC13, published electronically (\url{http://wssspe.researchcomputing.org.uk/})},
  url = {http://arxiv.org/abs/1309.1828}
}

*This file was generated by
bibtex2html 1.97.*