
Publications, presentations, and other written artifacts

ejr.bib

@comment{{This file has been generated by bib2bib 1.99}}
@comment{{Command line: /usr/bin/bib2bib -s $date ejr-refereed.bib ejr-bookish.bib ejr-proceedings.bib ejr-invited-pres.bib ejr-tutorials.bib ejr-tr.bib ejr-pres.bib ejr-unpub.bib ejr-other.bib}}
@inproceedings{ia-cost,
  author = {Joseph N. Wilson and E. Jason Riedy},
  ejr-withauthor = {Joseph N. Wilson},
  title = {Efficient {SIMD} evaluation of image processing
                  programs},
  booktitle = {Parallel and Distributed Methods for Image
                  Processing},
  pages = {199--210},
  year = {1997},
  month = jul,
  dom = 28,
  editor = {Hongchi Shi and Patrick C. Coffield},
  volume = {3166},
  address = {San Diego, CA},
  organization = {SPIE},
  role = {proceedings},
  tags = {spie; image algebra; parallel algorithms},
  file = {material/ia-cost.pdf},
  doi = {10.1117/12.279618},
  abstract = {SIMD parallel systems have been employed for image
                  processing and computer vision applications since
                  their inception.  This paper describes a system in
                  which parallel programs are implemented using a
                  machine-independent, retargetable object library
                  that provides SIMD execution on the Lockheed Martin
                  PAL-I SIMD parallel processor. Programs' performance
                  on this machine is improved through on-the-fly
                  execution analysis and scheduling. We describe the
                  relevant elements of the system structure, the
                  general scheme for execution analysis, and the
                  current cost model for scheduling.},
  projtag = {image-algebra}
}
@incollection{ia-simd-chapter,
  file = {material/ia-simd-chap.pdf},
  title = {An {Image} {Algebra} Based {SIMD} Image Processing
                  Environment},
  author = {Joseph N. Wilson and E. Jason Riedy and Gerhard
                  X. Ritter and Hongchi Shi},
  booktitle = {Visual Information Representation, Communication,
                  and Image Processing},
  publisher = {Marcel Dekker},
  year = {1999},
  address = {New York},
  editor = {C. W. Chen and Y. Q. Zhang},
  pages = {523--542},
  abstract = {SIMD parallel computers have been employed for image
                  related applications since their inception. They
                  have been leading the way in improving processing
                  speed for those applications. However, current
                  parallel programming technologies have not kept pace
                  with the performance growth and cost decline of
                  parallel hardware. A highly usable parallel software
                  development environment is needed. This chapter
                  presents a computing environment that integrates a
                  SIMD mesh architecture with image algebra for
                  high-performance image processing applications. The
                  environment describes parallel programs through a
                  machine-independent, retargetable image algebra
                  object library that supports SIMD execution on the
                  Lockheed Martin PAL-I parallel computer. Program
                  performance on this machine is improved through
                  on-the-fly execution analysis and scheduling. We
                  describe the relevant elements of the system
                  structure, outline the scheme for execution
                  analysis, and provide examples of the current cost
                  model and scheduling system.},
  optciteseer = {wilson97image.html},
  ejr-withauthor = {Joseph N. Wilson and Gerhard X. Ritter and Hongchi
                  Shi},
  isbn = {082471928X},
  role = {chapter},
  tags = {image algebra; parallel algorithms},
  projtag = {image-algebra}
}
@unpublished{tera-ubench,
  author = {E. Jason Riedy and Rich Vuduc},
  ejr-withauthor = {Rich Vuduc},
  file = {material/Tera.pdf},
  title = {Microbenchmarking the {Tera} {MTA}},
  note = {Cited},
  other-url = {http://purl.oclc.org/NET/jason-riedy/resume/material/Tera-presentation.pdf},
  dom = 21,
  month = may,
  year = {1999},
  abstract = {The Tera Multithreaded Architecture, or MTA,
                  addresses scalable shared memory system design with
                  a different approach; it tolerates latency through
                  providing fast access to multiple threads of
                  execution. The MTA employs a number of radical
                  design ideas: creation of hardware threads (streams)
                  with frequent context switching; full-empty bits for
                  each memory word; a flat memory hierarchy; and deep
                  pipelines. Recent evaluations of the MTA have taken
                  a top-down approach: port applications and
                  application benchmarks, and compare the absolute
                  performance with conventional systems. While useful,
                  these studies do not reveal the effect of the Tera
                  MTA's unique hardware features on an application. We
                  present a bottom-up approach to the evaluation of
                  the MTA via a suite of microbenchmarks to examine in
                  detail the underlying hardware mechanisms and the
                  cost of runtime system support for
                  multithreading. In particular, we measure memory,
                  network, and instruction latencies; memory
                  bandwidth; the cost of low-level synchronization via
                  full-empty bits; overhead for stream management; and
                  the effects of software pipelining. These data
                  should provide a foundation for performance modeling
                  on the MTA. We also present results for list ranking
                  on the MTA, an application which has traditionally
                  been difficult to scale on conventional parallel
                  systems.},
  role = {unpublished},
  tags = {parallel programming; parallel algorithms; multithreaded; computer architecture; cray},
  projtag = {cassmt}
}
@unpublished{power-control,
  author = {E. Jason Riedy and Robert Szewczyk},
  ejr-withauthor = {Robert Szewczyk},
  title = {Power and Control in Networked Sensors},
  note = {Cited},
  month = may,
  dom = 11,
  year = {2000},
  file = {material/power-and-control.pdf},
  role = {unpublished},
  tags = {sensor network},
  abstract = {The fundamental constraint on a networked sensor is
                  its energy consumption, since it may be either
                  impossible or not feasible to replace its energy
                  source.  We analyze the power dissipation
                  implications of implementing the network sensor with
                  either a central processor switching between I/O
                  devices or a family of processors, each dedicated to
                  a single device.  We present the energy measurements
                  of the current generations of networked sensors, and
                  develop an abstract description of tradeoffs between
                  both designs.},
  citeseer = {riedy00power.html}
}
@unpublished{fp-type-project,
  author = {E. Jason Riedy},
  title = {Type System Support for Floating-Point Computation},
  month = may,
  dom = 25,
  file = {material/type-support-for-fp.pdf},
  abstract = {Floating-point arithmetic is often seen as
                  untrustworthy. We show how manipulating precisions
                  according to the following rules of thumb enhances
                  the reliability of and removes surprises from
                  calculations: Store data narrowly, compute
                  intermediates widely, and derive properties
                  widely.  Further, we describe a typing system for
                  floating point that both supports and is supported
                  by these rules. A single type is established for all
                  intermediate computations. The type describes a
                  precision at least as wide as all inputs to and
                  results from the computation. Picking a single type
                  provides benefits to users, compilers, and
                  interpreters. The type system also extends cleanly
                  to encompass intervals and higher precisions.},
  year = {2001},
  role = {unpublished},
  tags = {programming language; floating point; ieee754},
  projtag = {ieee754}
}
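
A minimal illustration (not from the paper) of its "store data narrowly,
compute intermediates widely" rule, sketched with NumPy since Python's own
floats are already double precision: the data stay in float32, but the sum
is accumulated in float64 and rounded back once.

    import numpy as np

    x = np.full(1_000_000, 0.1, dtype=np.float32)   # narrow storage
    s_narrow = x.sum(dtype=np.float32)              # intermediates at storage width
    s_wide = np.float32(x.sum(dtype=np.float64))    # wide intermediates, one final rounding
    # s_wide is at least as accurate a float32 result for the stored data.
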
@misc{bascd2002-poster,
  author = {E. Jason Riedy},
  title = {Parallel Bipartite Matching for Sparse Matrix Computation},
  howpublished = {Third Bay Area Scientific Computing Day},
  month = mar,
  year = {2002},
  role = {poster},
  address = {Livermore, CA},
  tags = {bascd; sparse matrix; combinatorial optimization; parallel algorithms},
  projtag = {sparse-methods}
}
@misc{ieee754-exceptions,
  author = {David Bindel and E. Jason Riedy},
  ejr-withauthor = {David Bindel},
  title = {Exception Handling Interfaces, Implementations, and
                  Evaluation},
  howpublished = {IEEE-754r revision meeting},
  month = aug,
  year = {2002},
  url = {http://grouper.ieee.org/groups/754/meeting-materials/2002-08-22-pres.pdf},
  role = {presentation},
  tags = {ieee754; floating point},
  file = {material/ieee754-2002-08-22-pres.pdf},
  projtag = {ieee754}
}
@misc{siam-cse03,
  author = {E. Jason Riedy},
  title = {Parallel Bipartite Matching for Sparse Matrix
                  Computations},
  howpublished = {SIAM Conference on Computational Science and
                  Engineering},
  month = feb,
  year = {2003},
  role = {poster},
  tags = {siam; parallel algorithms; combinatorial
                  optimization; sparse matrix},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/siam-cse03-poster.pdf},
  file = {material/siam-cse03-poster.pdf},
  abstract = {Practical and efficient methods exist for
                  parallelizing the numerical work in sparse matrix
                  calculations. The initial symbolic analysis is now
                  becoming a sequential bottleneck, limiting problems'
                  sizes. One such analysis is the weighted bipartite
                  matching used to achieve scalable, unsymmetric $LU$
                  factorization in Super\textsc{lu}. Applying a
                  mathematical optimization algorithm produces a
                  distributed-memory implementation with explicit
                  trade-offs between speed and matching quality. We
                  present accuracy and performance results for this
                  phase alone and in the context of Super\textsc{lu}.},
  projtag = {sparse-methods}
}
@misc{siam-am03,
  author = {E. Jason Riedy},
  title = {Practical Alternatives for Parallel Pivoting},
  howpublished = {SIAM Annual Meeting},
  month = jun,
  year = {2003},
  role = {presentation},
  tags = {siam; sparse matrix; linear algebra},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/siam-am03.pdf},
  file = {material/siam-am03.pdf},
  abstract = {Traditional pivoting during parallel, unsymmetric $LU$
                  factorization introduces heavy communication and
                  restructuring costs. Possible alternatives include
                  pre-pivoting to place heavy elements along the
                  diagonal and limited pivoting that maintains the
                  factors' structures. Each alternative comes with
                  trade-offs that affect accuracy and performance.},
  projtag = {sparse-methods}
}
@misc{sparse-ds-csc04,
  author = {E. Jason Riedy},
  title = {Sparse Data Structures for Weighted Bipartite Matching},
  howpublished = {SIAM Workshop on Combinatorial Scientific
                  Computing},
  dom = {28},
  month = feb,
  year = {2004},
  role = {presentation},
  tags = {siam; combinatorial optimization; sparse matrix},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/csc04.pdf},
  file = {material/csc04.pdf},
  projtag = {sparse-methods}
}
@misc{par-bipart-pp04,
  author = {E. Jason Riedy},
  title = {Parallel Weighted Bipartite Matching and
                  Applications},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {27},
  month = feb,
  year = {2004},
  role = {presentation; minisymposium speaker},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/pp04.pdf},
  file = {material/pp04.pdf},
  tags = {siam; combinatorial optimization; parallel
                  algorithms; sparse matrix},
  abstract = {Bipartite matching is one of graph theory's
                  workhorses, occurring in the solution or
                  approximation of many problems. Increasingly,
                  applications' data spans multiple memory spaces, but
                  there is little recent experience with distributed
                  matching algorithms. We present a distributed,
                  parallel implementation for weighted bipartite
                  matching based on Bertsekas's auction algorithm. The
                  bidding process finds local matchings while
                  summarizing updates for occasional communication,
                  leading to superlinear speed-ups on some sparse
                  problems and modest performance on others.},
  projtag = {sparse-methods}
}
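
A rough, sequential sketch of Bertsekas's auction algorithm for a dense
benefit matrix, the kernel underlying the distributed implementation
described above (names and structure here are illustrative, not the talk's
code):

    def auction_matching(a, eps=1e-3):
        """Match rows of the dense n-by-n benefit matrix a to columns,
        maximizing total benefit.  Each unassigned bidder raises the
        price of its best object just past the second-best margin."""
        n = len(a)
        price = [0.0] * n
        owner = [None] * n            # owner[j]: row currently holding column j
        match = [None] * n            # match[i]: column assigned to row i
        unassigned = list(range(n))
        while unassigned:
            i = unassigned.pop()
            values = [a[i][j] - price[j] for j in range(n)]
            best_j = max(range(n), key=values.__getitem__)
            second = max((v for j, v in enumerate(values) if j != best_j),
                         default=values[best_j])
            price[best_j] += values[best_j] - second + eps   # the bid
            if owner[best_j] is not None:                    # outbid previous holder
                match[owner[best_j]] = None
                unassigned.append(owner[best_j])
            owner[best_j] = i
            match[i] = best_j
        return match

The distributed version in the talk lets bidders find local matchings while
summarizing price updates for occasional communication.
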
@techreport{axb-itref-lawn,
  author = {James W. Demmel and Yozo Hida and W. Kahan and
                  Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and W. Kahan and
                  Xiaoye S. Li and Sonil Mukherjee},
  title = {Error bounds from extra-precise iterative
                  refinement},
  type = {LAPACK Working Note},
  institution = {Netlib},
  year = {2005},
  number = {165},
  lawn = {165},
  month = feb,
  dom = 3,
  role = {techreport},
  tags = {lawn; lapack; linear algebra; floating point},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  note = {Also issued as UCB//CSD-05-1414, UT-CS-05-547, and
                  LBNL-56965; expanded from TOMS version},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn165.pdf},
  file = {material/lawn165.pdf},
  abstract = {We present the design and testing of an algorithm
                  for iterative refinement of the solution of linear
                  equations, where the residual is computed with extra
                  precision. This algorithm was originally proposed in
                  the 1960s [6, 22] as a means to compute very
                  accurate solutions to all but the most
                  ill-conditioned linear systems of equations. However
                  two obstacles have until now prevented its adoption
                  in standard subroutine libraries like LAPACK: (1)
                  There was no standard way to access the higher
                  precision arithmetic needed to compute residuals,
                  and (2) it was unclear how to compute a reliable
                  error bound for the computed solution. The
                  completion of the new BLAS Technical Forum Standard
                  [5] has recently removed the first obstacle. To
                  overcome the second obstacle, we show how a single
                  application of iterative refinement can be used to
                  compute an error bound in any norm at small cost,
                  and use this to compute both an error bound in the
                  usual infinity norm, and a componentwise relative
                  error bound.  We report extensive test results on
                  over 6.2 million matrices of dimension 5, 10, 100,
                  and 1000.  As long as a normwise
                  (resp. componentwise) condition number computed by
                  the algorithm is less than $1 /
                  \operatorname{max}\{10, \sqrt{n}\}\varepsilon_w$,
                  the computed normwise (resp. componentwise) error
                  bound is at most $2 \operatorname{max}\{10,
                  \sqrt{n}\} \cdot \varepsilon_w$, and indeed bounds
                  the true error. Here, $n$ is the matrix dimension
                  and $\varepsilon_w$ is single precision roundoff
                  error. For worse conditioned problems, we get
                  similarly small correct error bounds in over 89.4\%
                  of cases.},
  projtag = {lapack, ieee754}
}
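
A toy version of the refinement loop at the heart of this note, assuming
SciPy is available and using NumPy's longdouble for the extra-precise
residual (the actual algorithm uses the extended-precision BLAS and also
computes the normwise and componentwise error bounds the abstract
describes):

    import numpy as np
    from scipy.linalg import lu_factor, lu_solve

    def refine(A, b, steps=5):
        lu, piv = lu_factor(A)            # one working-precision factorization
        x = lu_solve((lu, piv), b)
        Aw = A.astype(np.longdouble)      # extra precision for residuals only
        bw = b.astype(np.longdouble)
        for _ in range(steps):
            r = bw - Aw.dot(x.astype(np.longdouble))  # extra-precise residual
            dx = lu_solve((lu, piv), r.astype(np.float64))
            x = x + dx                    # correct the working-precision solution
        return x

The paper's algorithm also monitors the step sizes to decide convergence and
to derive its error bounds; the fixed step count keeps this sketch short.
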
@misc{comb-sparse-cse05,
  author = {E. Jason Riedy},
  title = {Parallel Combinatorial Computing and Sparse
                  Matrices},
  howpublished = {SIAM Conference on Computational Science and
                  Engineering},
  dom = {14},
  month = feb,
  year = {2005},
  role = {presentation; minisymposium speaker},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/cse05.pdf},
  file = {material/cse05.pdf},
  tags = {combinatorial optimization; sparse matrix; parallel
                  algorithms; siam},
  abstract = {Increasingly, sparse matrix applications produce
                  matrices too large for a single computer's
                  memory. Distributed, parallel computers provide an
                  avenue around memory limitations, but distributing
                  combinatorial algorithms is historically
                  difficult. We use insights from combinatorial
                  optimization to design loosely coupled algorithms
                  for sparse matrix matching, ordering, and symbolic
                  factorization. These algorithms' performance depends
                  on both problem instance and computer
                  architecture. We investigate these aspects of
                  performance and demonstrate issues that affect
                  distributed combinatorial computing.},
  projtag = {sparse-methods}
}
@inproceedings{arith-lang,
  author = {David Hough and Bill Hay and Jeff Kidder and
                  E. Jason Riedy and Guy L. Steele Jr. and Jim Thomas},
  ejr-withauthor = {David Hough and Bill Hay and Jeff Kidder and Guy
                  L. Steele Jr. and Jim Thomas},
  title = {Arithmetic Interactions: From Hardware to
                  Applications},
  booktitle = {17th {IEEE} Symposium on Computer Arithmetic
                  ({ARITH}'05)},
  year = {2005},
  dom = 28,
  month = jun,
  note = {See
                  \href{http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf}{related
                  presentation}},
  isbn = {0-7695-2366-8},
  role = {proceedings; panel},
  tags = {ieee754; floating point},
  doi = {10.1109/ARITH.2005.10},
  abstract = {The entire process of creating and executing
                  applications that solve interesting problems with
                  acceptable cost and accuracy involves a complex
                  interaction among hardware, system software,
                  programming environments, mathematical software
                  libraries, and applications software, all mediated
                  by standards for arithmetic, operating systems, and
                  programming environments. This panel will discuss
                  various issues arising among these various
                  contending points of view, sometimes from the point
                  of view of issues raised during the current IEEE
                  754R standards revision effort.},
  projtag = {ieee754}
}
@misc{lang-tools-ieee754,
  author = {E. Jason Riedy},
  title = {Modern Language Tools and {754R}},
  howpublished = {{ARITH}'05},
  month = jun,
  dom = 28,
  year = {2005},
  optrole = {presentation; panel participant},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/arith17-slides.pdf},
  file = {material/arith17-slides.pdf},
  tags = {ieee754; floating point},
  note = {Invited presentation and panelist},
  projtag = {lapack, ieee754}
}
@techreport{tridiag-lawn,
  author = {Osni A. Marques and E. Jason Riedy and Christof
                  V{\"o}mel},
  ejr-withauthor = {Osni A. Marques and Christof V{\"o}mel},
  title = {Benefits of {IEEE-754} Features in Modern Symmetric
                  Tridiagonal Eigensolvers},
  type = {LAPACK Working Note},
  number = {172},
  lawn = {172},
  institution = {Netlib},
  month = sep,
  year = {2005},
  dom = 30,
  note = {Also issued as UCB//CSD-05-1414; expanded from SISC
                  version},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn172.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2005/6514.html},
  role = {techreport},
  tags = {lawn; lapack; floating point; ieee754; eigenvalue},
  file = {material/lawn172.pdf},
  abstract = {Bisection is one of the most common methods used to
                  compute the eigenvalues of symmetric tridiagonal
                  matrices. Bisection relies on the Sturm count: For a
                  given shift sigma, the number of negative pivots in
                  the factorization $T - \sigma I = LDL^T$ equals the
                  number of eigenvalues of T that are smaller than
                  sigma. In IEEE-754 arithmetic, the value $\infty$
                  permits the computation to continue past a zero
                  pivot, producing a correct Sturm count when $T$ is
                  unreduced. Demmel and Li showed [IEEE
                  Trans. Comput., 43 (1994), pp. 983--992] that using
                  $\infty$ rather than testing for zero pivots within
                  the loop could significantly improve performance on
                  certain architectures. When eigenvalues are to be
                  computed to high relative accuracy, it is often
                  preferable to work with $LDL^T$ factorizations
                  instead of the original tridiagonal $T$. One
                  important example is the MRRR algorithm. When
                  bisection is applied to the factored matrix, the
                  Sturm count is computed from $LDL^T$ which makes
                  differential stationary and progressive qds
                  algorithms the methods of choice. While it seems
                  trivial to replace $T$ by $LDL^T$, in reality these
                  algorithms are more complicated: In IEEE-754
                  arithmetic, a zero pivot produces an overflow
                  followed by an invalid exception (NaN, or ``Not a
                  Number'') that renders the Sturm count incorrect. We
                  present alternative, safe formulations that are
                  guaranteed to produce the correct
                  result. Benchmarking these algorithms on a variety
                  of platforms shows that the original formulation
                  without tests is always faster provided that no
                  exception occurs. The transforms see speed-ups of up
                  to 2.6x over the careful formulations. Tests on
                  industrial matrices show that encountering
                  exceptions in practice is rare. This leads to the
                  following design: First, compute the Sturm count by
                  the fast but unsafe algorithm. Then, if an exception
                  occurs, recompute the count by a safe, slower
                  alternative. The new Sturm count algorithms improve
                  the speed of bisection by up to 2x on our test
                  matrices. Furthermore, unlike the traditional
                  tiny-pivot substitution, proper use of IEEE-754
                  features provides a careful formulation that imposes
                  no input range restrictions.},
  projtag = {lapack, ieee754}
}
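
The fast-but-unsafe Sturm count contrasted in the abstract needs no
zero-pivot test because IEEE-754 arithmetic propagates an infinity through
the recurrence. A small sketch for an unreduced tridiagonal $T$ with
diagonal d and off-diagonal e (a toy dense version; the paper's algorithms
work from $LDL^T$ factorizations and add the safe fallback):

    import numpy as np

    def sturm_count(d, e, sigma):
        """Number of eigenvalues of T below sigma, via the signs of the
        pivots of T - sigma*I = L*D*L^T.  No zero-pivot test: a zero
        pivot gives e^2/0 = +inf, the next pivot becomes -inf, and the
        count remains correct when T is unreduced."""
        d = np.asarray(d, dtype=np.float64)
        e = np.asarray(e, dtype=np.float64)
        pivot = d[0] - sigma
        count = int(pivot < 0)
        with np.errstate(divide='ignore'):   # the divide-by-zero is intentional
            for k in range(1, len(d)):
                pivot = (d[k] - sigma) - e[k - 1] * e[k - 1] / pivot
                count += pivot < 0
        return int(count)
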
@misc{lapack-future,
  author = {E. Jason Riedy and Yozo Hida and James W. Demmel},
  ejr-withauthor = {Yozo Hida and James W. Demmel},
  title = {The Future of {LAPACK} and {ScaLAPACK}},
  howpublished = {Robert C. Thompson Matrix Meeting},
  dom = {18},
  month = nov,
  year = {2005},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/future-of-scalapack.pdf},
  file = {material/future-of-scalapack.pdf},
  role = {presentation},
  tags = {lapack; software engineering},
  abstract = {We are planning new releases of the widely used
                  LAPACK and ScaLAPACK numerical linear algebra
                  libraries. Based on an on-going user survey
                  (http://www.netlib.org/lapack-dev) and research by
                  many people, we are proposing the following
                  improvements: Faster algorithms (including better
                  numerical methods, memory hierarchy optimizations,
                  parallelism, and automatic performance tuning to
                  accommodate new architectures), more accurate
                  algorithms (including better numerical methods, and
                  use of extra precision), expanded functionality
                  (including updating and downdating, new
                  eigenproblems, etc. and putting more of LAPACK into
                  ScaLAPACK), and improved ease of use (friendlier
                  interfaces in multiple languages). To accomplish
                  these goals we are also relying on better software
                  engineering techniques and contributions from
                  collaborators at many institutions. This is joint
                  work with Jack Dongarra.},
  projtag = {lapack, ieee754}
}
@misc{bascd2006-poster,
  author = {E. Jason Riedy},
  title = {Making Static Pivoting Dependable},
  howpublished = {Seventh Bay Area Scientific Computing Day},
  month = mar,
  year = {2006},
  role = {poster},
  address = {Livermore, CA},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2006-poster.pdf},
  file = {material/bascd2006-poster.pdf},
  tags = {bascd; sparse matrix; linear algebra},
  abstract = {For sparse LU factorization, dynamic pivoting
                  tightly couples symbolic and numerical
                  computation. Dynamic structural changes limit
                  parallel scalability.  Demmel and Li use static
                  pivoting in distributed SuperLU for performance, but
                  intentionally perturbing the input may lead silently
                  to erroneous results.  Are there experimentally
                  stable static pivoting heuristics that lead to a
                  dependable direct solver? The answer is currently a
                  qualified yes. Current heuristics fail on a few
                  systems, but all failures are detectable.},
  projtag = {lapack, sparse-methods}
}
@article{axb-itref-toms,
  author = {James W. Demmel and Yozo Hida and W. Kahan and
                  Xiaoye S. Li and Sonil Mukherjee and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and W. Kahan and
                  Xiaoye S. Li and Sonil Mukherjee},
  title = {Error bounds from extra-precise iterative
                  refinement},
  journal = {{ACM} Transactions on Mathematical Software},
  year = {2006},
  volume = {32},
  number = {2},
  pages = {325--351},
  month = jun,
  role = {refereed},
  tags = {acm; toms; lapack; floating point; linear algebra},
  doi = {10.1145/1141885.1141894},
  issn = {0098-3500},
  mrclass = {65F10},
  mrnumber = {2272365},
  abstract = {We present the design and testing of an algorithm
                  for iterative refinement of the solution of linear
                  equations where the residual is computed with extra
                  precision. This algorithm was originally proposed in
                  1948 and analyzed in the 1960s as a means to compute
                  very accurate solutions to all but the most
                  ill-conditioned linear systems. However, two
                  obstacles have until now prevented its adoption in
                  standard subroutine libraries like LAPACK: (1) There
                  was no standard way to access the higher precision
                  arithmetic needed to compute residuals, and (2) it
                  was unclear how to compute a reliable error bound
                  for the computed solution. The completion of the new
                  BLAS Technical Forum Standard has essentially
                  removed the first obstacle. To overcome the second
                  obstacle, we show how the application of iterative
                  refinement can be used to compute an error bound in
                  any norm at small cost and use this to compute both
                  an error bound in the usual infinity norm, and a
                  componentwise relative error bound.},
  gt-role = {Assisted in software development, experimental
                  methods, writing, and editing.},
  file = {material/gesvxx.pdf},
  projtag = {lapack, ieee754}
}
@inproceedings{lapack-prospectus,
  author = {James W. Demmel and Jack Dongarra and Beresford
                  Parlett and W. Kahan and Ming Gu and David Bindel
                  and Yozo Hida and Xiaoye S. Li and Osni A. Marques
                  and E. Jason Riedy and Christof V{\"o}mel and Julien
                  Langou and Piotr Luszczek and Jakub Kurzak and
                  Alfredo Buttari and Julie Langou and Stanimire
                  Tomov},
  ejr-withauthor = {James W. Demmel and Jack Dongarra and Beresford
                  Parlett and W. Kahan and Ming Gu and David Bindel
                  and Yozo Hida and Xiaoye S. Li and Osni A. Marques
                  and Christof V{\"o}mel and Julien Langou and Piotr
                  Luszczek and Jakub Kurzak and Alfredo Buttari and
                  Julie Langou and Stanimire Tomov},
  title = {Prospectus for the Next {LAPACK} and {ScaLAPACK}
                  Libraries},
  booktitle = {{PARA'06}: State-of-the-Art in Scientific and
                  Parallel Computing},
  year = {2006},
  address = {Ume{\aa}, Sweden},
  month = jun,
  organization = {High Performance Computing Center North ({HPC2N})
                  and the Department of Computing Science, Ume{\aa}
                  University},
  publisher = {Springer},
  role = {proceedings},
  tags = {lapack},
  url = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/para06-lapack.pdf},
  abstract = {LAPACK and ScaLAPACK are widely used software
                  libraries for numerical linear algebra. There have
                  been over 68M web hits at www.netlib.org for the
                  associated libraries LAPACK, ScaLAPACK, CLAPACK and
                  LAPACK95. LAPACK and ScaLAPACK are used to solve
                  leading edge science problems and they have been
                  adopted by many vendors and software providers as
                  the basis for their own libraries, including AMD,
                  Apple (under Mac OS X), Cray, Fujitsu, HP, IBM,
                  Intel, NEC, SGI, several Linux distributions (such
                  as Debian), NAG, IMSL, the MathWorks (producers of
                  MATLAB), Interactive Supercomputing, and PGI.
                  Future improvements in these libraries will
                  therefore have a large impact on users.},
  doi = {10.1007/978-3-540-75755-9\_2},
  file = {material/lapack-prospectus.pdf},
  projtag = {lapack}
}
@unpublished{lapack-style,
  author = {Jack Dongarra and Julien Langou and E. Jason Riedy},
  ejr-withauthor = {Jack Dongarra and Julien Langou},
  title = {Sca/{LAPACK} Program Style},
  month = aug,
  year = {2006},
  role = {unpublished},
  tags = {lapack},
  url = {http://www.netlib.org/lapack-dev/lapack-coding/program-style.html},
  abstract = {The purpose of this document is to facilitate
                  contributions to LAPACK and ScaLAPACK by documenting
                  their design and implementation guidelines. The
                  long-term goal is to provide guidelines for both
                  LAPACK and ScaLAPACK. However, the parallel
                  ScaLAPACK code has more open issues, so this
                  document primarily concerns LAPACK.},
  projtag = {lapack}
}
@article{tridiag-sisc,
  author = {Osni A. Marques and E. Jason Riedy and Christof
                  V{\"o}mel},
  ejr-withauthor = {Osni A. Marques and Christof V{\"o}mel},
  title = {Benefits of {IEEE-754} Features in Modern Symmetric
                  Tridiagonal Eigensolvers},
  journal = {SIAM Journal on Scientific Computing},
  year = {2006},
  month = sep,
  dom = 28,
  volume = {28},
  number = {5},
  pages = {1613--1633},
  role = {refereed},
  tags = {siam; sisc; floating point; eigenvalue; ieee754},
  doi = {10.1137/050641624},
  issn = {1064-8275},
  mrclass = {65F15},
  mrnumber = {2272181},
  abstract = {Bisection is one of the most common methods used to
                  compute the eigenvalues of symmetric tridiagonal
                  matrices. Bisection relies on the Sturm count: For a
                  given shift sigma, the number of negative pivots in
                  the factorization $T - \sigma I = LDL^T$ equals the
                  number of eigenvalues of T that are smaller than
                  sigma. In IEEE-754 arithmetic, the value $\infty$
                  permits the computation to continue past a zero
                  pivot, producing a correct Sturm count when $T$ is
                  unreduced. Demmel and Li showed [IEEE
                  Trans. Comput., 43 (1994), pp. 983--992] that using
                  $\infty$ rather than testing for zero pivots within
                  the loop could significantly improve performance on
                  certain architectures. When eigenvalues are to be
                  computed to high relative accuracy, it is often
                  preferable to work with $LDL^T$ factorizations
                  instead of the original tridiagonal $T$. One
                  important example is the MRRR algorithm. When
                  bisection is applied to the factored matrix, the
                  Sturm count is computed from $LDL^T$ which makes
                  differential stationary and progressive qds
                  algorithms the methods of choice. While it seems
                  trivial to replace $T$ by $LDL^T$, in reality these
                  algorithms are more complicated: In IEEE-754
                  arithmetic, a zero pivot produces an overflow
                  followed by an invalid exception (NaN, or ``Not a
                  Number'') that renders the Sturm count incorrect. We
                  present alternative, safe formulations that are
                  guaranteed to produce the correct
                  result. Benchmarking these algorithms on a variety
                  of platforms shows that the original formulation
                  without tests is always faster provided that no
                  exception occurs. The transforms see speed-ups of up
                  to 2.6x over the careful formulations. Tests on
                  industrial matrices show that encountering
                  exceptions in practice is rare. This leads to the
                  following design: First, compute the Sturm count by
                  the fast but unsafe algorithm. Then, if an exception
                  occurs, recompute the count by a safe, slower
                  alternative. The new Sturm count algorithms improve
                  the speed of bisection by up to 2x on our test
                  matrices. Furthermore, unlike the traditional
                  tiny-pivot substitution, proper use of IEEE-754
                  features provides a careful formulation that imposes
                  no input range restrictions.},
  gt-role = {Principal author and developer.},
  file = {material/ieee754-tridiag.pdf},
  projtag = {lapack}
}
@techreport{lapack-prospectus-lawn,
  author = {James W. Demmel and Jack Dongarra and Beresford
                  Parlett and W. Kahan and Ming Gu and David Bindel
                  and Yozo Hida and Xiaoye S. Li and Osni A. Marques
                  and E. Jason Riedy and Christof V{\"o}mel and Julien
                  Langou and Piotr Luszczek and Jakub Kurzak and
                  Alfredo Buttari and Julie Langou and Stanimire
                  Tomov},
  ejr-withauthor = {James W. Demmel and Jack Dongarra and Beresford
                  Parlett and W. Kahan and Ming Gu and David Bindel
                  and Yozo Hida and Xiaoye S. Li and Osni A. Marques
                  and Christof V{\"o}mel and Julien Langou and Piotr
                  Luszczek and Jakub Kurzak and Alfredo Buttari and
                  Julie Langou and Stanimire Tomov},
  title = {Prospectus for the Next {LAPACK} and {ScaLAPACK}
                  Libraries},
  institution = {Netlib},
  year = {2007},
  type = {LAPACK Working Note},
  number = {181},
  lawn = {181},
  month = feb,
  note = {Also issued as UT-CS-07-592},
  role = {techreport},
  tags = {lawn; lapack},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn181.pdf},
  file = {material/lawn181.pdf},
  projtag = {lapack}
}
@misc{bascd2007-poster,
  author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and
                  E. Jason Riedy and Meghana Vishvanath and David Vu},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li and
                  Meghana Vishvanath and David Vu},
  title = {Precise Solutions for Overdetermined Least Squares
                  Problems},
  howpublished = {Stanford 50 -- Eighth Bay Area Scientific Computing
                  Day},
  month = mar,
  year = {2007},
  role = {poster},
  address = {Stanford, CA},
  tags = {bascd; least squares},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/bascd2007-poster.pdf},
  abstract = {Linear least squares (LLS) fitting is the most
                  widely used data modeling technique and is included
                  in almost every data analysis system
                  (e.g. spreadsheets). These software systems often
                  give no feedback on the conditioning of the LLS
                  problem or the floating-point calculation errors
                  present in the solution.  With limited use of extra
                  precision, we can eliminate these concerns for all
                  but the most ill-conditioned LLS problems. Our
                  algorithm provides either a solution and residual
                  with relatively tiny error or a notice that the LLS
                  problem is too ill-conditioned.},
  projtag = {lapack, ieee754}
}
@techreport{lawn188,
  author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title = {Extra-precise iterative refinement for overdetermined least squares problems},
  type = {LAPACK Working Note},
  institution = {Netlib},
  year = 2007,
  number = 188,
  month = may,
  dom = 31,
  url = {http://www.netlib.org/lapack/lawnspdf/lawn188.pdf},
  file = {material/lawn188.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-77.html},
  note = {Also issued as UCB/EECS-2007-77; version accepted for TOMS.},
  abstract = {We present the algorithm, error bounds, and
                  numerical results for extra-precise iterative
                  refinement applied to overdetermined linear least
                  squares (LLS) problems. We apply our linear system
                  refinement algorithm to Bj{\"o}rck's augmented linear
                  system formulation of an LLS problem. Our algorithm
                  reduces the forward normwise and componentwise
                  errors to $O(\varepsilon)$ unless the system is too
                  ill conditioned. In contrast to linear systems, we
                  provide two separate error bounds for the solution $x$
                  and the residual $r$. The refinement algorithm
                  requires only limited use of extra precision and
                  adds only $O(mn)$ work to the $O(mn^2)$ cost of QR
                  factorization for problems of size m-by-n. The extra
                  precision calculation is facilitated by the new
                  extended-precision BLAS standard in a portable way,
                  and the refinement algorithm will be included in a
                  future release of LAPACK and can be extended to the
                  other types of least squares problems.},
  role = {techreport},
  tags = {lawn; lapack; least squares; floating point},
  projtag = {lapack, ieee754}
}
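
For context, Björck's augmented formulation that the abstract applies the
linear-system refinement to is the standard $(m+n)\times(m+n)$ system
coupling the residual $r = b - Ax$ with the solution $x$:

    \begin{pmatrix} I & A \\ A^{T} & 0 \end{pmatrix}
    \begin{pmatrix} r \\ x \end{pmatrix} =
    \begin{pmatrix} b \\ 0 \end{pmatrix}

Refining this single square system, rather than the normal equations, is
what yields the two separate error bounds, for $x$ and for $r$, that the
abstract mentions.
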
@techreport{nonneg-house-lawn,
  author = {James W. Demmel and Mark Frederick Hoemmen and Yozo
                  Hida and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Mark Frederick Hoemmen and Yozo
                  Hida},
  title = {Non-Negative Diagonals and High Performance on
                  Low-Profile Matrices from {Householder} {$QR$}},
  institution = {Netlib},
  year = {2008},
  type = {LAPACK Working Note},
  number = {203},
  lawn = {203},
  month = may,
  dom = 30,
  note = {Also issued as UCB/EECS-2008-76; modified from SISC
                  version.},
  role = {techreport},
  tags = {lawn; lapack; householder; qr},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn203.pdf},
  other-url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-76.html},
  file = {material/lawn203.pdf},
  abstract = {The Householder reflections used in LAPACK's $QR$
                  factorization leave positive and negative real
                  entries along $R$'s diagonal. This is sufficient for
                  most applications of $QR$ factorizations, but a few
                  require that $R$ have a nonnegative diagonal. This
                  note describes a new Householder generation routine
                  to produce a nonnegative diagonal. Additionally, we
                  find that scanning for trailing zeros in the
                  generated reflections leads to large performance
                  improvements when applying reflections with many
                  trailing zeros. Factoring low-profile matrices,
                  those with nonzero entries mostly near the diagonal
                  (e.g., band matrices), now requires far fewer
                  operations. For example, $QR$ factorization of
                  matrices with profile width b that are stored
                  densely in an $n\times n$ matrix improves from
                  $O(n^3)$ to $O(n^2+nb^2)$. These routines are in
                  LAPACK 3.2.},
  projtag = {lapack}
}
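
A toy generator for a Householder reflection with nonnegative beta, in the
spirit of the new generation routine this note describes for LAPACK 3.2;
the sketch skips the careful rescaling the real routine needs to avoid
cancellation when x[0] is close to $+\|x\|$:

    import numpy as np

    def householder_nonneg(x):
        """Return (v, tau, beta) with H = I - tau*v*v^T, v[0] = 1, and
        H.dot(x) = [beta, 0, ..., 0] where beta = +norm(x) >= 0."""
        x = np.asarray(x, dtype=np.float64)
        beta = np.linalg.norm(x)     # always choose the nonnegative sign
        w = x.copy()
        w[0] -= beta                 # w = x - beta*e1 (cancellation risk here)
        wn = w.dot(w)
        if wn == 0.0:                # x is already beta*e1, so H = I
            return np.zeros_like(x), 0.0, float(beta)
        v = w / w[0]                 # LAPACK-style storage with v[0] = 1
        tau = 2.0 * w[0] ** 2 / wn
        return v, tau, float(beta)

For x = [3, 4] this returns beta = 5, and x - tau*v*(v.dot(x)) evaluates
to [5, 0].
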
@techreport{ieee754-2008,
  note = {(committee member and contributor)},
  key = {IEEE Std 754-2008},
  journal = {IEEE Std 754-2008},
  type = {IEEE Std},
  number = {754-2008},
  title = {{IEEE} Standard for Floating-Point Arithmetic},
  year = {2008},
  pages = {1 -- 70},
  institution = {Microprocessor Standards Committee of the IEEE
                  Computer Society},
  abstract = {This standard specifies interchange and arithmetic
                  formats and methods for binary and decimal
                  floating-point arithmetic in computer programming
                  environments. This standard specifies exception
                  conditions and their default handling. An
                  implementation of a floating-point system conforming
                  to this standard may be realized entirely in
                  software, entirely in hardware, or in any
                  combination of software and hardware. For operations
                  specified in the normative part of this standard,
                  numerical results and exceptions are uniquely
                  determined by the values of the input data, sequence
                  of operations, and destination formats, all under
                  user control.},
  keywords = {IEEE standards;floating point
                  arithmetic;programming;IEEE standard;arithmetic
                  formats;computer programming;decimal floating-point
                  arithmetic;754-2008;NaN;arithmetic;binary;computer;decimal;exponent;floating-point;format;interchange;number;rounding;significand;subnormal},
  doi = {10.1109/IEEESTD.2008.4610935},
  isbn = {978-0-7381-5753-5},
  address = {New York, NY},
  month = aug,
  dom = 29,
  committee = {Alex Aiken and Matthew Applegate and David Bailey
                  and Steve Bass and Dileep Bhandarkar and Mahesh Bhat
                  and David Bindel and Sylvie Boldo and Stephen Canon
                  and Steven R. Carlough and Marius Cornea and Mike
                  Cowlishaw (editor) and John H. Crawford and Joseph
                  D. Darcy and Marc Daumas and Bob Davis and Mark
                  Davis and Dick Delp and Jim Demmel and Mark A. Erle
                  and Hossam A. H. Fahmy and J.P. Fasano and Richard
                  Fateman and Eric Feng and Warren E. Ferguson and
                  Alex Fit-Florea and Laurent Fournier and Chip
                  Freitag and Ivan Godard and Roger A. Golliver and
                  David Gustafson and Michel Hack and John R. Harrison
                  and John Hauser and Yozo Hida and Chris N. Hinds and
                  Graydon Hoare and David G. Hough and Jerry Huck and
                  Jim Hull and Michael Ingrassia and David V. James
                  and Rick James and William Kahan and John Kapernick
                  and Richard Karpinski and Jeff Kidder and Plamen
                  Koev and Ren-Cang Li and Zhishun Alex Liu and
                  Raymond Mak and Peter Markstein and David Matula and
                  Guillaume Melquiond and Nobuyoshi Mori and Ricardo
                  Morin and Ned Nedialkov and Craig Nelson and Stuart
                  Oberman and Jon Okada and Ian Ollmann and Michael
                  Parks and Tom Pittman and Eric Postpischil and Jason
                  Riedy and Debjit Das Sarma and Eric M. Schwarz and
                  David Scott and Don Senzig and Ilya Sharapov and Jim
                  Shearer and Michael Siu and Ron Smith and Chuck
                  Stevens and Peter Tang and Pamela J. Taylor and
                  James W. Thomas and Brandon Thompson and Wendy
                  Thrash and Neil Toda and Son Dao Trong and Leonard
                  Tsai and Charles Tsen and Fred Tydeman and Liang Kai
                  Wang and Scott Westbrook and Steve Winkler and
                  Anthony Wood and Umit Yalcinalp and Fred Zemke and
                  Paul Zimmermann and Dan Zuras (chair)},
  projtag = {ieee754}
}
@misc{cerfacs08,
  file = {material/cerfacs08.pdf},
  author = {E. Jason Riedy},
  title = {Auctions for Distributed (and Possibly Parallel)
                  Matchings},
  howpublished = {Visit to \href{http://www.cerfacs.fr/}{CERFACS}
                  courtesy of the Franco-Berkeley Fund},
  dom = {17},
  month = dec,
  year = {2008},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/cerfacs08.pdf},
  tags = {cerfacs; combinatorial optimization; sparse matrix},
  note = {Invited presentation},
  projtag = {sparse-methods}
}
@article{lsq-itref-toms,
  author = {James W. Demmel and Yozo Hida and Xiaoye S. Li and
                  E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Yozo Hida and Xiaoye S. Li},
  title = {Extra-precise iterative refinement for
                  overdetermined least squares problems},
  journal = {{ACM} Transactions on Mathematical Software},
  volume = {35},
  number = {4},
  year = 2009,
  month = feb,
  issn = {0098-3500},
  pages = {1--32},
  doi = {10.1145/1462173.1462177},
  accepted = {25 June 2008},
  role = {refereed},
  tags = {acm; toms; lapack; floating point; linear algebra},
  abstract = {We present the algorithm, error bounds, and
                  numerical results for extra-precise iterative
                  refinement applied to overdetermined linear least
                  squares (LLS) problems. We apply our linear system
                  refinement algorithm to Bj{\"o}rck's augmented
                  linear system formulation of an LLS problem. Our
                  algorithm reduces the forward normwise and
                  componentwise errors to $O(\varepsilon)$ unless the
                  system is too ill conditioned. In contrast to linear
                  systems, we provide two separate error bounds for
                  the solution $x$ and the residual $r$. The
                  refinement algorithm requires only limited use of
                  extra precision and adds only $O(mn)$ work to the
                  $O(mn^2)$ cost of QR factorization for problems of
                  size m-by-n. The extra precision calculation is
                  facilitated by the new extended-precision BLAS
                  standard in a portable way, and the refinement
                  algorithm will be included in a future release of
                  LAPACK and can be extended to the other types of
                  least squares problems.},
  gt-role = {Assisted in software development, experimental
                  methods, writing, and editing.},
  file = {material/lsq_itrefx.pdf},
  projtag = {lapack, ieee754}
}
@article{nonneg-house-lawn-sisc,
  author = {James W. Demmel and Mark Frederick Hoemmen and Yozo
                  Hida and E. Jason Riedy},
  ejr-withauthor = {James W. Demmel and Mark Frederick Hoemmen and Yozo
                  Hida},
  title = {Non-Negative Diagonals and High Performance on
                  Low-Profile Matrices from {H}ouseholder {$QR$}},
  publisher = {SIAM},
  year = 2009,
  month = jul,
  dom = 3,
  journal = {SIAM Journal on Scientific Computing},
  volume = 31,
  number = 4,
  pages = {2832--2841},
  keywords = {LAPACK; QR factorization; Householder reflection;
                  floating-point},
  doi = {10.1137/080725763},
  role = {refereed},
  tags = {siam; sisc; lapack; householder; qr},
  issn = {1064-8275},
  mrclass = {65F30},
  mrnumber = {2520301},
  gt-role = {Principal author and developer.},
  abstract = {The Householder reflections used in LAPACK's $QR$
                  factorization leave positive and negative real
                  entries along $R$'s diagonal. This is sufficient for
                  most applications of $QR$ factorizations, but a few
                  require that $R$ have a nonnegative diagonal. This
                  note describes a new Householder generation routine
                  to produce a nonnegative diagonal. Additionally, we
                  find that scanning for trailing zeros in the
                  generated reflections leads to large performance
                  improvements when applying reflections with many
                  trailing zeros. Factoring low-profile matrices,
                  those with nonzero entries mostly near the diagonal
                  (e.g., band matrices), now requires far fewer
                  operations. For example, $QR$ factorization of
                  matrices with profile width b that are stored
                  densely in an $n\times n$ matrix improves from
                  $O(n^3)$ to $O(n^2+nb^2)$. These routines are in
                  LAPACK 3.2.},
  file = {material/lawn203.pdf},
  projtag = {lapack}
}
@misc{gt09,
  file = {material/gt-2009-08-21.pdf},
  author = {E. Jason Riedy},
  title = {Dependable direct solutions for linear systems using
                  a little extra precision},
  howpublished = {CSE Seminar at Georgia Institute of Technology},
  dom = {21},
  month = aug,
  year = 2009,
  url = {http://hdl.handle.net/1853/29795},
  tags = {linear algebra; floating point; lapack},
  note = {Invited presentation},
  abstract = {Solving a square linear system $Ax=b$ often is
                  considered a black box. It's supposed to ``just
                  work,'' and failures often are blamed on the original
                  data or subtleties of floating-point. Now that we
                  have an abundance of cheap computations, however, we
                  can do much better. A little extra precision in just
                  the right places produces accurate solutions cheaply
                  or demonstrates when problems are too hard to solve
                  without significant cost. This talk will outline the
                  method, iterative refinement with a new twist; the
                  benefits, small backward and forward errors; and the
                  trade-offs and unexpected benefits.},
  projtag = {lapack, sparse-methods, ieee754}
}
@inproceedings{mtaap10,
  author = {David Ediger and Karl Jiang and E. Jason Riedy and
                  David A. Bader},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader},
  title = {Massive Streaming Data Analytics: A Case Study with
                  Clustering Coefficients},
  booktitle = {4th Workshop on Multithreaded Architectures and
                  Applications (MTAAP)},
  role = {proceedings},
  tags = {parallel; graph; streaming},
  year = 2010,
  address = {Atlanta, GA},
  month = apr,
  dom = 23,
  acc-note = {(11/22 papers accepted, 50\% acceptance rate)},
  doi = {10.1109/IPDPSW.2010.5470687},
  file = {material/StreamingCC-MTAAP10.pdf},
  abstract = {We present a new approach for parallel massive
                  graph analysis of streaming, temporal data with a
                  dynamic and extensible representation.  Handling the
                  constant stream of new data from health care,
                  security, business, and social network applications
                  requires new algorithms and data structures.  We
                  examine data structure and algorithm trade-offs that
                  extract the parallelism necessary for
                  high-performance updating analysis of massive
                  graphs.  Static analysis kernels often rely on
                  storing input data in a specific structure.
                  Maintaining these structures for each possible
                  kernel with high data rates incurs a significant
                  performance cost.  A case study computing clustering
                  coefficients on a general-purpose data structure
                  demonstrates incremental updates can be more
                  efficient than global recomputation.  Within this
                  kernel, we compare three methods for dynamically
                  updating local clustering coefficients: a
                  brute-force local recalculation, a sorting
                  algorithm, and our new approximation method using a
                  Bloom filter.  On 32 processors of a Cray XMT with a
                  synthetic scale-free graph of $2^{24} \approx 16$
                  million vertices and $2^{29} \approx 537$ million
                  edges, the brute-force method processes a mean of
                  over 50\,000 updates per second and our Bloom filter
                  approaches 200\,000 updates per second.},
  projtag = {cassmt, intel-sting}
}
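
A small sketch of the incremental idea in the abstract: when an undirected
edge (u, v) arrives, only triangles through the common neighbors of u and v
change, so local clustering data can be updated without global
recomputation. This toy uses exact set intersection; the paper's
Bloom-filter method approximates that step (all names here are
illustrative):

    def insert_edge(adj, tri, u, v):
        """adj: dict vertex -> set of neighbors; tri: dict vertex ->
        local triangle count.  Update both for a new edge (u, v)."""
        common = adj.get(u, set()) & adj.get(v, set())
        for w in common:             # each common neighbor closes a triangle
            tri[w] = tri.get(w, 0) + 1
        tri[u] = tri.get(u, 0) + len(common)
        tri[v] = tri.get(v, 0) + len(common)
        adj.setdefault(u, set()).add(v)
        adj.setdefault(v, set()).add(u)

    # Local clustering coefficient from the maintained counts:
    # c(v) = 2 * tri[v] / (deg * (deg - 1)), with deg = len(adj[v]) > 1.
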
@inproceedings{icpp10,
  author = {David Ediger and Karl Jiang and E. Jason Riedy and
                  David A. Bader and Courtney Corley and Rob Farber
                  and William N. Reynolds},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader and
                  Courtney Corley and Rob Farber and William
                  N. Reynolds},
  title = {Massive Social Network Analysis: Mining Twitter for
                  Social Good},
  booktitle = {39th International Conference on Parallel Processing
                  ({ICPP})},
  role = {proceedings},
  tags = {parallel; graph},
  year = {2010},
  address = {San Diego, CA},
  month = sep,
  dom = 16,
  acc-note = {(70/225 papers accepted: 31.1\% acceptance rate)},
  doi = {10.1109/ICPP.2010.66},
  file = {material/ICPP10-GraphCT.pdf},
  abstract = {Social networks produce an enormous quantity of
                  data.  Facebook consists of over 400 million active
                  users sharing over 5 \emph{billion} pieces of
                  information each month.  Analyzing this vast
                  quantity of unstructured data presents challenges
                  for software and hardware.  We present GraphCT, a
                  \emph{Graph} \emph{C}haracterization \emph{T}oolkit
                  for massive graphs representing social network data.
                  On a 128-processor Cray XMT, GraphCT estimates the
                  betweenness centrality of an artificially generated
                  (R-MAT) 537 million vertex, 8.6 billion edge graph
                  in 55 minutes.  We use GraphCT to analyze public
                  data from Twitter, a microblogging network.
                  Twitter's message connections appear primarily
                  tree-structured as a news dissemination system.
                  Within the public data, however, are clusters of
                  conversations.  Using GraphCT, we can rank actors
                  within these conversations and help analysts focus
                  attention on a much smaller data subset.},
  projtag = {cassmt, intel-sting}
}
@incollection{rwp10,
  author = {E. Jason Riedy},
  editor = {Dana Martin Guthrie},
  booktitle = {Read Write Poem NaPoWriMo Anthology},
  title = {Here, on the farthest point of the peninsula},
  publisher = {\href{http://issuu.com}{issuu.com}},
  year = {2010},
  month = sep,
  dom = {15},
  pages = {86},
  url = {http://issuu.com/readwritepoem/docs/read_write_poem_napowrimo_anthology},
  myurl = {http://lovesgoodfood.com/jason/posts/post-0099/},
  role = {poetry},
  tags = {beach; napowrimo; poetry; rwp}
}
@inproceedings{nsfaccelws10,
  file = {material/nsf-workshop-socnet.pdf},
  author = {Jason Riedy and David Bader and David Ediger},
  ejr-withauthor = {David Bader and David Ediger},
  title = {Applications in Social Networks},
  booktitle = {NSF Workshop on Accelerators for Data-Intensive
                  Applications},
  dom = {13},
  year = {2010},
  month = oct,
  role = {presentation},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/nsf-workshop-socnet.pdf},
  tags = {graph; NSF; streaming},
  projtag = {intel-sting, cassmt}
}
@unpublished{graph500-1.1,
  author = {David A. Bader and Jonathan Berry and Simon Kahan
                  and Richard Murphy and E. Jason Riedy and Jeremiah
                  Willcock},
  ejr-withauthor = {David A. Bader and Jonathan Berry and Simon Kahan
                  and Richard Murphy and Jeremiah Willcock},
  title = {Graph 500 Benchmark 1 (``Search'')},
  note = {Version 1.1},
  url = {http://www.graph500.org/Specifications.html},
  month = oct,
  year = 2010,
  projtag = {cassmt, percs}
}
@unpublished{nsf-accel-workshop,
  ejr-withauthor = {workshop participants},
  editor = {Viktor K. Prasanna and David A. Bader},
  key = {Report on NSF Workshop on Center Scale Activities Related to Accelerators for Data Intensive Applications},
  title = {{Report on NSF Workshop on Center Scale Activities Related to Accelerators for Data Intensive Applications}},
  note = {This workshop is supported by NSF Grant Number
                  1051537, in response to the Call for Exploratory
                  Workshop Proposals for Scientific Software
                  Innovation Institutes (S2I2).},
  dom = {31},
  month = oct,
  year = 2010
}
@inproceedings{ppopp11-graph,
  author = {David A. Bader and David Ediger and E. Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title = {Parallel Programming for Graph Analysis},
  booktitle = {16th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming (PPoPP)},
  role = {tutorial},
  tags = {parallel; graph},
  year = 2011,
  month = feb,
  dom = 12,
  address = {San Antonio, TX},
  url = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2011.html},
  abstract = {An increasingly fast-paced, digital world has produced
                  an ever-growing volume of petabyte-sized
                  datasets. At the same time, terabytes of new,
                  unstructured data arrive daily. As the desire to ask
                  more detailed questions about these massive streams
                  has grown, parallel software and hardware have only
                  recently begun to enable complex analytics in this
                  non-scientific space.

                  In this tutorial, we will discuss the open problems
                  facing us with analyzing this ``data deluge''. We will present
                  algorithms and data structures capable of analyzing
                  spatio-temporal data at massive scale on parallel
                  systems. We will try to understand the difficulties
                  and bottlenecks in parallel graph algorithm design
                  on current systems and will show how multithreaded
                  and hybrid systems can overcome these challenges. We
                  will demonstrate how parallel graph algorithms can
                  be implemented on a variety of architectures using
                  different programming models. 

                  The goal of this tutorial is to provide a
                  comprehensive introduction to the field of parallel
                  graph analysis to an audience with computing
                  background, interested in participating in research
                  and/or commercial applications of this
                  field. Moreover, we will cover leading-edge
                  technical and algorithmic developments in the field
                  and discuss open problems and potential solutions.},
  projtag = {cassmt}
}
@techreport{seed-set-tr,
  author = {Jason Riedy and David A. Bader and Karl Jiang and
                  Pushkar Pande and Richa Sharma},
  ejr-withauthor = {David A. Bader and Karl Jiang and Pushkar Pande
                  and Richa Sharma},
  title = {Detecting Communities from Given Seeds in Social
                  Networks},
  institution = {Georgia Institute of Technology},
  year = 2011,
  number = {GT-CSE-11-01},
  month = feb,
  dom = 22,
  file = {material/GT-CSE-11-01.pdf},
  url = {http://hdl.handle.net/1853/36980},
  role = {techreport},
  abstract = {Analyzing massive social networks challenges both
                  high-performance computers and human
                  understanding. These massive networks cannot be
                  visualized easily, and their scale makes applying
                  complex analysis methods computationally
                  expensive. We present a region-growing method for
                  finding a smaller, more tractable subgraph, a
                  community, given a few example seed vertices. Unlike
                  existing work, we focus on a small number of seed
                  vertices, from two to a few dozen. We also present
                  the first comparison between five algorithms for
                  expanding a small seed set into a community. Our
                  comparison applies these algorithms to an R-MAT
                  generated graph component with 240 thousand vertices
                  and 32 million edges and evaluates the community
                  size, modularity, Kullback-Leibler divergence,
                  conductance, and clustering coefficient. We find
                  that our new algorithm with a local modularity
                  maximizing heuristic based on Clauset, Newman, and
                  Moore performs very well when the output is limited
                  to 100 or 1000 vertices. When run without a vertex
                  size limit, a heuristic from McCloskey and Bader
                  generates communities containing around 60\% of the
                  graph's vertices and having a small conductance and
                  modularity appropriate to the result size. A
                  personalized PageRank algorithm based on Andersen,
                  Lang, and Chung also performs well with respect to
                  our metrics.},
  tags = {graph; social network},
  projtag = {cassmt}
}
@incollection{smallstone10,
  author = {Jason Riedy},
  editor = {Fiona Robyn and Kaspalita},
  booktitle = {pay attention: a river of stones},
  title = {The storm's coming when the chickens spread out},
  publisher = {\href{http://lulu.com}{lulu.com}},
  year = {2011},
  month = mar,
  dom = {2},
  pages = 77,
  myurl = {http://lovesgoodfood.com/jason/posts/river-of-stones-7/},
  url = {http://www.lulu.com/product/file-download/pay-attention-a-river-of-stones/15057101},
  role = {poetry},
  tags = {poetry; aros; riverofstones}
}
@inproceedings{mtaap11,
  author = {David Ediger and E. Jason Riedy and David A. Bader
                  and Henning Meyerhenke},
  ejr-withauthor = {David Ediger and David A. Bader and Henning
                  Meyerhenke},
  title = {Tracking Structure of Streaming Social Networks},
  booktitle = {5th Workshop on Multithreaded Architectures and
                  Applications (MTAAP)},
  role = {proceedings},
  tags = {parallel; graph; streaming},
  year = 2011,
  month = may,
  abstract = {Current online social networks are massive and still
                  growing.  For example, Facebook has over 500 million
                  active users sharing over 30 billion items per
                  month.  The scale within these data streams has
                  outstripped traditional graph analysis methods.
                  Monitoring requires dynamic analysis rather than
                  repeated static analysis.  The massive state behind
                  multiple persistent queries requires shared data
                  structures and not problem-specific representations.
                  We present a framework based on the STINGER data
                  structure that can monitor a global property,
                  connected components, on a graph of 16 million
                  vertices at rates of up to 240\,000 updates per
                  second on a 32 processor Cray XMT.  For very large
                  scale-free graphs, our implementation uses novel
                  batching techniques that exploit the scale-free
                  nature of the data and run over three times faster
                  than prior methods.  Our framework handles, for the
                  first time, real-world data rates, opening the door
                  to higher-level analytics such as community and
                  anomaly detection.},
  acc-note = {(10/17 papers accepted, 59\% acceptance rate)},
  doi = {10.1109/IPDPS.2011.326},
  file = {material/TrackingComponents-MTAAP11.pdf},
  projtag = {cassmt, intel-sting}
}
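@comment{{Editorial sketch: the entry above tracks connected components
under streaming updates. For edge insertions alone the core bookkeeping is
small; a toy sequential Python version using union-find follows. Names are
hypothetical, and the paper's batching and its handling of deletions need
considerably more machinery than this.

class DisjointSet:
    def __init__(self, n):
        self.parent = list(range(n))

    def find(self, x):
        # Path halving keeps the trees shallow.
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def union(self, a, b):
        ra, rb = self.find(a), self.find(b)
        if ra != rb:
            self.parent[rb] = ra

def apply_insert_batch(components, edge_batch):
    # Fold one batch of inserted edges into the component labeling.
    for u, v in edge_batch:
        components.union(u, v)
}}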
@misc{GraphEx11,
  author = {Jason Riedy and David Ediger and David A. Bader and Henning Meyerhenke},
  ejr-withauthor = {David Ediger and David A. Bader and Henning Meyerhenke},
  title = {Tracking Structure of Streaming Social Networks},
  dom = 9,
  month = aug,
  year = 2011,
  note = {Invited presentation.},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/GraphEx-2011.pdf},
  file = {material/GraphEx-2011.pdf},
  tags = {graph; streaming},
  howpublished = {2011 Graph Exploitation Symposium hosted by MIT Lincoln Laboratory},
  projtag = {intel-sting, cassmt}
}
@misc{intel.graph.2011,
  file = {material/intel-2011-08-09.pdf},
  author = {Jason Riedy and David A. Bader and Henning
                  Meyerhenke and David Ediger and Timothy Mattson},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke and David
                  Ediger and Timothy Mattson},
  title = {{STING}: Spatio-Temporal Interaction Networks and
                  Graphs for {Intel} Platforms},
  howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
  dom = 9,
  month = aug,
  year = 2011,
  role = {presentation},
  url = {http://purl.oclc.org/NET/jason-riedy/resume/material/GT-STING-for-Intel-beamer.pdf},
  projtag = {intel-sting, cassmt}
}
@inproceedings{ppam11,
  author = {E. Jason Riedy and Henning Meyerhenke and David
                  Ediger and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David
                  A. Bader},
  title = {Parallel Community Detection for Massive Graphs},
  booktitle = {9th International Conference on Parallel Processing
                  and Applied Mathematics (PPAM11)},
  year = 2011,
  month = sep,
  publisher = {Springer},
  role = {proceedings},
  tags = {parallel; graph; community detection},
  abstract = {Tackling the current volume of graph-structured data
                  requires parallel tools.  We extend our work on
                  analyzing such massive graph data with the first
                  massively parallel algorithm for community detection
                  that scales to current data sizes, scaling to graphs
                  of over 122 million vertices and nearly 2 billion
                  edges in under 7300 seconds on a massively
                  multithreaded Cray XMT.  Our algorithm achieves
                  moderate parallel scalability without sacrificing
                  sequential operational complexity.  Community
                  detection partitions a graph into subgraphs more
                  densely connected within the subgraph than to the
                  rest of the graph.  We take an agglomerative
                  approach similar to Clauset, Newman, and Moore's
                  sequential algorithm, merging pairs of connected
                  intermediate subgraphs to optimize different graph
                  properties.  Working in parallel opens new
                  approaches to high performance.  On smaller data
                  sets, we find the output's modularity compares well
                  with the standard sequential algorithms.},
  acc-note = {(134/243 papers accepted, 55\% acceptance rate)},
  doi = {10.1007/978-3-642-31464-3\_29},
  file = {material/ppam11-community-detection.pdf},
  projtag = {cassmt, intel-sting}
}
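@comment{{Editorial sketch: the agglomerative step described above scores
pairs of connected communities and merges the winners. A sequential toy
version of the scoring, using the Clauset-Newman-Moore modularity gain, is
below; names and bookkeeping are hypothetical, and the paper's contribution
is performing many such merges concurrently via matchings.

def modularity_gain(m, deg1, deg2, between):
    # Gain from merging two communities in a graph with m edges:
    # between/m - deg1*deg2/(2*m^2), the standard CNM expression.
    return between / m - (deg1 * deg2) / (2.0 * m * m)

def best_merge(m, comm_deg, between_edges):
    # between_edges maps pairs (c1, c2) of connected communities to the
    # count of edges joining them; return the best-scoring pair, if any.
    best, best_gain = None, 0.0
    for (c1, c2), e in between_edges.items():
        gain = modularity_gain(m, comm_deg[c1], comm_deg[c2], e)
        if gain > best_gain:
            best, best_gain = (c1, c2), gain
    return best
}}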
@inproceedings{md11-graph,
  author = {David A. Bader and David Ediger and E. Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title = {Parallel Programming for Graph Analysis},
  booktitle = {full day tutorial},
  role = {tutorial},
  tags = {parallel; graph},
  year = 2011,
  month = sep,
  dom = 28,
  address = {Columbia, MD},
  projtag = {cassmt}
}
@inproceedings{DBLP:conf/dimacs/RiedyMEB12,
  file = {material/dimacs10-proceedings-community-detection.pdf},
  title = {Parallel community detection for massive graphs},
  author = {E. Jason Riedy and Henning Meyerhenke and David
                  Ediger and David A. Bader},
  booktitle = {Graph Partitioning and Graph Clustering},
  year = {2012},
  editor = {David A. Bader and Henning Meyerhenke and Peter
                  Sanders and Dorothea Wagner},
  pages = {207--222},
  publisher = {American Mathematical Society},
  series = {Contemporary Mathematics},
  volume = {588},
  abstract = {Tackling the current volume of graph-structured data
                  requires parallel tools. We extend our work on
                  analyzing such massive graph data with a massively
                  parallel algorithm for community detection that
                  scales to current data sizes, clustering a
                  real-world graph of over 100 million vertices and
                  over 3 billion edges in under 500 seconds on a
                  four-processor Intel E7-8870-based server.  Our
                  algorithm achieves moderate parallel scalability
                  without sacrificing sequential operational
                  complexity.  Community detection partitions a graph
                  into subgraphs more densely connected within the
                  subgraph than to the rest of the graph. We take an
                  agglomerative approach similar to Clauset, Newman,
                  and Moore's sequential algorithm, merging pairs of
                  connected intermediate subgraphs to optimize
                  different graph properties. Working in parallel
                  opens new approaches to high performance. We improve
                  performance of our parallel community detection
                  algorithm on both the Cray XMT2 and OpenMP platforms
                  and adapt our algorithm to the DIMACS Implementation
                  Challenge data set.},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  doi = {10.1090/conm/588/11703},
  ee = {http://www.ams.org/books/conm/588/11703},
  optisbn = {978-0-8218-9038-7, 978-0-8218-9869-7},
  isbn = {978-0-8218-9038-7},
  opttitle = {Graph Partitioning and Graph Clustering - 10th
                  DIMACS Implementation Challenge Workshop, Georgia
                  Institute of Technology, Atlanta, GA, USA, February
                  13-14, 2012. Proceedings},
  url = {http://www.ams.org/books/conm/588/11703/conm588-11703.pdf},
  projtag = {cassmt, intel-sting}
}
@incollection{dimacs10-workshop,
  author = {E. Jason Riedy and Henning Meyerhenke and David
                  Ediger and David A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David
                  A. Bader},
  title = {Parallel Community Detection for Massive Graphs},
  booktitle = {10th DIMACS Implementation Challenge Workshop -
                  Graph Partitioning and Graph Clustering},
  tags = {parallel; graph; community detection},
  publisher = {(workshop paper)},
  year = 2012,
  month = feb,
  dom = 14,
  address = {Atlanta, Georgia},
  note = {Won first place in the Mix Challenge and Mix Pareto
                  Challenge},
  url = {http://www.cc.gatech.edu/dimacs10/papers/%5B15%5D-dimacs10-community-detection.pdf},
  file = {material/dimacs10-community-detection.pdf},
  abstract = {Tackling the current volume of graph-structured data
                  requires parallel tools. We extend our work on
                  analyzing such massive graph data with a massively
                  parallel algorithm for community detection that
                  scales to current data sizes, clustering a
                  real-world graph of over 100 million vertices and
                  over 3 billion edges in under 500 seconds on a
                  four-processor Intel E7-8870-based server. Our algorithm
                  achieves moderate parallel scalability without
                  sacrificing sequential operational complexity.
                  Community detection partitions a graph into
                  subgraphs more densely connected within the subgraph
                  than to the rest of the graph. We take an
                  agglomerative approach similar to Clauset, Newman,
                  and Moore's sequential algorithm, merging pairs of
                  connected intermediate subgraphs to optimize
                  different graph properties. Working in parallel
                  opens new approaches to high performance. We improve
                  performance of our parallel community detection
                  algorithm on both the Cray XMT2 and OpenMP platforms
                  and adapt our algorithm to the DIMACS Implementation
                  Challenge data set.},
  projtag = {cassmt, intel-sting}
}
@inproceedings{ppopp12-graph,
  author = {David Ediger and Jason Riedy and Rob McColl and
                  David A. Bader},
  ejr-withauthor = {David Ediger and Rob McColl and David A. Bader},
  title = {Parallel Programming for Graph Analysis},
  booktitle = {17th ACM SIGPLAN Annual Symposium on Principles and
                  Practice of Parallel Programming (PPoPP)},
  role = {tutorial},
  tags = {parallel; graph},
  year = 2012,
  month = feb,
  dom = 26,
  address = {New Orleans, LA},
  url = {http://www.cc.gatech.edu/~bader/papers/GraphAnalysisTutorial-PPoPP2012.html},
  abstract = {An increasingly fast-paced, digital world has
                  produced an ever-growing volume of petabyte-sized
                  datasets. At the same time, terabytes of new,
                  unstructured data arrive daily. As the desire to ask
                  more detailed questions about these massive streams
                  has grown, parallel software and hardware have only
                  recently begun to enable complex analytics in this
                  non-scientific space.  In this tutorial, we will
                  discuss the open problems facing us with analyzing
                  this ``data deluge''. We will present algorithms and
                  data structures capable of analyzing spatio-temporal
                  data at massive scale on parallel systems. We will
                  try to understand the difficulties and bottlenecks
                  in parallel graph algorithm design on current
                  systems and will show how multithreaded and hybrid
                  systems can overcome these challenges. We will
                  demonstrate how parallel graph algorithms can be
                  implemented on a variety of architectures using
                  different programming models.  The goal of this
                  tutorial is to provide a comprehensive introduction
                  to the field of parallel graph analysis to an
                  audience with computing background, interested in
                  participating in research and/or commercial
                  applications of this field. Moreover, we will cover
                  leading-edge technical and algorithmic developments
                  in the field and discuss open problems and potential
                  solutions.},
  officialproject = {nsf-s2i2-conc},
  projtag = {cassmt}
}
@misc{pp12-streaming-ms,
  file = {material/siam-pp-2012.pdf},
  author = {E. Jason Riedy and Henning Meyerhenke},
  ejr-withauthor = {Henning Meyerhenke},
  title = {Scalable Algorithms for Analysis of Massive,
                  Streaming Graphs},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {15},
  month = feb,
  year = {2012},
  note = {Minisymposium organizer with Henning Meyerhenke.},
  url = {http://www.slideshare.net/jasonriedy/siam-pp-2012-scalable-algorithms-for-analysis-of-massive-streaming-graphs},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  address = {Savannah, GA},
  abstract = {Graph-structured data in social networks, finance,
                  network security, and others not only are massive
                  but also under continual change. These changes often
                  are scattered across the graph. Repeating complex
                  global analyses on massive snapshots to capture only
                  what has changed is inefficient. We discuss analysis
                  algorithms for streaming graph data that maintain
                  both local and global metrics. We extract
                  parallelism from both analysis kernel and graph data
                  to scale performance to real-world sizes.},
  projtag = {intel-sting, cassmt}
}
@misc{pp12-community-ms,
  author = {Henning Meyerhenke and E. Jason Riedy and David
                  A. Bader},
  ejr-withauthor = {Henning Meyerhenke and David A. Bader},
  title = {Parallel Community Detection in Streaming Graphs},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {15},
  month = feb,
  year = {2012},
  note = {Minisymposium organizer.},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  address = {Savannah, GA},
  projtag = {intel-sting, cassmt}
}
@misc{pp12-GraphCT,
  author = {David Ediger and E. Jason Riedy and Henning
                  Meyerhenke and David A. Bader},
  ejr-withauthor = {David Ediger and Henning Meyerhenke and David
                  A. Bader},
  title = {Analyzing Massive Networks with GraphCT},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {16},
  month = feb,
  year = {2012},
  role = {poster},
  tags = {siam; parallel algorithms},
  address = {Savannah, GA},
  projtag = {intel-sting, cassmt}
}
@misc{pp12-STING,
  file = {material/siam-pp12-stinger-poster.pdf},
  author = {E. Jason Riedy and David Ediger and Henning
                  Meyerhenke and David A. Bader},
  ejr-withauthor = {David Ediger and Henning Meyerhenke and David
                  A. Bader},
  title = {{STING}: Software for Analysis of Spatio-Temporal
                  Interaction Networks and Graphs},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = {16},
  month = feb,
  year = {2012},
  role = {poster},
  tags = {siam; parallel algorithms},
  address = {Savannah, GA},
  abstract = {Current tools for analyzing graph-structured data
                  and semantic networks focus on static graphs. Our
                  STING package tackles analysis of streaming graphs
                  like today's social networks and communication
                  tools. STING maintains a massive graph under changes
                  while coordinating analysis kernels to achieve
                  analysis at real-world data rates. We show examples
                  of local metrics like clustering coefficients and
                  global metrics like connected components and
                  agglomerative clustering. STING supports parallel
                  Intel architectures as well as the Cray XMT.},
  projtag = {intel-sting, cassmt}
}
@incollection{icassp2012-stinger,
  author = {Jason Riedy and Henning Meyerhenke and David
                  A. Bader and David Ediger and Timothy G. Mattson},
  ejr-withauthor = {Henning Meyerhenke and David Ediger and David
                  A. Bader and Timothy G. Mattson},
  booktitle = {{IEEE} International Conference on Acoustics, Speech
                  and Signal Processing ({ICASSP})},
  title = {Analysis of Streaming Social Networks and Graphs on
                  Multicore Architectures},
  year = {2012},
  month = mar,
  dom = 29,
  address = {Kyoto, Japan},
  file = {material/icassp2012.pdf},
  slide-url = {http://ur1.ca/i6dz6},
  doi = {10.1109/ICASSP.2012.6289126},
  opturl = {http://www.slideshare.net/jasonriedy/icassp-2012-analysis-of-streaming-social-networks-and-graphs-on-multicore-architectures},
  abstract = {Analyzing static snapshots of massive,
                  graph-structured data cannot keep pace with the
                  growth of social networks, financial transactions,
                  and other valuable data sources.  We introduce a
                  framework, STING (Spatio-Temporal Interaction
                  Networks and Graphs), and evaluate its performance
                  on multicore, multisocket Intel(R)-based platforms.
                  STING achieves rates of around 100\,000 edge updates
                  per second on large, dynamic graphs with a single,
                  general data structure.  We achieve speed-ups of up
                  to 1000$\times$ over parallel static computation,
                  improve monitoring of a dynamic graph's connected
                  components, and show that an exact algorithm for
                  maintaining local clustering coefficients performs
                  better on Intel-based platforms than our earlier
                  approximate algorithm.},
  projtag = {cassmt, intel-sting}
}
@inproceedings{mtaap12,
  author = {E. Jason Riedy and David A. Bader and Henning
                  Meyerhenke},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke},
  title = {Scalable Multi-threaded Community Detection in
                  Social Networks},
  booktitle = {6th Workshop on Multithreaded Architectures and
                  Applications (MTAAP)},
  role = {proceedings},
  tags = {parallel; graph; community detection},
  year = 2012,
  month = may,
  dom = 25,
  abstract = {The volume of existing graph-structured data
                  requires improved parallel tools and algorithms.
                  Finding communities, smaller subgraphs more densely
                  connected within the subgraph than to the rest of
                  the graph, plays a role both in developing new
                  parallel algorithms as well as opening smaller
                  portions of the data to current analysis tools.  We
                  improve performance of our parallel community
                  detection algorithm by 20\% on the massively
                  multithreaded Cray XMT, evaluate its performance on
                  the next-generation Cray XMT2, and extend its reach
                  to Intel-based platforms with OpenMP.  To our
                  knowledge, not only is this the first massively
                  parallel community detection algorithm but also the
                  only such algorithm that achieves excellent
                  performance and good parallel scalability across all
                  these platforms.  Our implementation analyzes a
                  moderate-sized graph with 105 million vertices and
                  3.3 billion edges in around 500 seconds on a
                  four-processor, 80-logical-core Intel-based system and
                  1100 seconds on a 64-processor Cray XMT2.},
  acc-note = {(9/15 papers accepted, 60\% acceptance)},
  slide-url = {http://www.slideshare.net/jasonriedy/mtaap12-scalable-community-detection},
  doi = {10.1109/IPDPSW.2012.203},
  file = {material/mtaap12.pdf},
  projtag = {cassmt, intel-sting}
}
@misc{intel.graph.2012,
  file = {material/intel-2012-07-12.pdf},
  author = {Jason Riedy and David A. Bader and David Ediger and
                  Rob McColl and Timothy G. Mattson},
  ejr-withauthor = {David A. Bader and David Ediger and Rob McColl and
                  Timothy G. Mattson},
  title = {{STING}: Spatio-Temporal Interaction Networks and
                  Graphs for {Intel} Platforms},
  howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
  dom = 24,
  month = jul,
  year = 2012,
  role = {presentation},
  url = {http://www.slideshare.net/jasonriedy/gt-stingintelslides},
  projtag = {intel-sting}
}
@misc{an12-streaming-ms,
  file = {material/siam-an-2012.pdf},
  author = {David A. Bader and David Ediger and Jason Riedy},
  ejr-withauthor = {David A. Bader and David Ediger},
  title = {Streaming Graph Analytics for Massive Graphs},
  howpublished = {SIAM Annual Meeting},
  dom = 10,
  month = jul,
  year = {2012},
  url = {http://www.slideshare.net/jasonriedy/streaming-graph-analytics-for-massive-graphs},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  address = {Minneapolis, MN},
  abstract = {Emerging real-world graph problems include detecting
                  community structure in large social networks,
                  improving the resilience of the electric power grid,
                  and detecting and preventing disease in human
                  populations. The volume and richness of data
                  combined with its rate of change renders monitoring
                  properties at scale by static recomputation
                  infeasible. We approach these problems with massive,
                  fine-grained parallelism across different shared
                  memory architectures both to compute solutions and
                  to explore the sensitivity of these solutions to
                  natural bias and omissions within the data.},
  projtag = {intel-sting, cassmt}
}
@inproceedings{stinger-hpec12,
  author = {David Ediger and Robert McColl and Jason Riedy and
                  David A. Bader},
  ejr-withauthor = {David Ediger and Robert McColl and David A. Bader},
  title = {{STINGER}: High Performance Data Structure for
                  Streaming Graphs},
  tags = {parallel; graph; streaming},
  booktitle = {The IEEE High Performance Extreme Computing
                  Conference (HPEC)},
  year = 2012,
  month = sep,
  address = {Waltham, MA},
  note = {Best paper award},
  dom = 12,
  role = {proceedings},
  doi = {10.1109/HPEC.2012.6408680},
  file = {material/hpec12-stinger.pdf},
  abstract = {The current research focus on ``big data'' problems
                  highlights the scale and complexity of analytics
                  required and the high rate at which data may be
                  changing. In this paper, we present our high
                  performance, scalable and portable software,
                  Spatio-Temporal Interaction Networks and Graphs
                  Extensible Representation (STINGER), that includes a
                  graph data structure that enables these
                  applications. Key attributes of STINGER are fast
                  insertions, deletions, and updates on semantic
                  graphs with skewed degree distributions. We
                  demonstrate a process of algorithmic and
                  architectural optimizations that enable high
                  performance on the Cray XMT family and Intel
                  multicore servers. Our implementation of STINGER on
                  the Cray XMT processes over 3 million updates per
                  second on a scale-free graph with 537 million
                  edges.},
  projtag = {cassmt, intel-sting}
}
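@comment{{Editorial sketch: STINGER's headline property above is fast
updates on graphs with skewed degree distributions. The general flavor,
fixed-capacity edge blocks chained per vertex so an update touches one
block instead of reallocating an adjacency array, can be shown in a few
lines of Python. This illustrates the concept only; it is not STINGER's
actual layout, and all names are hypothetical.

class EdgeBlock:
    # Fixed-capacity block of destination ids; None marks an empty slot.
    def __init__(self, capacity=16):
        self.dest = [None] * capacity
        self.next = None

class BlockedAdjacency:
    def __init__(self, nv):
        self.head = [None] * nv

    def insert_edge(self, u, v):
        # Skip if present, then fill the first empty slot; append a new
        # block only when every slot in the chain is taken.
        blk = self.head[u]
        while blk is not None:
            if v in blk.dest:
                return
            blk = blk.next
        blk, prev = self.head[u], None
        while blk is not None:
            for i, d in enumerate(blk.dest):
                if d is None:
                    blk.dest[i] = v
                    return
            prev, blk = blk, blk.next
        fresh = EdgeBlock()
        fresh.dest[0] = v
        if prev is None:
            self.head[u] = fresh
        else:
            prev.next = fresh

    def delete_edge(self, u, v):
        # Deletion empties a slot in place; no compaction required.
        blk = self.head[u]
        while blk is not None:
            for i, d in enumerate(blk.dest):
                if d == v:
                    blk.dest[i] = None
                    return
            blk = blk.next
}}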
@misc{hpcs-panel-2012,
  optkey = {},
  author = {Lauren L. Smith and Dolores A. Shaffer},
  title = {{DARPA}'s {H}igh {P}roductivity {C}omputing
                  {S}ystems Program: A Final Report},
  howpublished = {Supercomputing Birds-of-a-Feather session},
  dom = 14,
  month = nov,
  year = 2012,
  note = {Invited panel speaker},
  abstract = {The DARPA High Productivity Computing Systems (HPCS)
                  program has been focused on providing a new
                  generation of economically viable high productivity
                  computing systems for national security, scientific,
                  industrial and commercial applications. This program
                  was unique because it focused on system productivity
                  that was defined to include enhancing performance,
                  programmability, portability, usability,
                  manageability, and robustness of systems, as opposed
                  to focusing on a single execution-time performance
                  metric. The BOF is for anyone interested
                  in learning about the two HPCS systems and how
                  productivity in High Performance Computing has been
                  enhanced.},
  optannote = {},
  projtag = {percs, cassmt}
}
@misc{siamcse13-largescalegraph,
  author = {David A. Bader and Henning Meyerhenke and Jason
                  Riedy},
  ejr-withauthor = {David A. Bader and Henning Meyerhenke},
  title = {Applications and Challenges in Large-scale Graph
                  Analysis},
  howpublished = {SIAM Conference on Computational Science and
                  Engineering},
  month = feb,
  year = 2013,
  address = {Boston, MA},
  role = {presentation},
  tags = {siam; parallel algorithms},
  url = {http://www.graphanalysis.org/SIAM-CSE13/01_Bader.pdf},
  abstract = {Emerging real-world graph problems include detecting
                  community structure in large social networks,
                  improving the resilience of the electric power grid,
                  and detecting and preventing disease in human
                  populations. We discuss the opportunities and
                  challenges in massive data-intensive computing for
                  applications in social network analysis, genomics,
                  and security. The explosion of real-world graph data
                  poses substantial challenges for software, hardware,
                  algorithms, and application experts.},
  projtag = {intel-sting, cassmt, grateful}
}
@misc{siamcse13-streaminggraph,
  file = {material/cse2013-streaming.pdf},
  author = {Robert C. McColl and David Ediger and David A. Bader
                  and Jason Riedy},
  ejr-withauthor = {Robert C. McColl and David Ediger and David
                  A. Bader},
  title = {Analyzing Graph Structure in Streaming Data with
                  {STINGER}},
  howpublished = {SIAM Conference on Computational Science and
                  Engineering},
  month = feb,
  year = 2013,
  address = {Boston, MA},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  abstract = {Analyzing static snapshots of massive,
                  graph-structured data cannot keep pace with the
                  growth of social networks, financial transactions,
                  and other valuable data sources. Our software
                  framework, STING (Spatio-Temporal Interaction
                  Networks and Graphs), uses a scalable,
                  high-performance graph data structure to enable
                  these applications. STING supports fast insertions,
                  deletions, and updates on graphs with semantic
                  information and skewed degree distributions. STING
                  achieves large speed-ups over parallel, static
                  recomputation on both common multicore and
                  specialized multithreaded platforms.},
  projtag = {intel-sting, cassmt, grateful}
}
@inproceedings{mtaap13,
  file = {material/mtaap13-streaming-community-monitoring.pdf},
  author = {E. Jason Riedy and David A. Bader},
  ejr-withauthor = {David A. Bader},
  title = {Multithreaded Community Monitoring for Massive
                  Streaming Graph Data},
  booktitle = {7th Workshop on Multithreaded Architectures and
                  Applications (MTAAP)},
  role = {proceedings},
  tags = {parallel; graph; streaming; community detection},
  year = 2013,
  month = may,
  dom = 24,
  doi = {10.1109/IPDPSW.2013.229},
  address = {Boston, MA},
  acc-note = {(11/16 papers accepted, 69\% acceptance)},
  abstract = {Analyzing static snapshots of massive,
                  graph-structured data cannot keep pace with the
                  growth of social networks, financial transactions,
                  and other valuable data sources.  Current
                  state-of-the-art industrial methods analyze these
                  streaming sources using only simple, aggregate
                  metrics.  There are few existing scalable algorithms
                  for monitoring complex global quantities like
                  decomposition into community structure.  Using our
                  framework STING, we present the first known parallel
                  algorithm specifically for monitoring communities in
                  this massive, streaming, graph-structured data.  Our
                  algorithm performs incremental re-agglomeration
                  rather than starting from scratch after each batch
                  of changes, reducing the problem's size to that of
                  the change rather than the entire graph.  We analyze
                  our initial implementation's performance on
                  multithreaded platforms for execution time and
                  latency.  On an Intel-based multithreaded platform,
                  our algorithm handles up to 100 million updates per
                  second on social networks with one to 30 million
                  edges, providing a speed-up from 4$\times$ to
                  3700$\times$ over statically recomputing the
                  decomposition after each batch of changes.  Possibly
                  because of our artificial graph generator, resulting
                  communities' modularity varies little from the
                  initial graph.},
  projtag = {xscala, grateful, cassmt, intel-sting}
}
@unpublished{s2i2-ipdps-2013,
  author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna
                  and Manish Parashar and David Bader and Jason Riedy
                  and Richard Vuduc},
  title = {Report on ``Workshop on Accelerating Bioinformatics
                  Applications Enabled by NextGen-Sequencing''},
  address = {Boston, MA},
  dom = 19,
  month = may,
  year = 2013,
  note = {Co-located with IPDPS 2013},
  url = {http://future-compute.usc.edu/index.php/NGS_Workshop},
  officialproject = {nsf-s2i2-conc},
  projtag = {xscala}
}
@incollection{GraphCT-Wiley-Chap,
  file = {material/STINGER-with-apps.pdf},
  title = {Computational Graph Analytics for Massive Streaming
                  Data},
  author = {David Ediger and Jason Riedy and David A. Bader and
                  Henning Meyerhenke},
  booktitle = {Large Scale Network-Centric Computing Systems},
  publisher = {Wiley},
  year = {2013},
  chapter = {25},
  editor = {Hamid Sarbazi-Azad and Albert Zomaya},
  month = jul,
  series = {Parallel and Distributed Computing},
  abstract = {Handling the constant stream of data from health
                  care, security, business, and social network
                  applications requires new algorithms and data
                  structures.  We present a new approach for parallel
                  massive analysis of streaming, temporal,
                  graph-structured data.  For this purpose we examine
                  data structure and algorithm trade-offs that extract
                  the parallelism necessary for high-performance
                  updating analysis of massive graphs. As a result of
                  this study, we propose the extensible and flexible
                  data structure for massive graphs called STINGER
                  ({S}patio-{T}emporal {I}nteraction {N}etworks and
                  {G}raphs {E}xtensible {R}epresentation).  Two case
                  studies demonstrate our new approach's
                  effectiveness.  The first one computes a dynamic
                  graph's vertices' \emph{clustering coefficients}.
                  We show that incremental updates are far more
                  efficient than global recomputation.  Within this
                  kernel, we compare three methods for dynamically
                  updating local clustering coefficients: a
                  brute-force local recalculation, a sorting
                  algorithm, and our new approximation method using a
                  Bloom filter.  On 32 processors of a \xmt{} with a
                  synthetic scale-free graph of $2^{24} \approx 16$
                  million vertices and $2^{29} \approx 537$ million
                  edges, the brute-force method processes a mean of
                  over 50\,000 updates per second, while our Bloom
                  filter approaches 200\,000 updates per second.  The
                  second case study monitors a global feature, a
                  dynamic graph's connected components.  We use
                  similar algorithmic ideas as before to exploit the
                  parallelism in the problem and that provided by the
                  hardware architecture.  On a 16 million vertex
                  graph, we obtain rates of up to 240\,000 updates per
                  second on 32 processors of a \xmt{}.  For the large
                  scale-free graphs typical in our applications, our
                  implementation uses novel batching techniques that
                  exploit the scale-free nature of the data and run
                  over three times faster than prior methods.  Our new
                  framework is the first to handle real-world data
                  rates, opening the door to higher-level analytics
                  such as community and anomaly detection.},
  dom = {30},
  ejr-withauthor = {David Ediger and David A. Bader and Henning
                  Meyerhenke},
  isbn = {978-0470936887},
  role = {chapter},
  tags = {parallel; graph; streaming},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6674282&tag=1},
  projtag = {cassmt, xscala, intel-sting}
}
@misc{graphlab-2013,
  file = {material/graphlab13-poster.pdf},
  author = {Jason Riedy},
  title = {{STINGER}: Analyzing massive, streaming graphs},
  howpublished = {2nd GraphLab Workshop},
  month = jul,
  dom = 1,
  year = 2013,
  address = {San Francisco, CA},
  role = {invited poster and demo},
  tags = {graph analysis; parallel algorithms},
  projtag = {intel-sting, cassmt, xscala, grateful}
}
@article{graphct-tpds-2012,
  author = {David Ediger and Karl Jiang and Jason Riedy and
                  David A. Bader},
  ejr-withauthor = {David Ediger and Karl Jiang and David A. Bader},
  title = {{GraphCT}: Multithreaded Algorithms for Massive
                  Graph Analysis},
  journal = {{IEEE} Transactions on Parallel and Distributed
                  Systems},
  year = 2013,
  pages = {2220--2229},
  month = sep,
  doi = {10.1109/TPDS.2012.323},
  issn = {1045-9219},
  opturl = {http://dx.doi.org/10.1109/TPDS.2012.323},
  role = {refereed},
  gt-role = {Assisted in software development, experimental
                  methods, writing, and editing.},
  abstract = {The digital world has given rise to massive
                  quantities of data that include rich semantic and
                  complex networks. A social graph, for example,
                  containing hundreds of millions of actors and tens
                  of billions of relationships is not
                  uncommon. Analyzing these large data sets, even to
                  answer simple analytic queries, often pushes the
                  limits of algorithms and machine architectures. We
                  present GraphCT, a scalable framework for graph
                  analysis using parallel and multithreaded algorithms
                  on shared memory platforms. Utilizing the unique
                  characteristics of the Cray XMT, GraphCT enables
                  fast network analysis at unprecedented scales on a
                  variety of input data sets. On a synthetic power law
                  graph with 2 billion vertices and 17 billion edges,
                  we can find the connected components in 2
                  minutes. We can estimate the betweenness centrality
                  of a similar graph with 537 million vertices and
                  over 8 billion edges in under 1 hour. GraphCT is
                  built for portability and performance.},
  file = {material/GraphCT-IEEE.pdf},
  projtag = {cassmt}
}
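@comment{{Editorial sketch: the GraphCT entry above estimates betweenness
centrality on massive graphs. Estimation commonly uses Brandes-style
dependency accumulation from a sample of source vertices; a sequential toy
version for an unweighted, undirected graph follows (hypothetical names,
not GraphCT's multithreaded code). With all vertices as sources it is
exact; with a sample it estimates.

from collections import deque

def betweenness_sample(adj, sources):
    # adj: symmetric adjacency lists; sources: sampled root vertices.
    n = len(adj)
    bc = [0.0] * n
    for s in sources:
        dist = [-1] * n
        sigma = [0] * n        # shortest-path counts from s
        delta = [0.0] * n      # dependency accumulation
        order = []
        dist[s], sigma[s] = 0, 1
        q = deque([s])
        while q:
            v = q.popleft()
            order.append(v)
            for w in adj[v]:
                if dist[w] < 0:
                    dist[w] = dist[v] + 1
                    q.append(w)
                if dist[w] == dist[v] + 1:
                    sigma[w] += sigma[v]
        for w in reversed(order):
            for v in adj[w]:
                if dist[v] == dist[w] - 1:
                    delta[v] += sigma[v] / sigma[w] * (1.0 + delta[w])
            if w != s:
                bc[w] += delta[w]
    return bc
}}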
@unpublished{s2i2-acmbcb-2013,
  author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna
                  and Manish Parashar and David Bader and Jason Riedy
                  and Richard Vuduc},
  title = {Report on ``Workshop on Challenges in accelerating Next-Gen Sequencing ({NGS}) bioinformatics''},
  address = {Washington, DC},
  dom = 25,
  month = sep,
  year = 2013,
  note = {In conjunction with ACM-BCB 2013},
  url = {http://future-compute.usc.edu/index.php/NGS_Bioinformatics_Workshop},
  officialproject = {nsf-s2i2-conc},
  projtag = {xscala}
}
@inproceedings{wssspe1,
  author = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna
                  and Manish Parashar and Jason Riedy and David Bader
                  and Richard Vuduc},
  ejr-withauthor = {Shel Swenson and Yogesh Simmhan and Viktor Prasanna
                  and Manish Parashar and David Bader and Richard
                  Vuduc},
  title = {Sustainable Software Development for Next-Gen
                  Sequencing (NGS) Bioinformatics on Emerging
                  Platforms},
  booktitle = {First Workshop on Sustainable Software for Science:
                  Practice and Experiences (WSSSPE1)},
  year = 2013,
  month = nov,
  dom = 17,
  address = {Denver, CO},
  note = {held in conjunction with SC13, published
                  electronically
                  (\url{http://wssspe.researchcomputing.org.uk/})},
  url = {http://arxiv.org/abs/1309.1828},
  file = {material/wssspe13.pdf},
  abstract = {DNA sequence analysis is fundamental to life science
                  research. The rapid development of next generation
                  sequencing (NGS) technologies, and the richness and
                  diversity of applications it makes feasible, have
                  created an enormous gulf between the potential of
                  this technology and the development of computational
                  methods to realize this potential. Bridging this gap
                  holds possibilities for broad impacts toward
                  multiple grand challenges and offers unprecedented
                  opportunities for software innovation and
                  research. We argue that NGS-enabled applications
                  need a critical mass of sustainable software to
                  benefit from emerging computing platforms'
                  transformative potential. Accumulating the necessary
                  critical mass will require leaders in computational
                  biology, bioinformatics, computer science, and
                  computer engineering to work together to identify core
                  opportunity areas, critical software infrastructure,
                  and software sustainability challenges. Furthermore,
                  due to the quickly changing nature of both
                  bioinformatics software and accelerator technology,
                  we conclude that creating sustainable accelerated
                  bioinformatics software means constructing a
                  sustainable bridge between the two fields. In
                  particular, sustained collaboration between domain
                  developers and technology experts is needed to
                  develop the accelerated kernels, libraries,
                  frameworks and middleware that could provide the
                  needed flexible link from NGS bioinformatics
                  applications to emerging platforms.},
  officialproject = {nsf-s2i2-conc},
  projtag = {xscala}
}
@misc{intel.graph.2014,
  file = {material/intel-2014-01-17.pdf},
  author = {Jason Riedy and David A. Bader and David Ediger and
                  Rob McColl and Timothy G. Mattson},
  ejr-withauthor = {David A. Bader and David Ediger and Rob McColl and
                  Timothy G. Mattson},
  title = {{STING}: Spatio-Temporal Interaction Networks and
                  Graphs for {Intel} Platforms},
  howpublished = {Presentation at Intel Corporation, Santa Clara, CA},
  dom = 17,
  month = jan,
  year = 2014,
  role = {presentation},
  url = {http://www.slideshare.net/jasonriedy/intel-20140117},
  projtag = {intel-sting}
}
@inproceedings{stinger-gabb2014,
  file = {material/gabb-2014-pres.pdf},
  author = {Jason Riedy and David A. Bader},
  title = {{STINGER}: Multi-threaded Graph Streaming},
  dom = 19,
  booktitle = {{Graph} {Algorithms} {Building} {Blocks} ({GABB}
                  2014)},
  year = 2014,
  month = may,
  address = {Phoenix, AZ},
  note = {Invited presentation and panelist. (Workshop with
                  IPDPS 2014)},
  tags = {parallel; graph; streaming},
  url = {http://www.slideshare.net/jasonriedy/stinger-multithreaded-graph-streaming},
  projtag = {intel-sting, xscala, hpda, percs}
}
@misc{graphlab-2014,
  file = {material/graphlab14-poster.pdf},
  author = {Jason Riedy},
  title = {{STINGER}: Analyzing massive, streaming graphs},
  howpublished = {3rd GraphLab Workshop},
  month = jul,
  dom = 21,
  year = 2014,
  address = {San Francisco, CA},
  role = {invited poster and demo},
  tags = {graph analysis; parallel algorithms},
  projtag = {intel-sting, hpda, xscala, grateful}
}
@inproceedings{bc-hpec14,
  author = {Adam McLaughlin and Jason Riedy and David A. Bader},
  ejr-withauthor = {Adam McLaughlin and David A. Bader},
  title = {Optimizing Energy Consumption and Parallel
                  Performance for Betweenness Centrality using {GPU}s},
  tags = {parallel; graph; energy},
  booktitle = {The IEEE High Performance Extreme Computing
                  Conference (HPEC)},
  year = 2014,
  month = sep,
  address = {Waltham, MA},
  note = {``Rising Stars'' section},
  dom = 11,
  role = {proceedings},
  doi = {10.1109/HPEC.2014.7040980},
  file = {material/Optimizing_BC_HPEC14.pdf},
  abstract = {Applications of high-performance graph analysis
                  range from computational biology to network security
                  and even transportation.  These applications often
                  consider graphs under rapid change and are moving
                  beyond HPC platforms into energy-constrained
                  embedded systems.  This paper optimizes one
                  successful and demanding analysis kernel,
                  betweenness centrality, for NVIDIA GPU accelerators
                  in both environments.  Our algorithm for static
                  analysis is capable of exceeding 2 million traversed
                  edges per second per watt (MTEPS/W).  Optimizing the
                  parallel algorithm and treating the dynamic problem
                  directly achieves a 6.39$\times$ average speed-up
                  and 84\% average reduction in energy consumption.},
  projtag = {xscala, grateful, hpda}
}
@inproceedings{cmg.2014,
  file = {material/cmg-2014-11-04.pdf},
  author = {Jason Riedy and David A. Bader},
  title = {Graph Analysis Trends and Opportunities},
  booktitle = {CMG Performance and Capacity},
  year = 2014,
  month = nov,
  dom = 4,
  address = {Atlanta, GA},
  note = {Invited presentation},
  tags = {parallel; graph; streaming},
  url = {http://www.slideshare.net/jasonriedy/cmg-20141104},
  abstract = {High-performance graph analysis is unlocking
                  knowledge in problems like anomaly detection in
                  computer security, community structure in social
                  networks, and many other data integration
                  areas. While graphs provide a convenient
                  abstraction, real-world problems' sparsity and lack
                  of locality challenge current systems. This talk
                  will cover current trends ranging from massive
                  scales to low-power, low-latency systems and
                  summarize opportunities and directions for graphs
                  and computing systems.},
  projtag = {hpda, grateful, xscala, intel-sting}
}
@misc{mng-sdm15,
  file = {material/riedy-network-challenge-sdm15.pdf},
  author = {Jason Riedy},
  title = {Network Challenge: Error and Sensitivity Analysis},
  howpublished = {SDM-Networks 2015: The Second SDM Workshop on Mining Networks and Graphs: A Big Data Analytic Challenge},
  dom = 2,
  month = may,
  year = 2015,
  note = {Invited panelist},
  address = {Vancouver, BC},
  tags = {graph analysis},
  url = {http://www.slideshare.net/jasonriedy/network-challenge-error-and-sensitivity-analysis},
  projtag = {xscala, grateful, hpda}
}
@inproceedings{hpec15,
  author = {Adam McLaughlin and Jason Riedy and David A. Bader},
  title = {An Energy-Efficient Abstraction for Simultaneous
                  Breadth-First Searches},
  ejr-withauthor = {Adam McLaughlin and David A. Bader},
  tags = {parallel; graph; energy},
  booktitle = {The IEEE High Performance Extreme Computing
                  Conference (HPEC)},
  year = 2015,
  month = sep,
  address = {Waltham, MA},
  dom = 17,
  role = {proceedings},
  abstract = {Optimized GPU kernels are sufficiently complicated
                  to write that they often are specialized to specific
                  input data, target architectures, or applications.
                  This paper presents a multi-search abstraction for
                  computing multiple breadth-first searches in
                  parallel and demonstrates a high-performance,
                  general implementation.  Our abstraction removes the
                  burden of orchestrating graph traversal from the
                  user while providing high performance and low energy
                  usage, an often overlooked component of algorithm
                  design.  Energy consumption has become a first-class
                  hardware design constraint for both massive and
                  embedded computing platforms.  Our abstraction can
                  be applied to such problems as the all-pairs
                  shortest-path problem, community detection,
                  reachability querying, and others.  To map graph
                  traversal efficiently to NVIDIA GPUs, our hybrid
                  implementation chooses between processing active
                  vertices with a single thread or an entire warp
                  based on vertex outdegree.  For a set of twelve
                  varied graphs, the implementation of our abstraction
                  saves 42\% time and 62\% energy on average compared
                  to representative implementations of specific
                  applications from existing literature.},
  file = {material/multi_search_energy.pdf},
  projtag = {xscala, grateful, hpda}
}
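@comment{{Editorial sketch: the multi-search abstraction above amortizes one
graph traversal across many breadth-first searches. One compact way to see
the sharing is to keep, per vertex, a bitmask of the searches that have
reached it and to propagate whole masks along each inspected edge. The
sequential Python below illustrates only that idea (hypothetical names);
the paper's GPU implementation additionally chooses a thread-versus-warp
strategy per vertex by outdegree.

def multi_bfs(adj, sources):
    # adj: list of adjacency lists; sources: one root per search.
    # reached[v] is a bitmask: bit i set means search i has visited v.
    reached = [0] * len(adj)
    frontier = {}
    for i, s in enumerate(sources):
        frontier[s] = frontier.get(s, 0) | (1 << i)
    while frontier:
        for v, mask in frontier.items():
            reached[v] |= mask
        nxt = {}
        for v, mask in frontier.items():
            for w in adj[v]:
                fresh = mask & ~reached[w]  # searches newly reaching w
                if fresh:
                    nxt[w] = nxt.get(w, 0) | fresh
        frontier = nxt
    return reached
}}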
@misc{dmml-2015,
  file = {material/dmml-2015-ejr.pdf},
  author = {E. Jason Riedy},
  title = {Graph Analysis Beyond Linear Algebra},
  howpublished = {Development of Modern Methods for Linear Algebra},
  month = oct,
  dom = 24,
  year = {2015},
  abstract = {High-performance graph analysis is unlocking
                  knowledge in computer security, bioinformatics,
                  social networks, and many other data integration
                  areas. Graphs provide a convenient abstraction for
                  many data problems beyond linear algebra. Some
                  problems map directly to linear algebra. Others,
                  like community detection, look eerily similar to
                  sparse linear algebra techniques. And then there are
                  algorithms that strongly resist attempts at making
                  them look like linear algebra. This talk will cover
                  recent results with an emphasis on streaming graph
                  problems where the graph changes and results need
                  to be updated with minimal latency. We’ll also touch on
                  issues of sensitivity and reliability where graph
                  analysis needs to learn from numerical analysis and
                  linear algebra.},
  note = {Invited presentation},
  url = {http://www.slideshare.net/jasonriedy/graph-analysis-beyond-linear-algebra},
  projtag = {xscala, grateful, hpda, lapack}
}
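
As a concrete instance of the abstract's claim that some graph problems map directly to linear algebra: one level of breadth-first search is a sparse matrix-vector product over the Boolean (OR, AND) semiring. The sketch below is an illustration, not material from the talk; it emulates that semiring with ordinary arithmetic followed by a nonzero test, using scipy.sparse, and the toy graph is an assumption.

import numpy as np
import scipy.sparse as sp

# Toy directed graph: edge (u, v) becomes A[v, u] = 1 so that A times a
# frontier indicator vector pushes the frontier forward along edges.
edges = [(0, 1), (1, 2), (1, 3), (3, 4)]
n = 5
rows = [v for (u, v) in edges]
cols = [u for (u, v) in edges]
A = sp.csr_matrix((np.ones(len(edges)), (rows, cols)), shape=(n, n))

level = np.full(n, -1)
frontier = np.zeros(n, dtype=bool)
frontier[0] = True                    # BFS from vertex 0
level[0] = 0
d = 0
while frontier.any():
    d += 1
    # One BFS level: SpMV plus a nonzero test emulates the OR-AND semiring.
    reached = A.dot(frontier.astype(float)) > 0
    frontier = reached & (level < 0)
    level[frontier] = d
# level is now [0, 1, 2, 2, 3]
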
@inproceedings{caa16,
  author = {David Bader and Aleksandra Michalewicz and Oded
                  Green and Jessie Birkett-Rees and Jason Riedy and
                  James Fairbanks and Anita Zakrzewska},
  title = {Semantic database applications at the Samtavro
                  Cemetery, Georgia},
  booktitle = {The 44th Computer Applications and Quantitative
                  Methods in Archaeology Conference ({CAA})},
  year = 2016,
  month = mar,
  dom = 30,
  address = {Oslo, Norway},
  abstract = {In 2013 a paper was offered to the CAA concerning archaeological legacy data and semantic database applications, with some preliminary results for a study conducted into the Samtavro cemetery, situated in the South Caucasus in the modern republic of Georgia. The present paper presents further research outcomes of data mining the Samtavro material. Over four thousand graves were excavated at this site, used most intensively during the Late Bronze and Iron Ages, and later in the Roman and Late Antique periods. The current project focuses on the latter period—and the legacy of Soviet and post-Soviet excavations—in a collaborative effort between computer scientists based at the Georgia Institute of Technology, USA, and archaeologists at the University of Melbourne and Monash University, Australia. 

Data for 1075 tombs, 1249 individuals, and 5842 grave accoutrements were collected across 74 data fields, resulting in the identification of 9 tomb types, 37 artefact types and 320 artefact subtypes. Methods tested against the Samtavro material culture included the application of clustering techniques to understand associations of related items based on patterns of co-occurrence, using traditional data mining (hierarchical link clustering) and spectral graph theory—focusing on tomb types in relation to artefact types. The other method calculated the probability of each event occurring and compared it to what we would expect if these were truly random—focusing on artefact types in relation to biological sex and age brackets.

In some instances, our work confirmed previously established relationships, but it likewise revealed new results concerning particular entities. The project demonstrates that although sites for which comprehensive archival records exist can benefit from these types of approaches, often the greatest limitation in taking a ‘big data’ approach is the relative scarcity of archaeological data.},
  projtag = {xscala}
}
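
The abstract's second method, comparing observed event counts against what randomness would predict, can be illustrated under a simple independence assumption. The exact test used in the paper is not specified in the abstract, so the expected-count formula and the toy data below are hypothetical stand-ins.

from itertools import combinations
from collections import Counter

# Hypothetical toy data: each tomb is a set of artefact types.
tombs = [{"bead", "pin"}, {"bead", "vessel"}, {"bead", "pin"}, {"vessel"}]
n = len(tombs)
freq = Counter(t for tomb in tombs for t in tomb)     # tombs containing each type
pairs = Counter(frozenset(p) for tomb in tombs
                for p in combinations(sorted(tomb), 2))

for pair, observed in pairs.items():
    a, b = sorted(pair)
    expected = freq[a] * freq[b] / n   # expectation if types occur independently
    print(f"{a}+{b}: observed {observed}, expected {expected:.2f}")
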
@misc{pp16-streaming-ms,
  author = {E. Jason Riedy and David A. Bader},
  ejr-withauthor = {David A. Bader},
  title = {Scalable Network Analysis: Tools, Algorithms,
                  Applications},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = 15,
  month = apr,
  year = 2016,
  note = {Minisymposium organizer with Henning Meyerhenke and David A. Bader.},
  url = {http://www.slideshare.net/jasonriedy/scalable-and-efficient-algorithms-for-analysis-of-massive-streaming-graphs-60975076},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  address = {Paris, France},
  abstract = {Graph analysis provides tools for analyzing the
                  irregular data sets common in health informatics,
                  computational biology, climate science, sociology,
                  security, finance, and many other fields. These
                  graphs possess different structures than typical
                  finite element meshes. Scaling graph analysis to the
                  scales of data being gathered and created has
                  spawned many directions of exciting new research.
                  This minisymposium includes talks on massive graph
                  generation for testing and evaluating parallel
                  algorithms, novel streaming techniques, and parallel
                  graph algorithms for new and existing problems. It
                  also covers existing parallel frameworks and
                  interdisciplinary applications, e.g. the analysis of
                  climate networks.},
  projtag = {hpda, grateful, xscala}
}
@inproceedings{gabb16-pr,
  author = {Jason Riedy},
  title = {Updating PageRank for Streaming Graphs},
  booktitle = {{Graph} {Algorithms} {Building} {Blocks} ({GABB}
                  2016)},
  year = 2016,
  dom = 23,
  month = may,
  address = {Chicago, IL},
  note = {(Workshop with IPDPS 2016)},
  tags = {parallel; graph; streaming; pagerank},
  abstract = {Incremental graph algorithms can respond quickly to
                  small changes in massive graphs by updating rather
                  than recomputing analysis metrics.  Here we use the
                  linear system formulation of PageRank and ideas from
                  iterative refinement to compute the update to a
                  PageRank vector accurately and quickly.  The core
                  idea is to express the residual of the original
                  solution with respect to the updated matrix
                  representing the graph.  The update to the residual
                  is sparse.  Solving for the solution update with a
                  straightforward iterative method spreads the change
                  outward from the change locations but converges
                  before traversing the entire graph.  We achieve
                  speed-ups of 2$\times$ to over 40$\times$ relative
                  to a restarted, highly parallel PageRank iteration
                  for small, low-latency batches of edge insertions.
                  These cases traverse 2$\times$ to nearly
                  10\,000$\times$ fewer edges than the restarted
                  PageRank iteration.  This provides an interesting
                  test case for the ongoing GraphBLAS effort: Can the
                  APIs support our incremental algorithms cleanly and
                  efficiently?},
  file = {material/streaming-pagerank-gabb2016.pdf},
  projtag = {grateful, hpda, xscala}
}
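
The update scheme in this abstract fits in a few lines. Below is a minimal dense sketch, assuming the common linear-system formulation (I - alpha A^T D^{-1}) x = ((1 - alpha)/n) e and a Jacobi-style solve for the correction; the actual implementation keeps the residual and the correction sparse so work stays near the changed edges rather than touching the whole graph.

import numpy as np

def pagerank_update(W_new, x_old, alpha=0.85, tol=1e-10, maxit=1000):
    """W_new: ndarray standing in for A^T D^{-1} of the *updated* graph.
    Returns x_new solving (I - alpha W_new) x = ((1 - alpha)/n) e,
    starting from the stale solution x_old."""
    n = len(x_old)
    b = (1.0 - alpha) / n * np.ones(n)
    # Residual of the stale solution under the new matrix; in the
    # streaming setting it is nonzero only near the changed edges.
    r = b - (x_old - alpha * W_new.dot(x_old))
    dx = np.zeros(n)
    for _ in range(maxit):
        dx_new = r + alpha * W_new.dot(dx)  # Jacobi sweep on (I - alpha W) dx = r
        if np.abs(dx_new - dx).max() < tol:
            dx = dx_new
            break
        dx = dx_new
    return x_old + dx
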
@inproceedings{pmma16-fpaddre,
  author = {{Dukhan}, Marat and {Vuduc}, Richard and {Riedy},
                  Jason},
  title = {Wanted: Floating-Point Add Round-off Error
                  Instruction},
  booktitle = {The 2nd International Workshop on Performance
                  Modeling: Methods and Applications ({PMMA16})},
  year = 2016,
  month = jun,
  dom = 23,
  address = {Frankfurt, Germany},
  note = {(Workshop with ISC High Performance)},
  abstract = {We propose a new instruction (FPADDRE) that computes
                  the round-off error in floating-point addition. We
                  explain how this instruction benefits high-precision
                  arithmetic operations in applications where double
                  precision is not sufficient. Performance estimates
                  on Intel Haswell, Intel Skylake, and AMD Steamroller
                  processors, as well as Intel Knights Corner
                  co-processor, demonstrate that such an instruction
                  would improve the latency of double-double addition
                  by up to 55\% and increase double-double addition
                  throughput by up to 103\%, with smaller, but
                  non-negligible benefits for double-double
                  multiplication. The new instruction delivers up to
                  2$\times$ speedups on three benchmarks that use
                  high-precision floating-point arithmetic:
                  double-double matrix-matrix multiplication,
                  compensated dot product, and polynomial evaluation
                  via the compensated Horner scheme.},
  url = {http://arxiv.org/abs/1603.00491},
  eprint = {arXiv:1603.00491},
  primaryclass = {cs.NA},
  projtag = {ieee754, grateful, lapack, xscala}
}
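
In software, the quantity FPADDRE would return in a single instruction is computed by the classic TwoSum error-free transformation, which costs six dependent additions; that cost is exactly what the proposed instruction eliminates. The sketch below shows TwoSum and a simplified double-double addition built on it; this is the textbook construction, not the paper's benchmark code. Python floats are IEEE-754 doubles, so the transformation is exact here.

def two_sum(a, b):
    """Error-free transformation (Knuth): returns (s, e) with
    s = fl(a + b) and a + b == s + e exactly."""
    s = a + b
    bb = s - a               # the part of b that made it into s
    e = (a - (s - bb)) + (b - bb)
    return s, e

def dd_add(x, y):
    """Simplified ("sloppy") double-double addition built on TwoSum;
    a value is a pair (hi, lo) carrying roughly 106 significand bits."""
    s, e = two_sum(x[0], y[0])
    e += x[1] + y[1]
    return two_sum(s, e)     # renormalize into (hi, lo)

print(two_sum(1.0, 1e-20))  # -> (1.0, 1e-20): the rounding error is recovered
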
@article{Holder:2016:CFC:2980765.2980770,
  author = {Holder, Lawrence B. and Caceres, Rajmonda and
                  Gleich, David F. and Riedy, Jason and Khan, Maleq
                  and Chawla, Nitesh V. and Kumar, Ravi and Wu,
                  Yinghui and Klymko, Christine and Eliassi-Rad, Tina
                  and Prakash, Aditya},
  title = {Current and Future Challenges in Mining Large
                  Networks: Report on the Second SDM Workshop on
                  Mining Networks and Graphs},
  journal = {SIGKDD Explorations Newsletter},
  issue_date = {June 2016},
  volume = {18},
  number = {1},
  month = aug,
  year = {2016},
  issn = {1931-0145},
  pages = {39--45},
  numpages = {7},
  doi = {10.1145/2980765.2980770},
  acmid = {2980770},
  publisher = {ACM},
  address = {New York, NY, USA},
  keywords = {Network mining, big data, challenges, graph mining},
  projtag = {grateful, hpda}
}
@misc{blas-ng-feb-2017,
  author = {James Demmel and Greg Henry and Xiaoye Li and Jason Riedy and Peter Tang},
  title = {A Proposal for a Next-Generation BLAS},
  howpublished = {Workshop on Batched, Reproducible, and Reduced Precision BLAS},
  month = feb,
  year = 2017,
  dom = 24,
  address = {Atlanta, Georgia},
  url = {http://www.netlib.org/utk/people/JackDongarra/WEB-PAGES/Batched-BLAS-2017/talk05-demmel.pdf},
  projtag = {lapack, ieee754, xscala}
}
@misc{cse17-streaming-ms,
  author = {E. Jason Riedy},
  title = {High-Performance Analysis of Streaming Graphs},
  howpublished = {SIAM Conference on Computational Science and Engineering},
  dom = 2,
  month = mar,
  year = 2017,
  note = {Minisymposium organizer with Henning Meyerhenke.},
  url = {https://www.slideshare.net/jasonriedy/highperformance-analysis-of-streaming-graphs},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  address = {Atlanta, GA},
  abstract = {Graph-structured data in social networks, finance,
                  network security, and others not only are massive
                  but also under continual change. These changes often
                  are scattered across the graph. Stopping the world
                  to run a single, static query is
                  infeasible. Repeating complex global analyses on
                  massive snapshots to capture only what has changed
                  is inefficient. We discuss requirements for
                  single-shot queries on changing graphs as well as
                  recent high-performance algorithms that update
                  rather than recompute results. These algorithms are
                  incorporated into our software framework for
                  streaming graph analysis, STING (Spatio-Temporal
                  Interaction Networks and Graphs).},
  projtag = {hpda, grateful, memory-centric, xscala, crnch-rg}
}
@misc{acs-2017,
  author = {Jason Riedy},
  title = {High-Performance Analysis of Streaming Graphs},
  howpublished = {HPC Analytic Workshop},
  month = jun,
  year = 2017,
  dom = 28,
  address = {Hanover, MD},
  url = {https://www.slideshare.net/jasonriedy/highperformance-analysis-of-streaming-graphs-77348572},
  projtag = {hpda, grateful, memory-centric, crnch-rg}
}
@inproceedings{siamns17-ejr,
  author = {E. Jason Riedy and Chunxing Yin and David A. Bader},
  title = {A New Algorithm Model for Massive-Scale Streaming
                  Graph Analysis},
  booktitle = {SIAM Workshop on Network Science},
  year = 2017,
  month = jul,
  dom = 14,
  address = {Pittsburgh, PA},
  url = {https://www.slideshare.net/jasonriedy/a-new-algorithm-model-for-massivescale-streaming-graph-analysis},
  projtag = {hpda, memory-centric, grateful, crnch-rg}
}
@article{localcomm-2017,
  author = {Eisha Nathan and Anita Zakrzewska and Jason Riedy
                  and David A. Bader},
  title = {Local Community Detection in Dynamic Graphs Using
                  Personalized Centrality},
  journal = {Algorithms},
  year = 2017,
  month = aug,
  abstract = {Analyzing massive graphs poses challenges due to the
                  vast amount of data available.  Extracting smaller
                  relevant subgraphs allows for further visualization
                  and analysis that would otherwise be too
                  computationally intensive.  Furthermore, many real
                  data sets are constantly changing, and require
                  algorithms to update as the graph evolves.  This
                  work addresses the topic of local community
                  detection, or seed set expansion, using personalized
                  centrality measures, specifically PageRank and Katz
                  centrality.  We present a method to efficiently
                  update local communities in dynamic graphs. By
                  updating the personalized ranking vectors, we can
                  incrementally update the corresponding local
                  community.  Applying our methods on real-world
                  graphs, we are able to obtain speedups of up to
                  60$\times$ compared to static recomputation while
                  maintaining an average recall of 0.94 of the highly
                  ranked vertices returned.  Next, we investigate how
                  approximations of a centrality vector affect the
                  resulting local community. Specifically, our method
                  guarantees that the vertices returned in the
                  community are the highly ranked vertices from a
                  personalized centrality metric.},
  issn = {1999-4893},
  doi = {10.3390/a10030102},
  volume = 10,
  number = 3,
  article_number = {102},
  projtag = {grateful, hpda}
}
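
A static baseline for the seed-set expansion described above: compute a personalized PageRank vector restarted at the seed and take the top-ranked vertices as the local community. The sketch shows only this baseline; the paper's contribution, updating the ranking vector incrementally as the graph changes, is not reproduced here, and the toy graph and parameters are assumptions.

import numpy as np

def personalized_pagerank(adj, seed, alpha=0.85, iters=100):
    """adj: list of neighbor lists; returns the PPR vector restarted at seed."""
    n = len(adj)
    v = np.zeros(n)
    v[seed] = 1.0                       # restart distribution on the seed
    deg = np.array([max(len(nb), 1) for nb in adj], dtype=float)
    x = v.copy()
    for _ in range(iters):
        x_new = (1.0 - alpha) * v
        for u, nbrs in enumerate(adj):  # distribute rank along out-edges
            share = alpha * x[u] / deg[u]
            for w in nbrs:
                x_new[w] += share
        x = x_new
    return x

adj = [[1, 2], [0, 2], [0, 1, 3], [2, 4], [3]]  # toy undirected graph
ppr = personalized_pagerank(adj, seed=0)
community = np.argsort(-ppr)[:3]                # top-k vertices as the community
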
@misc{ieeecluster2017,
  author = {Eisha Nathan and Anita Zakrzewska and Chunxing Yin and Jason Riedy},
  title = {A New Direction for Streaming Graph Analysis},
  howpublished = {IEEE Cluster},
  month = sep,
  dom = 6,
  year = 2017,
  address = {Honolulu, HI},
  tags = {graph analysis; parallel algorithms},
  projtag = {hpda, memory-centric, grateful, crnch-rg}
}
@misc{sc17-blas-bof,
  author = {E. Jason Riedy and Greg Henry and James Demmel and Mark Gates and Xiaoye S. Li and Ping Tak P. Tang},
  title = {A Proposal for a Next-Generation BLAS},
  howpublished = {Batched, Reproducible, and Reduced Precision BLAS Birds-of-a-Feather at the International Conference for High Performance Computing, Networking, Storage and Analysis},
  month = nov,
  year = 2017,
  url = {http://icl.utk.edu/bblas/sc17/files/bblas-sc17-riedy.pdf},
  projtag = {lapack, ieee754, xscala}
}
@unpublished{ngblas-doc,
  author = {James Demmel and Mark Gates and Greg Henry and Xiaoye S. Li and
                  Jason Riedy and P.T. Peter Tang},
  title = {A Proposal for a Next-Generation BLAS},
  note = {(living document, being updated)},
  month = nov,
  year = 2017,
  url = {https://goo.gl/hvDu3d},
  projtag = {lapack}
}
@misc{pp18-ejr,
  author = {Jason Riedy},
  title = {Graph Analysis: New Algorithm Models, New
                  Architectures},
  howpublished = {SIAM Parallel Processing for Scientific Computing},
  dom = 8,
  month = mar,
  year = 2018,
  note = {Minisymposium organizer with Oded Green and David
                  A. Bader.},
  role = {presentation},
  tags = {siam; streaming data; parallel algorithms},
  address = {Tokyo, Japan},
  projtag = {hpda, memory-centric, crnch-rg}
}
@inproceedings{mlg2018_23,
  title = {A New Algorithmic Model for Graph Analysis of Streaming Data},
  author = {Chunxing Yin and Jason Riedy},
  booktitle = {Proceedings of the 14th International Workshop on Mining and Learning with Graphs ({MLG})},
  month = may,
  year = 2018,
  url = {http://www.mlgworkshop.org/2018/papers/MLG2018_paper_23.pdf}
}
@inproceedings{ashes2018-ejr,
  author = {Eric Hein and Tom Conte and Jeffrey S. Young and
                  Srinivas Eswar and Jiajia Li and Patrick Lavin and
                  Richard Vuduc and Jason Riedy},
  title = {An Initial Characterization of the {Emu} {Chick}},
  booktitle = {The Eighth International Workshop on Accelerators
                  and Hybrid Exascale Systems ({AsHES})},
  role = {proceedings},
  tags = {parallel; graph; streaming; community detection},
  year = 2018,
  month = may,
  doi = {10.1109/IPDPSW.2018.00097},
  pages = {579--588},
  isbn = 9781538655559,
  abstract = {The Emu Chick is a prototype system designed around
                  the concept of migratory memory-side
                  processing. Rather than transferring large amounts
                  of data across power-hungry, high-latency
                  interconnects, the Emu Chick moves lightweight
                  thread contexts to near-memory cores before the
                  beginning of each memory read. The current prototype
                  hardware uses FPGAs to implement cache-less
                  "Gossamer" cores for doing computational work and a
                  stationary core to run basic operating system
                  functions and migrate threads between nodes. In this
                  initial characterization of the Emu Chick, we study
                  the memory bandwidth characteristics of the system
                  through benchmarks like STREAM, pointer chasing, and
                  sparse matrix vector multiply. We compare the Emu
                  Chick hardware to architectural simulation and Intel
                  Xeon-based platforms. While it is difficult to
                  accurately compare prototype hardware with existing
                  systems, our initial evaluation demonstrates that
                  the Emu Chick uses available memory bandwidth more
                  efficiently than a more traditional, cache-based
                  architecture. Moreover, the Emu Chick provides
                  stable, predictable performance with 80\% bandwidth
                  utilization on a random-access pointer chasing
                  benchmark with weak locality.},
  keywords = {Instruction sets; Bandwidth; Computer architecture;
                  Benchmark testing; Hardware; Prototypes; Kernel;
                  benchmarking; streaming graphs; computer architecture;
                  sparse tensors; emu},
  projtag = {memory-centric, crnch-rg}
}
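
The pointer-chasing benchmark named in this abstract builds a chain of dependent loads through a random cyclic permutation, so each access must complete before the next address is known. Below is a sketch of that access pattern only; real characterizations of hardware like the Emu Chick use compiled code and hardware timers, so the Python timing here is illustrative, not comparable.

import random, time

n = 1 << 20
perm = list(range(n))
random.shuffle(perm)                  # random visiting order over all slots
nxt = [0] * n
for i in range(n):
    nxt[perm[i]] = perm[(i + 1) % n]  # one cycle threading every slot

t0 = time.perf_counter()
p = perm[0]
for _ in range(n):                    # each step depends on the last (no ILP)
    p = nxt[p]
t1 = time.perf_counter()
print(f"{(t1 - t0) / n * 1e9:.1f} ns per dependent access")
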
@inproceedings{cf18-ejr,
  author = {Jason Riedy},
  title = {Streaming Graph Analysis: New Models, New
                  Architectures},
  booktitle = {{ACM} International Conference on Computing
                  Frontiers},
  year = 2018,
  month = may,
  dom = 9,
  note = {Invited paper and talk.},
  url = {https://www.slideshare.net/jasonriedy/graph-analysis-new-algorithm-models-new-architectures},
  projtag = {hpda, xscala, grateful}
}
@inproceedings{arith18-ejr,
  author = {Jason Riedy and James Demmel},
  title = {Augmented Arithmetic Operations Proposed for
                  {IEEE}-754 2018},
  booktitle = {25th {IEEE} Symposium on Computer Arithmetic
                  ({ARITH} 25)},
  year = {2018},
  dom = 26,
  month = jun,
  optdoi = {10.1109/ARITH.2018.xx},
  projtag = {ieee754, lapack, grateful, xscala}
}
@inproceedings{arith18-754,
  author = {Jason Riedy},
  title = {Plans for IEEE Standard 754-2028},
  booktitle = {25th {IEEE} Symposium on Computer Arithmetic
                  ({ARITH} 25)},
  year = {2018},
  dom = 26,
  month = jun,
  optdoi = {10.1109/ARITH.2018.xx},
  projtag = {xscala, grateful, ieee754, memory-centric}
}
@techreport{1808.06334,
  author = {Will Powell and Jason Riedy and Jeffrey S. Young},
  title = {Wrangling Rogues: Managing Experimental Post-Moore
                  Architectures},
  year = {2018},
  month = aug,
  eprint = {arXiv:1808.06334},
  archiveprefix = {arXiv},
  abstract = {The Rogues Gallery is a new experimental testbed
                  that is focused on tackling "rogue" architectures
                  for the Post-Moore era of computing. While some of
                  these devices have roots in the embedded and
                  high-performance computing spaces, managing current
                  and emerging technologies poses challenges for
                  system administration that are not always foreseen
                  in traditional data center environments.  We present
                  an overview of the motivations and design of the
                  initial Rogues Gallery testbed and cover some of the
                  unique challenges that we have seen and foresee with
                  upcoming hardware prototypes for future post-Moore
                  research. Specifically, we cover the networking,
                  identity management, scheduling of resources, and
                  tools and sensor access aspects of the Rogues
                  Gallery and techniques we have developed to manage
                  these new platforms.},
  url = {https://arxiv.org/abs/1808.06334}
}
@misc{mlg18-ejr,
  author = {Chunxing Yin and Jason Riedy and David Bader},
  title = {Validity of Graph Algorithms on Streaming Data},
  howpublished = {14th International Workshop on Mining and Learning with Graphs},
  address = {London, UK},
  month = aug,
  year = 2018,
  note = {(to appear)},
  projtag = {hpda, memory-centric, crnch-rg}
}

This file was generated by bibtex2html 1.99.