@phdthesis{itlic:2004-002,
  author     = {Markus Nord{\'e}n},
  title      = {Parallel {PDE} Solvers on cc-{NUMA} Systems},
  type       = {Licentiate thesis},
  school     = {Department of Information Technology, Uppsala University},
  department = {Division of Scientific Computing},
  number     = {2004-002},
  year       = {2004},
  month      = mar,
  note       = {Included papers are available at:
    Paper A: \url{http://www.sciencedirect.com/science/article/B6V06-49W6S4M-1/2/23cb25585b2742595f319f4cedd0b65f},
    Paper C: \url{http://www.it.uu.se/research/publications/lic/2004-002/2004-002-C.pdf},
    Paper D: \url{http://www.it.uu.se/research/publications/reports/2004-006}},
  abstract   = {The current trend in parallel computers is that systems with a
    large shared memory are becoming more and more popular. A shared memory
    system can be either a uniform memory architecture (UMA) or a cache
    coherent non-uniform memory architecture (cc-NUMA).

    In the present thesis, the performance of parallel PDE solvers on cc-NUMA
    computers is studied. In particular, we consider the shared namespace
    programming model, represented by OpenMP. Since the main memory is
    physically, or \emph{geographically} distributed over several
    multi-processor nodes, the latency for local memory accesses is smaller
    than for remote accesses. Therefore, the \emph{geographical locality} of
    the data becomes important.

    The questions posed in this thesis are: (1)~How large is the influence on
    performance of the non-uniformity of the memory system? (2)~How should a
    program be written in order to reduce this influence? (3)~Is it possible
    to introduce optimizations in the computer system for this purpose?

    Most of the application codes studied address the Euler equations using a
    finite difference method and a finite volume method respectively and are
    parallelized with OpenMP. Comparisons are made with an alternative
    implementation using MPI and with PDE solvers implemented with OpenMP
    that solve other equations using different numerical methods.

    The main conclusion is that geographical locality is important for
    performance on cc-NUMA systems. This can be achieved through self
    optimization provided in the system or through migrate-on-next-touch
    directives that could be inserted automatically by the compiler.

    We also conclude that OpenMP is competitive with MPI on cc-NUMA systems
    if care is taken to get a favourable data distribution.},
}