@PhDThesis{itlic:2003-010,
  author     = {Dan Wallin},
  title      = {Exploiting Data Locality in Adaptive Architectures},
  school     = {Department of Information Technology, Uppsala University},
  department = {Division of Computer Systems},
  year       = {2003},
  number     = {2003-010},
  type       = {Licentiate thesis},
  month      = sep,
  abstract   = {The speed of processors increases much faster than the
    memory access time, which makes memory accesses expensive. To address
    this problem, cache hierarchies are introduced to serve the processor
    with data. However, the effectiveness of caches depends on the amount
    of locality in the application's memory access pattern. Programs differ
    greatly in their cache miss characteristics, access patterns and
    communication intensity. A computer built for many different
    computational tasks therefore potentially benefits from dynamically
    adapting to the varying needs of its applications.

    This thesis shows that a cc-NUMA multiprocessor with data migration
    and replication optimizations efficiently exploits the temporal
    locality of algorithms. The performance of the self-optimizing system
    is similar to that of a system with perfect initial thread and data
    placement.

    Data locality optimizations are not free. Coherence protocols with
    large cache lines improve spatial locality but increase false sharing
    misses in many applications. Prefetching techniques that reduce cache
    misses often increase address and data traffic. Several techniques
    introduced in this thesis efficiently avoid these drawbacks. The
    bundling technique reduces the coherence traffic in multiprocessor
    prefetchers, which is especially important in snoop-based systems where
    coherence bandwidth is a scarce resource. Bundled prefetchers reduce
    both the cache miss rate and the coherence traffic compared with
    non-prefetching protocols. The most efficient bundled prefetching
    protocol studied lowers cache misses by 27 percent and address snoops
    by 24 percent on average over all examined applications, relative to a
    non-prefetching protocol. Another proposed technique, capacity
    prefetching, avoids false sharing misses by distinguishing at run time
    between cache lines involved in communication and non-communicating
    cache lines.}
}