@TechReport{it:2003-029,
  author      = {Oskar Grenholm and Zoran Radovic and Erik Hagersten},
  title       = {Latency-hiding and Optimizations of the {DSZOOM} Instrumentation System},
  institution = {Department of Information Technology, Uppsala University},
  department  = {Division of Computer Systems},
  year        = {2003},
  number      = {2003-029},
  month       = may,
  abstract    = {An efficient and robust instrumentation tool (or compiler support) is necessary for an efficient implementation of fine-grain software-based shared memory systems (SW-DSMs). The DSZOOM system, developed by the Uppsala Architecture Research Team (UART) at Uppsala University, is a sequentially consistent fine-grained SW-DSM originally developed using Executable Editing Library (EEL)---a binary modification tool from University of Wisconsin-Madison. In this paper, we identify several weaknesses of this original approach and present a new and simple tool for assembler instrumentation: Sparc Assembler Instrumentation Tool (SAIT). This tool can instrument (modify) a highly optimized assembler output from the compiler for the newest UltraSPARC processors. Currently, the focus of the tool is load-, store-, and load-store-instrumentation. By using the SAIT, we develop and present several low-level instrumentation optimization techniques that significantly improve the performance of the original DSZOOM system. One of the presented techniques is a write permission cache (WPC), a latency-hiding mechanism for memory-store operations, that can lower the instrumentation overheads for some applications (as much as 45\% for LU-cont, running on two nodes with 8 processors each). Finally, we demonstrate that this new DSZOOM system executes faster than the old one for all 13 applications studied, from the SPLASH-2 benchmark suite. Execution time improvement factors range from 1.07 to 2.82 (average 1.73). }
}