% Journal article: Bakhoda, Kim, Aamodt -- ACM TACO, volume 10, 2013.
% NOTE(review): pages = {35} is kept from the original export; it may be a
%   page count or article number rather than a page range -- verify.
% NOTE(review): no issue number or DOI was present in the original record;
%   add number/doi once confirmed against the publisher page.
% NOTE(review): the url host does not look like the publisher's site -- verify.
@article{bakhoda2013designing,
  author    = {Bakhoda, Ali and Kim, John and Aamodt, Tor M.},
  title     = {Designing On-Chip Networks for Throughput Accelerators},
  journal   = {ACM Transactions on Architecture and Code Optimization (TACO)},
  volume    = {10},
  pages     = {35},
  year      = {2013},
  month     = sep,
  publisher = {ACM},
  url       = {http://approjects.co.za/?big=en-us/research/publication/designing-chip-networks-throughput-accelerators/},
  abstract  = {As the number of cores and threads in throughput accelerators
               such as Graphics Processing Units (GPU) increases, so does the
               importance of on-chip interconnection network design. This
               article explores throughput-effective Network-on-Chips (NoC)
               for future compute accelerators that employ Bulk-Synchronous
               Parallel (BSP) programming models such as CUDA and OpenCL. A
               hardware optimization is ``throughput effective'' if it
               improves parallel application-level performance per unit chip
               area. We evaluate performance of future looking workloads
               using detailed closed-loop simulations modeling compute nodes,
               NoC, and the DRAM memory system. We start from a mesh design
               with bisection bandwidth balanced to off-chip demand.
               Accelerator workloads tend to demand high off-chip memory
               bandwidth which results in a many-to-few traffic pattern when
               coupled with expected technology constraints of slow growth in
               pins-per-chip. Leveraging these observations we reduce NoC
               area by proposing a ``checkerboard'' NoC which alternates
               between conventional full routers and half routers with
               limited connectivity. Next, we show that increasing network
               terminal bandwidth at the nodes connected to DRAM controllers
               alleviates a significant fraction of the remaining imbalance
               resulting from the many-to-few traffic pattern. Furthermore,
               we propose a ``double checkerboard inverted'' NoC organization
               which takes advantage of channel slicing to reduce area while
               maintaining the performance improvements of the aforementioned
               techniques. This organization also has a simpler routing
               mechanism and improves average application throughput per unit
               area by 24.3\%.},
}