@techreport{TR-IC-PFG-21-29,
number = {IC-PFG-21-29},
author = {Victor Ferreira {Ferrari} and Guido Costa Souza de
{Araujo}},
title = {{Improving Convolutions with Tensor Hardware
Accelerators}},
month = {December},
year = {2021},
institution = {Institute of Computing, University of Campinas},
note = {In English, 35 pages.
\par\selectlanguage{english}\textbf{Abstract}
Convolutional Neural Network (CNN) models are among the most
popular choices for deep learning solutions to problems with
huge data sets. Given that CNNs are very computationally
expensive, optimizing convolutions is central to enabling larger
models and reducing inference time. Tensor operations, e.g.
matrix multiplication, have increasingly relied on hardware
accelerators, such as IBM POWER10's MMA engine. \par This work
explores how to exploit MMA and the POWER10 architecture to
improve convolution performance, and proposes a novel algorithm
for the operation, named Convolution Slicing Optimization
(CSO), which tiles the convolution instance into multiple
sub-problems and schedules the resulting tiles so as to minimize
DRAM accesses. After the convolution is tiled, a micro-kernel is
used to increase throughput with the MMA engine. \par To
evaluate the proposed approach, a set of experiments was
performed using a POWER10 CPU, and the results show that CSO is
capable of efficiently tiling the convolution according to a set
of parameters calculated at compile time. Speedups of up to
$229\%$ were achieved when comparing the CSO slicing technique to a
widely used reduction to matrix multiplication.
}
}