
Parallel matrix multiplication


Bonita Montero

Nov 4, 2019, 4:49:31 AM
I wrote a little program that does a parallel matrix multiplication.
It multiplies two matrices whose size in megabytes is configurable via
the first command-line parameter. The second command-line parameter is
the limit on the number of threads; if it is omitted, the number of
threads is the number of hardware threads in the system.
I found that, on my Ryzen 7 1800X, the performance is about 2.5% higher
with 64MB matrices when I limit the number of threads to 8, so that each
core is assigned only a single thread by the OS. So cache-thrashing
seems to be an issue here. This isn't a measurement error, since I ran
the code with "start /b /wait /high ...".

So here's the code:

#include <iostream>
#include <vector>
#include <random>
#include <stdexcept>
#include <utility>
#include <thread>
#include <cstdlib>
#include <cmath>

using namespace std;

unsigned threadLimit = 0;

struct Matrix
{
    Matrix() = default;
    Matrix( size_t n );
    Matrix( Matrix const &other );
    Matrix( Matrix &&other );
    double *operator []( size_t i ) const;
    Matrix &operator =( Matrix const &other );
    Matrix &operator =( Matrix &&other );
    size_t dim() const;

private:
    void init( size_t n );
    void copy( Matrix const &other );
    vector<double> m_mtx;
    vector<double *> m_ptrs;
};

inline
Matrix::Matrix( size_t n )
{
    init( n );
}

inline
Matrix::Matrix( Matrix const &other )
{
    copy( other );
}

inline
Matrix::Matrix( Matrix &&other )
{
    m_mtx = move( other.m_mtx );
    m_ptrs = move( other.m_ptrs );
}

inline
double *Matrix::operator []( size_t i ) const
{
    return m_ptrs[i];
}

Matrix &Matrix::operator =( Matrix const &other )
{
    copy( other );
    return *this;
}

inline
Matrix &Matrix::operator =( Matrix &&other )
{
    m_mtx = move( other.m_mtx );
    m_ptrs = move( other.m_ptrs );
    return *this;
}

inline
size_t Matrix::dim() const
{
    return m_ptrs.size();
}

inline
void Matrix::init( size_t n )
{
    m_mtx.resize( n * n, 0.0 );
    m_ptrs.resize( n, nullptr );
    for( size_t i = 0, j = 0; i != n; ++i, j += n )
        m_ptrs[i] = &m_mtx[j];
}

inline
void Matrix::copy( Matrix const &other )
{
    size_t n = other.m_ptrs.size();
    init( n );
    for( size_t i = 0; i != n; ++i )
        for( size_t j = 0; j != n; ++j )
            m_ptrs[i][j] = other.m_ptrs[i][j];
}

Matrix operator *( Matrix const &a, Matrix const &b )
{
    size_t n;
    if( (n = a.dim()) != b.dim() )
        throw invalid_argument( "unequal matrix-dimensions" );
    Matrix mtx( n );
    // each worker computes a contiguous range [begin, end) of result rows
    auto matrixpart = [&a, &b, &mtx, n]( size_t begin, size_t end )
    {
        for( size_t i = begin; i != end; ++i )
            for( size_t j = 0; j != n; ++j )
            {
                mtx[i][j] = 0.0;
                for( size_t k = 0; k != n; ++k )
                    mtx[i][j] += a[i][k] * b[k][j];
            }
    };
    // split the n rows roughly evenly across ::threadLimit workers;
    // the last part is executed on the calling thread
    vector<thread> threads;
    double part = (double)n / ::threadLimit,
           p = 0.0;
    for( unsigned t = 0; ; ++t, p += part )
    {
        size_t begin = (size_t)p,
               end = (size_t)(p + part);
        if( begin == end )
            continue;
        if( t == ::threadLimit - 1 )
        {
            matrixpart( begin, n );
            break;
        }
        threads.emplace_back( matrixpart, begin, end );
    }
    for( thread &thr : threads )
        thr.join();
    return move( mtx );
}

int main( int argc, char **argv )
{
    if( argc < 2 )
        return EXIT_FAILURE;
    // first command-line parameter: size of one matrix in MB (8 bytes per double)
    size_t mtxDim = (size_t)sqrt( (size_t)(unsigned)atoi( argv[1] ) * 1024 * 1024 / 8 );
    Matrix mtxA( mtxDim ),
           mtxB( mtxDim ),
           mtxC;
    cout << "matrix-dimension: " << mtxDim << endl;
    // second parameter (optional): thread-limit, capped at the hardware concurrency
    if( (::threadLimit = thread::hardware_concurrency()) == 0 )
        ::threadLimit = 1;
    if( argc >= 3 )
    {
        unsigned tlParam = (unsigned)atoi( argv[2] );
        tlParam += tlParam == 0;
        if( tlParam < (::threadLimit = thread::hardware_concurrency()) )
            ::threadLimit = tlParam;
    }
    cout << "thread-limit: " << ::threadLimit << endl;
    // fill a matrix with uniform random values, split across threads like the multiplication
    auto fillmatrix = []( Matrix &mtx )
    {
        vector<thread> threads;
        size_t n = mtx.dim();
        auto mtxRndThread = [&mtx, n]( size_t begin, size_t end )
        {
            static thread_local random_device rd;
            static thread_local uniform_real_distribution<double> urd( 0.0, 1.0 );
            for( size_t i = begin; i != end; ++i )
                for( size_t j = 0; j != n; ++j )
                    mtx[i][j] = urd( rd );
        };
        double part = (double)n / ::threadLimit,
               p = 0.0;
        for( unsigned t = 0; ; ++t, p += part )
        {
            size_t begin = (size_t)p,
                   end = (size_t)(p + part);
            if( begin == end )
                continue;
            if( t == ::threadLimit - 1 )
            {
                mtxRndThread( begin, n );
                break;
            }
            threads.emplace_back( mtxRndThread, begin, end );
        }
        for( thread &thr : threads )
            thr.join();
    };
    fillmatrix( mtxA );
    fillmatrix( mtxB );
    mtxC = move( mtxA * mtxB );
    return EXIT_SUCCESS;
}

For the Windows users: here's a little program to measure the runtime of
other applications, similar to the Unix time command:

#include <Windows.h>
#include <string>
#include <cstring>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <iomanip>

using namespace std;

string FormatClockCycles( ULONG64 ul64ClockCycles );

int wmain( int argc, wchar_t **argv )
{
    if( argc < 2 )
        return printf( "can't create process\n" ), EXIT_FAILURE;

    // re-assemble the command line, quoting arguments that contain spaces
    wstring commandline;
    for( size_t i = 1; i != (size_t)argc; commandline += (i != (size_t)argc - 1) ? L" " : L"", ++i )
        if( wcschr( argv[i], L' ' ) == nullptr )
            commandline += argv[i];
        else
            commandline += L"\"",
            commandline += argv[i],
            commandline += L"\"";

    STARTUPINFOW si;
    PROCESS_INFORMATION pi;
    memset( &si, 0, sizeof si );
    si.cb = sizeof si;
    commandline += L'\0';
    if( !CreateProcessW( nullptr, (LPWSTR)commandline.c_str(), nullptr,
        nullptr, FALSE, 0, nullptr, nullptr, &si, &pi ) )
        return printf( "can't create process\n" ), EXIT_FAILURE;
    WaitForSingleObject( pi.hProcess, INFINITE );
    FILETIME creationTime,
             exitTime,
             userFTime,
             sysFTime;
    GetProcessTimes( pi.hProcess, &creationTime, &exitTime, &sysFTime, &userFTime );
    ULONG64 processCycles;
    QueryProcessCycleTime( pi.hProcess, &processCycles );

    // FILETIME values are in 100ns units; divide by 10,000 for milliseconds
    double realTime,
           userTime,
           sysTime;
    realTime = ((LONGLONG &)exitTime - (LONGLONG &)creationTime) / 10'000.0;
    userTime = (LONGLONG &)userFTime / 10'000.0;
    sysTime = (LONGLONG &)sysFTime / 10'000.0;
    string clockCycles = FormatClockCycles( processCycles );
    printf( "real %.2lfms\n", realTime );
    printf( "user %.2lfms\n", userTime );
    printf( "sys %.2lfms\n", sysTime );
    printf( "cycles %s\n", clockCycles.c_str() );

    return EXIT_SUCCESS;
}

string FormatClockCycles( ULONG64 ul64ClockCycles )
{
    string str;
    ostringstream ss;
    // group the decimal digits in threes, from least to most significant,
    // separated by dots
    while( ul64ClockCycles )
    {
        ss.str( "" );
        ss << setw( 3 ) << setfill( '0' ) << (unsigned)(ul64ClockCycles % 1'000u);
        str = ss.str() + (str != "" ? "." : "") + str;
        ul64ClockCycles /= 1'000u;
    }
    return str;
}

I compiled the latter tool to timep.exe. So I tested my code with:
start /b /wait /high timep matrix 64 8
and
start /b /wait /high timep matrix 64 16
to test the code with 8 and 16 threads.
So it would be nice if someone here would test the matrix code on their
SMT-enabled CPU with the full and half the number of threads, to see
whether they get the same thrashing problems.

Paavo Helde

Nov 4, 2019, 7:20:41 AM
On 4.11.2019 11:49, Bonita Montero wrote:

> I wrote a little program that does a parallel matrix-multiplication.
> ...
> for( size_t k = 0; k != n; ++k )
>     mtx[i][j] += a[i][k] * b[k][j];


Before you start parallelizing large matrix multiplication, maybe
you should first learn how to do this efficiently in a single thread.

Hint: consider what happens when each next k in the above innermost loop
misses the various CPU caches.

Hint 2: realize that, written in the above form, this cache thrashing
will inevitably happen if the matrices are large enough.

Hint 3: there is a simple little trick called "transpose" which can fix
this problem (sketched below).
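
A minimal single-threaded sketch of that idea; the names here
(multiply_transposed and the raw row-pointer arrays a, b, c) are just
illustrative placeholders, not the Matrix class from the original post:

#include <cstddef>
#include <vector>

// Build bt = transpose(b) once, then the innermost loop walks both
// a[i][.] and bt[j][.] sequentially instead of striding down a column
// of b, so it reuses the cache lines that were just fetched.
void multiply_transposed( std::size_t n, double **a, double **b, double **c )
{
    std::vector<std::vector<double>> bt( n, std::vector<double>( n ) );
    for( std::size_t k = 0; k != n; ++k )
        for( std::size_t j = 0; j != n; ++j )
            bt[j][k] = b[k][j];
    for( std::size_t i = 0; i != n; ++i )
        for( std::size_t j = 0; j != n; ++j )
        {
            double s = 0.0;
            for( std::size_t k = 0; k != n; ++k )
                s += a[i][k] * bt[j][k];
            c[i][j] = s;
        }
}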

Bonita Montero

Nov 4, 2019, 7:28:19 AM
> Before you start with parallelizing large matrix multiplication, maybe
> you should first learn how to do this efficiently in a single thread.

> Hint: consider what happens when each next k in the above innermost loop
> misses the various CPU caches.

k increments vertically for b[k][j]; this might not be good, but the
rows b[k] will still be in the cache for b[k + 1] ... if n isn't too
large, which should be common.

Bonita Montero

Nov 4, 2019, 9:15:57 AM
>> Before you start with parallelizing large matrix multiplication, maybe
>> you should first learn how to do this efficiently in a single thread.

>> Hint: consider what happens when each next k in the above innermost
>> loop misses the various CPU caches.

Ok, for large matrices where a vertical stride through b doesn't fit in
the L2 cache you're right. When multiplying a 64MB matrix by a 64MB
transposed matrix I get a superlinear speedup of 8.4x with 8 threads
over one thread and a speedup of 14.5x with 16 threads; so that's a
speedup through SMT of 73%!

#include <iostream>
#include <vector>
#include <random>
#include <stdexcept>
#include <utility>
#include <thread>
#include <cstdlib>
#include <cmath>

using namespace std;

unsigned threadLimit = 0;

struct TransposedMatrix;

struct Matrix
{
    Matrix() = default;
    Matrix( size_t n );
    Matrix( Matrix const &other );
    Matrix( Matrix &&other );
    double *operator []( size_t i ) const;
    Matrix &operator =( Matrix const &other );
    Matrix &operator =( Matrix &&other );
    TransposedMatrix &transpose();
    Matrix &resize( size_t n );
    size_t dim() const;

private:
    void init( size_t n );
    void copy( Matrix const &other );
    vector<double> m_mtx;
    vector<double *> m_ptrs;
};

struct TransposedMatrix : public Matrix
{
    using Matrix::Matrix;
    using Matrix::operator [];
    using Matrix::dim;
    TransposedMatrix &operator =( TransposedMatrix const &other );
    TransposedMatrix &operator =( TransposedMatrix &&other );
    TransposedMatrix &resize( size_t n );
};

inline
Matrix::Matrix( size_t n )
{
    init( n );
}

inline
Matrix::Matrix( Matrix const &other )
{
    copy( other );
}

inline
Matrix::Matrix( Matrix &&other )
{
    m_mtx = move( other.m_mtx );
    m_ptrs = move( other.m_ptrs );
}

inline
double *Matrix::operator []( size_t i ) const
{
    return m_ptrs[i];
}

Matrix &Matrix::operator =( Matrix const &other )
{
    copy( other );
    return *this;
}

inline
Matrix &Matrix::operator =( Matrix &&other )
{
    m_mtx = move( other.m_mtx );
    m_ptrs = move( other.m_ptrs );
    return *this;
}

TransposedMatrix &Matrix::transpose()
{
    size_t n = dim();
    for( size_t i = 0; i != n; ++i )
        for( size_t j = 0; j != i; ++j )
            swap( m_ptrs[i][j], m_ptrs[j][i] );
    return *(TransposedMatrix *)this;
}

inline
size_t Matrix::dim() const
{
    return m_ptrs.size();
}

inline
void Matrix::init( size_t n )
{
    m_mtx.resize( n * n, 0.0 );
    m_ptrs.resize( n, nullptr );
    for( size_t i = 0, j = 0; i != n; ++i, j += n )
        m_ptrs[i] = &m_mtx[j];
}

inline
void Matrix::copy( Matrix const &other )
{
    size_t n = other.m_ptrs.size();
    init( n );
    for( size_t i = 0; i != n; ++i )
        for( size_t j = 0; j != n; ++j )
            m_ptrs[i][j] = other.m_ptrs[i][j];
}

Matrix operator *( Matrix const &a, TransposedMatrix const &b )
{
    size_t n;
    if( (n = a.dim()) == 0 )
        return move( Matrix() );
    if( n != b.dim() )
        throw invalid_argument( "unequal matrix-dimensions" );
    Matrix mtx( n );
    auto mtxMultThread = [&a, &b, &mtx, n]( size_t begin, size_t end )
    {
        for( size_t i = begin; i != end; ++i )
            for( size_t j = 0; j != n; ++j )
            {
                mtx[i][j] = 0.0;
                for( size_t k = 0; k != n; ++k )
                    mtx[i][j] += a[i][k] * b[j][k];
            }
    };
    vector<thread> threads;
    double part = (double)n / ::threadLimit,
           p = 0.0;
    for( unsigned t = 0; ; p += part )
    {
        size_t begin = (size_t)p,
               end = (size_t)(p + part);
        if( begin == end )
            continue;
        if( t++ == ::threadLimit - 1 )
        {
            mtxMultThread( begin, n );
            break;
        }
        threads.emplace_back( mtxMultThread, begin, end );
    }
    for( thread &thr : threads )
        thr.join();
    return move( mtx );
}

inline
TransposedMatrix &TransposedMatrix::operator =( TransposedMatrix const &other )
{
    return (TransposedMatrix &)(*(Matrix *)this = (Matrix &)other);
}

inline
TransposedMatrix &TransposedMatrix::operator =( TransposedMatrix &&other )
{
    return (TransposedMatrix &)(*(Matrix *)this = move( (Matrix &)other ));
}

inline
TransposedMatrix &TransposedMatrix::resize( size_t n )
{
    Matrix::resize( n );
    return *this;
}

int main( int argc, char **argv )
{
    if( argc < 2 )
        return EXIT_FAILURE;
    size_t mtxDim = (size_t)sqrt( (size_t)(unsigned)atoi( argv[1] ) * 1024 * 1024 / 8 );
    Matrix mtxA( mtxDim );
    TransposedMatrix tmtxB( mtxDim );
    Matrix mtxC;
    cout << "matrix-dimension: " << mtxDim << endl;
    if( (::threadLimit = thread::hardware_concurrency()) == 0 )
        ::threadLimit = 1;
    if( argc >= 3 )
    {
        unsigned tlParam = (unsigned)atoi( argv[2] );
        tlParam += tlParam == 0;
        if( tlParam < (::threadLimit = thread::hardware_concurrency()) )
            ::threadLimit = tlParam;
    }
    cout << "thread-limit: " << ::threadLimit << endl;
    auto fillmatrix = []( Matrix &mtx )
    {
        size_t n = mtx.dim();
        if( n == 0 )
            return;
        vector<thread> threads;
        auto mtxRndThread = [&mtx, n]( size_t begin, size_t end )
        {
            static thread_local random_device rd;
            static thread_local uniform_real_distribution<double> urd( 0.0, 1.0 );
            for( size_t i = begin; i != end; ++i )
                for( size_t j = 0; j != n; ++j )
                    mtx[i][j] = urd( rd );
        };
        double part = (double)n / ::threadLimit,
               p = 0.0;
        for( unsigned t = 0; ; p += part )
        {
            size_t begin = (size_t)p,
                   end = (size_t)(p + part);
            if( begin == end )
                continue;
            if( t++ == ::threadLimit - 1 )
            {
                mtxRndThread( begin, n );
                break;
            }
            threads.emplace_back( mtxRndThread, begin, end );
        }
        for( thread &thr : threads )
            thr.join();
    };
    fillmatrix( mtxA );
    fillmatrix( tmtxB );
    mtxC = move( mtxA * tmtxB );
    return EXIT_SUCCESS;
}

Bonita Montero

Nov 4, 2019, 9:23:30 AM
And this has an aliasing problem: because the compiler can't prove that
mtx doesn't alias a or b, it has to re-load and re-store mtx[i][j] on
every iteration of the innermost loop:

>     auto   mtxMultThread = [&a, &b, &mtx, n]( size_t begin, size_t end )
>     {
>         for( size_t i = begin; i != end; ++i )
>             for( size_t j = 0; j != n; ++j )
>             {
>                 mtx[i][j] = 0.0;
>                 for( size_t k = 0; k != n; ++k )
>                     mtx[i][j] += a[i][k] * b[j][k];
>             }
>     };

auto mtxMultThread = [&a, &b, &mtx, n]( size_t begin, size_t end )
{
    for( size_t i = begin; i != end; ++i )
        for( size_t j = 0; j != n; ++j )
        {
            double s = 0.0;
            for( size_t k = 0; k != n; ++k )
                s += a[i][k] * b[j][k];
            mtx[i][j] = s;
        }
};

With the changed code, the 64MB benchmark runs more than three times faster.

Christian Gollwitzer

Nov 4, 2019, 10:06:33 AM
On 04.11.19 at 13:20, Paavo Helde wrote:
Hint 4: Others have already solved it. If you download a good BLAS with
optimizations for your system, e.g. OpenBLAS, or the one from Intel, you
may find that a simple call to "dgemm" may well be 1000x faster than the
code you have.
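
For reference, such a call might look roughly like this; this assumes
the CBLAS interface shipped with OpenBLAS (cblas.h, typically linked
with -lopenblas), and multiply_blas plus the row-major n*n arrays a, b,
c are just illustrative placeholders:

#include <cblas.h>

// C = alpha * A * B + beta * C with alpha = 1.0 and beta = 0.0,
// all three matrices stored row-major as n*n doubles
void multiply_blas( int n, const double *a, const double *b, double *c )
{
    cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans,
                 n, n, n,   /* M, N, K */
                 1.0,       /* alpha */
                 a, n,      /* A, lda */
                 b, n,      /* B, ldb */
                 0.0,       /* beta */
                 c, n );    /* C, ldc */
}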

Christian

Bonita Montero

Nov 4, 2019, 10:17:08 AM
> Hint 4: Others have already solved it. If you download a good BLAS with
> optimizations for your system, e.g. OpenBLAS, or the one from Intel, you
> may find that a simple call to "dgemm" may well be 1000x faster than the
> code you have.

I'm coding it just for fun and not to solve a real issue.

Chris M. Thomasson

Nov 4, 2019, 4:57:38 PM
No problem with that. However, try using the GPU for fun. Just to see
the difference.