>> Before you start with parallelizing large matrix multiplication, maybe
>> you should first learn how to do this efficiently in a single thread.
>> Hint: consider what happens when each next k in the above innermost
>> loop misses the various CPU caches.
Ok, for large matrices where a vertical stride through b doesn't fit in
the L2-cache you're right. On a multiplication of a 64MB matrix with
a 64MB transposed matrix I get a superlinear speedup of 8.4x with 8
threads over one thread and a speedup of 14.5x with 16 threads; so
that's a speedup through SMT of 73%!
#include <iostream>
#include <vector>
#include <random>
#include <stdexcept>
#include <utility>
#include <thread>
#include <cstdlib>
#include <cmath>
using namespace std;
// Number of worker threads used by operator*() and main()'s fillmatrix;
// set once in main() from hardware_concurrency() and optionally capped
// by argv[2].
unsigned threadLimit = 0;
struct TransposedMatrix;
// Square matrix of doubles: one contiguous n*n element buffer (m_mtx)
// plus a row-pointer table (m_ptrs) so mtx[i][j] addresses row i,
// column j.
struct Matrix
{
Matrix() = default;
// n x n matrix, all elements zero-initialized (see init()).
Matrix( size_t n );
Matrix( Matrix const &other );
Matrix( Matrix &&other );
// Row access; const-qualified but returns a mutable row pointer.
double *operator []( size_t i ) const;
Matrix &operator =( Matrix const &other );
Matrix &operator =( Matrix &&other );
// In-place transpose; reinterprets *this as TransposedMatrix.
TransposedMatrix &transpose();
// NOTE(review): declared but no definition visible in this file; only
// referenced by the inline, never-called TransposedMatrix::resize.
Matrix &resize( size_t n );
size_t dim() const;
private:
void init( size_t n );
void copy( Matrix const &other );
vector<double> m_mtx;
vector<double *> m_ptrs;
};
// Tag type: identical layout and behaviour to Matrix, but its contents
// are to be read as transposed. operator*(Matrix, TransposedMatrix)
// relies on this so that b[j][k] is a unit-stride access. Adds no data
// members of its own.
struct TransposedMatrix : public Matrix
{
using Matrix::Matrix;
using Matrix::operator [];
using Matrix::dim;
TransposedMatrix &operator =( TransposedMatrix const &other );
TransposedMatrix &operator =( TransposedMatrix &&other );
TransposedMatrix &resize( size_t n );
};
// Construct an n x n matrix with every element set to 0.0.
inline
Matrix::Matrix( size_t n )
{
init( n );
}
// Deep copy: allocates own storage and rebuilds the row pointers so
// they point into this object's buffer (see copy()).
inline
Matrix::Matrix( Matrix const &other )
{
copy( other );
}
// Move construction: steal both vectors via the member-init list. The
// row pointers stay valid because they point into the element vector's
// heap storage, which the move transfers wholesale.
inline
Matrix::Matrix( Matrix &&other ) :
    m_mtx( move( other.m_mtx ) ),
    m_ptrs( move( other.m_ptrs ) )
{
}
// Returns the pointer to row i, so mtx[i][j] reads element (i,j).
// No bounds checking; shallow const (the pointee is mutable).
inline
double *Matrix::operator []( size_t i ) const
{
return m_ptrs[i];
}
// Deep copy assignment. Self-assignment was already harmless (copy()
// re-inits to the same size before copying element by element), but the
// guard skips that redundant O(n^2) pass. Marked inline for consistency
// with the other out-of-line member definitions.
inline
Matrix &Matrix::operator =( Matrix const &other )
{
    if( this != &other )
        copy( other );
    return *this;
}
// Move assignment: steal both vectors. The self-move guard keeps
// `m = move(m)` a no-op instead of leaving the vectors in a
// valid-but-unspecified state.
inline
Matrix &Matrix::operator =( Matrix &&other )
{
    if( this != &other )
    {
        m_mtx = move( other.m_mtx );
        m_ptrs = move( other.m_ptrs );
    }
    return *this;
}
// In-place transpose: swap every element below the diagonal with its
// mirror above it, then hand *this back typed as TransposedMatrix so it
// can be fed to operator*(Matrix, TransposedMatrix).
// NOTE(review): the downcast is formally undefined behaviour -- the
// object was constructed as a Matrix, not a TransposedMatrix. It works
// in practice only because TransposedMatrix adds no data members or
// virtual functions; consider a non-cast design (e.g. a view wrapper).
TransposedMatrix &Matrix::transpose()
{
size_t n = dim();
for( size_t i = 0; i != n; ++i )
for( size_t j = 0; j != i ; ++j )
swap( m_ptrs[i][j], m_ptrs[j][i] );
return *(TransposedMatrix *)this;
}
// Number of rows (== number of columns); 0 for a default-constructed
// matrix.
inline
size_t Matrix::dim() const
{
return m_ptrs.size();
}
// (Re)allocate an n x n zero-filled element buffer and point each entry
// of m_ptrs at the start of its row inside that buffer.
inline
void Matrix::init( size_t n )
{
    m_mtx.resize( n * n, 0.0 );
    m_ptrs.resize( n, nullptr );
    for( size_t row = 0; row != n; ++row )
        m_ptrs[row] = m_mtx.data() + row * n;
}
// Deep-copy helper: duplicate the element buffer with a single vector
// assignment (the old code zero-filled via init() and then copied every
// element individually -- two passes over the data), then rebuild the
// row pointers so they point into *our* buffer; copying other.m_ptrs
// would leave them dangling into the source matrix.
inline
void Matrix::copy( Matrix const &other )
{
    size_t n = other.m_ptrs.size();
    m_mtx = other.m_mtx;
    m_ptrs.resize( n, nullptr );
    for( size_t i = 0, j = 0; i != n; ++i, j += n )
        m_ptrs[i] = &m_mtx[j];
}
// Multiply a by the (already transposed) b: c[i][j] = sum_k a[i][k]*b[j][k].
// Because b is stored transposed, both operands are traversed with unit
// stride, which is what makes the inner loop cache-friendly. Rows of the
// result are split over ::threadLimit threads; the last chunk runs on
// the calling thread.
// Throws invalid_argument if the dimensions differ.
Matrix operator *( Matrix const &a, TransposedMatrix const &b )
{
    size_t n = a.dim();
    if( n == 0 )
        return Matrix();   // plain return enables RVO; return move(...) pessimized it
    if( n != b.dim() )
        throw invalid_argument( "unequal matrix-dimensions" );
    Matrix mtx( n );
    auto mtxMultThread = [&a, &b, &mtx, n]( size_t begin, size_t end )
    {
        for( size_t i = begin; i != end; ++i )
            for( size_t j = 0; j != n; ++j )
            {
                // Accumulate in a local instead of writing through
                // mtx[i][j] on every k-iteration.
                double sum = 0.0;
                for( size_t k = 0; k != n; ++k )
                    sum += a[i][k] * b[j][k];
                mtx[i][j] = sum;
            }
    };
    // Integer chunking replaces the old floating-point partition, which
    // could hand a thread rows past the end whenever n < ::threadLimit
    // (e.g. n = 4, 8 threads produced the chunk [4,5)).
    size_t nThreads = ::threadLimit ? ::threadLimit : 1;
    if( nThreads > n )
        nThreads = n;
    vector<thread> threads;
    threads.reserve( nThreads - 1 );
    for( size_t t = 0; t != nThreads; ++t )
    {
        size_t begin = t * n / nThreads,
               end = (t + 1) * n / nThreads;
        if( t == nThreads - 1 )
            mtxMultThread( begin, end );   // last chunk on the calling thread
        else
            threads.emplace_back( mtxMultThread, begin, end );
    }
    for( thread &thr : threads )
        thr.join();
    return mtx;   // NRVO / implicit move; move(mtx) would block NRVO
}
// Copy assignment: forward to the base-class assignment. The implicit
// derived-to-base conversion makes the previous C-style casts
// unnecessary (they also stripped const off `other`).
inline
TransposedMatrix &TransposedMatrix::operator =( TransposedMatrix const &other )
{
    Matrix::operator =( other );
    return *this;
}
// Move assignment: forward to the base-class move assignment. A derived
// rvalue binds directly to Matrix&&, so no C-style casts are needed.
inline
TransposedMatrix &TransposedMatrix::operator =( TransposedMatrix &&other )
{
    Matrix::operator =( move( other ) );
    return *this;
}
// Resize forwarding to Matrix::resize, returning the derived type.
// NOTE(review): Matrix::resize is declared but never defined in this
// file; this only links because the function is inline and never
// called anywhere.
inline
TransposedMatrix &TransposedMatrix::resize( size_t n )
{
Matrix::resize( n );
return *this;
}
int main( int argc, char **argv )
{
if( argc < 2 )
return EXIT_FAILURE;
size_t mtxDim = (size_t)sqrt( (size_t)(unsigned)atoi( argv[1]
) * 1024 * 1024 / 8 );
Matrix mtxA( mtxDim );
TransposedMatrix tmtxB( mtxDim );
Matrix mtxC;
cout << "matrix-dimension: " << mtxDim << endl;
if( (::threadLimit = thread::hardware_concurrency()) == 0 )
::threadLimit = 1;
if( argc >= 3 )
{
unsigned tlParam = (unsigned)atoi( argv[2] );
tlParam += tlParam == 0;
if( tlParam < (::threadLimit = thread::hardware_concurrency()) )
::threadLimit = tlParam;
}
cout << "thread-limit: " << ::threadLimit << endl;
auto fillmatrix = []( Matrix &mtx )
{
size_t n = mtx.dim();
if( n == 0 )
return;
vector<thread> threads;
auto mtxRndThread = [&mtx, n]( size_t begin, size_t end )
{
static thread_local random_device rd;
static thread_local uniform_real_distribution<double> urd(
0.0, 1.0 );
for( size_t i = begin; i != end; ++i )
for( size_t j = 0; j != n; ++j )
mtx[i][j] = urd( rd );
};
double part = (double)n / ::threadLimit,
p = 0.0;
for( unsigned t = 0; ; p += part )
{
size_t begin = (size_t)p,
end = (size_t)(p + part);
if( begin == end )
continue;
if( t++ == ::threadLimit - 1 )
{
mtxRndThread( begin, n );
break;
}
threads.emplace_back( mtxRndThread, begin, end );
}
for( thread &thr : threads )
thr.join();
};
fillmatrix( mtxA );
fillmatrix( tmtxB );
mtxC = move( mtxA * tmtxB );
return EXIT_SUCCESS;
}