--
Parallel::Loops can be nice for simple tasks on multicore machines.
Anything related to OpenMP or threading?
Brett
#!/usr/bin/env perl
##
## Demonstration of parallelizing PDL with Many-core Engine for Perl (MCE)
## using Strassen's algorithm for matrix multiplication.
##
## Requires MCE version 1.4 or later to run.
## http://code.google.com/p/many-core-engine-perl/
##
## MCE is my personal project. I'm new to PDL and wanted to see if PDL + MCE
## can be combined to maximize on available cores. I had no idea what to
## expect and was pleasantly surprised. The 1.4 release adds the send method.
##
## PDL is extremely powerful by itself. However, add MCE to it and be amazed.
##
## Regards,
## Mario Roy
##
use strict;
use warnings;
use FindBin;
use lib "$FindBin::Bin/lib";
use Time::HiRes qw(time);
use PDL;
use PDL::IO::Storable; ## Required for PDL + MCE combo
use MCE::Signal qw(-use_dev_shm);
use MCE;
my $tam = 2048; ## Size 2048x2048 (power of 2 only)
my $a = sequence $tam,$tam;
my $b = sequence $tam,$tam;
my $c = zeroes $tam,$tam;
my $max_parallel_level = 1; ## Levels deep to parallelize
my @p = ( ); ## For MCE results - must be global
my $start = time();
strassen($a, $b, $c, $tam); ## Start matrix multiplication
my $end = time();
# print $c;
printf STDERR "\n## Compute time: %0.03f (secs)\n\n", $end - $start;
##
sub strassen {
my $a = $_[0]; my $b = $_[1]; my $c = $_[2]; my $tam = $_[3];
my $level = $_[4] || 0;
## Perform the classic multiplication when matrix is <= 128 X 128
if ($tam <= 128) {
# for my $i (0 .. $tam - 1) { ## Perl arrays
# for my $j (0 .. $tam - 1) {
# $c->[$i][$j] = 0;
# for my $k (0 .. $tam - 1) {
# $c->[$i][$j] += $a->[$i][$k] * $b->[$k][$j];
# }
# }
# }
ins(inplace($c), $a x $b); ## PDL
return;
}
## Otherwise, perform multiplication using Strassen's algorithm
my ($mce, $p1, $p2, $p3, $p4, $p5, $p6, $p7);
my $nTam = $tam / 2;
if (++$level <= $max_parallel_level) {
## Configure and spawn MCE workers early
sub store_result {
my ($n, $result) = @_;
$p[$n] = $result;
}
$mce = MCE->new(
max_workers => 7,
user_tasks => [{
user_func => sub {
my $self = $_[0];
my $data = $self->{user_data};
my $result = zeroes $nTam,$nTam;
strassen($data->[0], $data->[1], $result, $data->[3], $level);
$self->do('store_result', $data->[2], $result);
},
task_end => sub {
$p1 = $p[1]; $p2 = $p[2]; $p3 = $p[3]; $p4 = $p[4];
$p5 = $p[5]; $p6 = $p[6]; $p7 = $p[7];
@p = ( );
}
}]
);
$mce->spawn();
}
## Allocate memory after spawning MCE workers
my $a11 = zeroes $nTam,$nTam; my $a12 = zeroes $nTam,$nTam;
my $a21 = zeroes $nTam,$nTam; my $a22 = zeroes $nTam,$nTam;
my $b11 = zeroes $nTam,$nTam; my $b12 = zeroes $nTam,$nTam;
my $b21 = zeroes $nTam,$nTam; my $b22 = zeroes $nTam,$nTam;
my $t1 = zeroes $nTam,$nTam; my $t2 = zeroes $nTam,$nTam;
$p1 = zeroes $nTam,$nTam; $p2 = zeroes $nTam,$nTam;
$p3 = zeroes $nTam,$nTam; $p4 = zeroes $nTam,$nTam;
$p5 = zeroes $nTam,$nTam; $p6 = zeroes $nTam,$nTam;
$p7 = zeroes $nTam,$nTam;
## Divide the matrices into 4 sub-matrices
divide_m($a11, $a12, $a21, $a22, $a, $nTam);
divide_m($b11, $b12, $b21, $b22, $b, $nTam);
## Calculate p1 to p7
if ($level <= $max_parallel_level) {
sum_m($a11, $a22, $t1, $nTam);
sum_m($b11, $b22, $t2, $nTam);
$mce->send([ $t1, $t2, 1, $nTam ]);
sum_m($a21, $a22, $t1, $nTam);
$mce->send([ $t1, $b11, 2, $nTam ]);
subtract_m($b12, $b22, $t2, $nTam);
$mce->send([ $a11, $t2, 3, $nTam ]);
subtract_m($b21, $b11, $t2, $nTam);
$mce->send([ $a22, $t2, 4, $nTam ]);
sum_m($a11, $a12, $t1, $nTam);
$mce->send([ $t1, $b22, 5, $nTam ]);
subtract_m($a21, $a11, $t1, $nTam);
sum_m($b11, $b12, $t2, $nTam);
$mce->send([ $t1, $t2, 6, $nTam ]);
subtract_m($a12, $a22, $t1, $nTam);
sum_m($b21, $b22, $t2, $nTam);
$mce->send([ $t1, $t2, 7, $nTam ]);
$mce->run();
} else {
sum_m($a11, $a22, $t1, $nTam);
sum_m($b11, $b22, $t2, $nTam);
strassen($t1, $t2, $p1, $nTam, $level);
sum_m($a21, $a22, $t1, $nTam);
strassen($t1, $b11, $p2, $nTam, $level);
subtract_m($b12, $b22, $t2, $nTam);
strassen($a11, $t2, $p3, $nTam, $level);
subtract_m($b21, $b11, $t2, $nTam);
strassen($a22, $t2, $p4, $nTam, $level);
sum_m($a11, $a12, $t1, $nTam);
strassen($t1, $b22, $p5, $nTam, $level);
subtract_m($a21, $a11, $t1, $nTam);
sum_m($b11, $b12, $t2, $nTam);
strassen($t1, $t2, $p6, $nTam, $level);
subtract_m($a12, $a22, $t1, $nTam);
sum_m($b21, $b22, $t2, $nTam);
strassen($t1, $t2, $p7, $nTam, $level);
}
## Calculate and group into a single matrix $c
calc_m($p1, $p2, $p3, $p4, $p5, $p6, $p7, $c, $nTam);
return;
}
sub divide_m {
my $m11 = $_[0]; my $m12 = $_[1]; my $m21 = $_[2]; my $m22 = $_[3];
my $m = $_[4]; my $tam = $_[5];
# for my $i (0 .. $tam - 1) { ## Perl arrays
# for my $j (0 .. $tam - 1) {
# $m11->[$i][$j] = $m->[$i][$j];
# $m12->[$i][$j] = $m->[$i][$j + $tam];
# $m21->[$i][$j] = $m->[$i + $tam][$j];
# $m22->[$i][$j] = $m->[$i + $tam][$j + $tam];
# }
# }
my $n1 = $tam - 1; ## PDL
my $n2 = $tam + $n1;
ins(inplace($m11), $m->slice("0:$n1,0:$n1"));
ins(inplace($m12), $m->slice("$tam:$n2,0:$n1"));
ins(inplace($m21), $m->slice("0:$n1,$tam:$n2"));
ins(inplace($m22), $m->slice("$tam:$n2,$tam:$n2"));
return;
}
sub calc_m {
my $p1 = $_[0]; my $p2 = $_[1]; my $p3 = $_[2]; my $p4 = $_[3];
my $p5 = $_[4]; my $p6 = $_[5]; my $p7 = $_[6]; my $c = $_[7];
my $tam = $_[8];
my $c11 = zeroes $tam,$tam; my $c12 = zeroes $tam,$tam;
my $c21 = zeroes $tam,$tam; my $c22 = zeroes $tam,$tam;
my $t1 = zeroes $tam,$tam; my $t2 = zeroes $tam,$tam;
sum_m($p1, $p4, $t1, $tam);
sum_m($t1, $p7, $t2, $tam);
subtract_m($t2, $p5, $c11, $tam);
sum_m($p3, $p5, $c12, $tam);
sum_m($p2, $p4, $c21, $tam);
sum_m($p1, $p3, $t1, $tam);
sum_m($t1, $p6, $t2, $tam);
subtract_m($t2, $p2, $c22, $tam);
# for my $i (0 .. $tam - 1) { ## Perl arrays
# for my $j (0 .. $tam - 1) {
# $c->[$i][$j] = $c11->[$i][$j];
# $c->[$i][$j + $tam] = $c12->[$i][$j];
# $c->[$i + $tam][$j] = $c21->[$i][$j];
# $c->[$i + $tam][$j + $tam] = $c22->[$i][$j];
# }
# }
ins(inplace($c), $c11, 0, 0); ## PDL
ins(inplace($c), $c12, $tam, 0);
ins(inplace($c), $c21, 0, $tam);
ins(inplace($c), $c22, $tam, $tam);
return;
}
sub sum_m {
my $a = $_[0]; my $b = $_[1]; my $r = $_[2]; my $tam = $_[3];
# for my $i (0 .. $tam - 1) { ## Perl arrays
# for my $j (0 .. $tam - 1) {
# $r->[$i][$j] = $a->[$i][$j] + $b->[$i][$j];
# }
# }
ins(inplace($r), $a + $b); ## PDL
return;
}
sub subtract_m {
my $a = $_[0]; my $b = $_[1]; my $r = $_[2]; my $tam = $_[3];
# for my $i (0 .. $tam - 1) { ## Perl arrays
# for my $j (0 .. $tam - 1) {
# $r->[$i][$j] = $a->[$i][$j] - $b->[$i][$j];
# }
# }
ins(inplace($r), $a - $b); ## PDL
return;
}
## matmult_pdl_b.pl 32: compute time: 0.000 secs
## matmult_pdl_thr.pl 32: compute time: 0.136 secs
## matmult_pdl_m.pl 32: compute time: 0.188 secs
## strassen_pdl_m.pl 32: compute time: 0.000 secs
## matmult_pdl_b.pl 64: compute time: 0.001 secs
## matmult_pdl_thr.pl 64: compute time: 0.138 secs
## matmult_pdl_m.pl 64: compute time: 0.109 secs
## strassen_pdl_m.pl 64: compute time: 0.001 secs
## matmult_pdl_b.pl 128: compute time: 0.007 secs
## matmult_pdl_thr.pl 128: compute time: 0.139 secs
## matmult_pdl_m.pl 128: compute time: 0.141 secs
## strassen_pdl_m.pl 128: compute time: 0.007 secs
## matmult_pdl_b.pl 256: compute time: 0.053 secs
## matmult_pdl_thr.pl 256: compute time: 0.152 secs
## matmult_pdl_m.pl 256: compute time: 0.499 secs
## strassen_pdl_m.pl 256: compute time: 0.040 secs
## matmult_pdl_b.pl 512: compute time: 0.431 secs
## matmult_pdl_thr.pl 512: compute time: 0.222 secs
## matmult_pdl_m.pl 512: compute time: 2.046 secs
## strassen_pdl_m.pl 512: compute time: 0.147 secs
## matmult_pdl_b.pl 1024: compute time: 3.605 secs
## matmult_pdl_thr.pl 1024: compute time: 0.965 secs
## matmult_pdl_m.pl 1024: compute time: 8.530 secs
## strassen_pdl_m.pl 1024: compute time: 0.801 secs
## matmult_pdl_b.pl 2048: compute time: 29.346 secs
## matmult_pdl_thr.pl 2048: compute time: 6.233 secs
## matmult_pdl_m.pl 2048: compute time: 37.156 secs
## strassen_pdl_m.pl 2048: compute time: 4.927 secs
## matmult_pdl_b.pl 4096: compute time: 234.716 secs
## matmult_pdl_thr.pl 4096: compute time: 64.579 secs
## matmult_pdl_m.pl 2048: -- took too long --
## strassen_pdl_m.pl 4096: -- took too long --
# uses PDL::NiceSlice;
$piddle(5:10) .= $new_values;
$piddle->ins($new_values, 5);
Hi David,That was a great post comparing the two. I'm new to PDL, just started about 3 weeks ago. I can try a couple of things.
Can you post the matmul_pdl_thr.pl source somewhere where I can look at it. That will be quite helpful.
It's a definite yes on the challenge for the sole reason on wanting to help scientists out there wanting more parallelism with PDL. :)Thanks,Mario
--
---
You received this message because you are subscribed to the Google Groups "The Quantified Onion" group.
To unsubscribe from this group and stop receiving emails from it, send an email to the-quantified-o...@googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.
--
my (@mce_a, $lvl);
if ($tam > 128) {
$lvl = 2; $mce_a[$_] = configure_and_spawn_mce() for (1 .. 7);
$lvl = 1; $mce_a[ 0] = configure_and_spawn_mce();
}
...
sub strassen_r {
...
if ($tam <= 128) {
ins(inplace($c), $a x $b);
return;
}
elsif ($lvl < 2 && defined $mce) {
strassen($a, $b, $c, $tam, $mce_a[ $mce->wid ]);
return;
}
...
}