Nickolay Kolchin <
nbko...@gmail.com> writes:
>Inability to use hardware at full due to language limitations
>(SIMD) is the third.
It seems to me that Forth sits in the same boat here as everybody
else. No language has standardized SIMD extensions; Fortran has the
array sub-language, but unfortunately it works on arrays (with alias
problems), not on some opaque type, so it's hardly easier to compile
than auto-vectorizing loops (and indeed, I hear that gfortran compiles
the array sublanguage into loops and then hopes that the
auto-vectorizer manages to vectorize them). GCC has vector types, but
they are limited (e.g., fixed-size) and seem to receive little love
from the gcc maintainers (after all, they are not used in the relevant
benchmarks).
I have done some work on using SIMD in Forth, but it has not gotten
into the production stage yet:
% EuroForth 2017 paper proposing a vector wordset for SIMD in Forth.
% booktitle and year are inherited from the euroforth17 entry via crossref.
@inproceedings{ertl17,
  author   = {Ertl, M. Anton},
  title    = {{SIMD} and Vectors},
  crossref = {euroforth17},
  pages    = {25--36},
  url      = {http://www.euroforth.org/ef17/papers/ertl.pdf},
  video    = {https://wiki.forth-ev.de/lib/exe/fetch.php/events:ef2017:simd-vectors.mp4},
  OPTnote  = {refereed},
  abstract = {Many programs have parts with significant data parallelism,
              and many CPUs provide SIMD instructions for processing
              data-parallel parts faster. The weak link in this chain is
              the programming language. We propose a vector wordset so
              that Forth programmers can make use of SIMD instructions to
              speed up the data-parallel parts of their applications. The
              vector wordset uses a separate vector stack containing
              opaque vectors with run-time determined length. Preliminary
              results using one benchmark show a factor~8 speedup of a
              simple vector implementation over scalar Gforth code, a
              smaller (factor 1.8) speedup over scalar VFX code; another
              factor of 3 is possible on this benchmark with a more
              sophisticated implementation. However, vectors have an
              overhead; this overhead is amortized in this benchmark at
              vector lengths between 3 and 250 (depending on which
              variants we compare).}
}
% Proceedings volume that ertl17 cross-references. It stays below ertl17
% because classic BibTeX requires a crossref parent to follow its children;
% booktitle duplicates title so that children inherit a booktitle.
@proceedings{euroforth17,
  title     = {33rd EuroForth Conference},
  booktitle = {33rd EuroForth Conference},
  year      = {2017},
  key       = {EuroForth'17},
  url       = {http://www.complang.tuwien.ac.at/anton/euroforth/ef17/papers/proceedings.pdf}
}
% Manlang'18 paper on software vector chaining (standalone entry, no crossref).
% The ampersand in booktitle is escaped as \& because a bare one is a LaTeX
% alignment character and breaks typesetting of the bibliography.
@inproceedings{ertl18manlang,
  author    = {Ertl, M. Anton},
  title     = {Software Vector Chaining},
  booktitle = {15th International Conference on Managed Languages \&
               Runtimes (Manlang'18)},
  year      = {2018},
  pages     = {Article-18},
  url       = {http://www.complang.tuwien.ac.at/papers/ertl18manlang.pdf},
  doi       = {10.1145/3237009.3237021},
  abstract  = {Providing vectors of run-time determined length as
               opaque value types is a good interface between the
               machine-level SIMD instructions and portable
               application-oriented programming languages.
               Implementing vector operations requires a loop that
               breaks the vector into SIMD-register-sized chunks.
               A compiler can fuse the loops of several vector
               operations together. However, during normal
               compilation this is only easy if no other control
               structures are involved. This paper explores an
               alternative: collect a trace of vector operations at
               run-time (following the program control flow during
               this collecting step), and then perform the combined
               vector loop. This arrangement has a certain
               run-time overhead, but its implementation is simpler
               and can happen independently, in a library.
               Preliminary performance results indicate that the
               overhead makes this approach beneficial only for
               long vectors ($>1$KB). For shorter vectors, unfused
               loops should be used in a library setting.
               Fortunately, this choice can be made at run time,
               individually for each vector operation.}
}
% EuroForth 2018 presentation of the Manlang'18 paper (see OPTnote).
% booktitle and year are inherited from the euroforth18 entry via crossref.
% The page range uses the en-dash form "--" rather than a single hyphen.
@inproceedings{ertl18chaining,
  author     = {Ertl, M. Anton},
  title      = {Software Vector Chaining},
  crossref   = {euroforth18},
  pages      = {54--55},
  url        = {http://www.euroforth.org/ef18/papers/ertl-chaining.pdf},
  url-slides = {http://www.euroforth.org/ef18/papers/ertl-chaining-slides.pdf},
  video      = {https://wiki.forth-ev.de/doku.php/events:ef2018:vectors},
  OPTnote    = {presentation slides, paper published at Manlang'18},
  abstract   = {Providing vectors of run-time determined length as
                opaque value types is a good interface between the
                machine-level SIMD instructions and portable
                application-oriented programming languages.
                Implementing vector operations requires a loop that
                breaks the vector into SIMD-register-sized chunks.
                A compiler can fuse the loops of several vector
                operations together. However, during normal
                compilation this is only easy if no other control
                structures are involved. This paper explores an
                alternative: collect a trace of vector operations at
                run-time (following the program control flow during
                this collecting step), and then perform the combined
                vector loop. This arrangement has a certain
                run-time overhead, but its implementation is simpler
                and can happen independently, in a library.
                Preliminary performance results indicate that the
                overhead makes this approach beneficial only for
                long vectors ($>1$KB). For shorter vectors, unfused
                loops should be used in a library setting.
                Fortunately, this choice can be made at run time,
                individually for each vector operation.}
}
% Proceedings volume that ertl18chaining cross-references. It stays below
% that entry because classic BibTeX requires a crossref parent to follow its
% children; booktitle duplicates title so that children inherit a booktitle.
@proceedings{euroforth18,
  title     = {34th EuroForth Conference},
  booktitle = {34th EuroForth Conference},
  year      = {2018},
  key       = {EuroForth'18},
  url       = {http://www.euroforth.org/ef18/papers/proceedings.pdf}
}
Github page:
https://github.com/AntonErtl/vectors