Robert Wessel <robert...@yahoo.com> writes:
>Stack based architectures have died out for a reason, they tend to
>make good code optimization very difficult, since the top-of-stack is
>always a hotspot.
Well, looking at register machine implementations, they actually turn
their register code internally into the same kind of hot-spot code:
They recognize when one instruction writes to a register and the next
reads from it, and optimize this case with forwarding. You can also
have an accumulator or stack machine and make this aspect explicit in
the instruction encoding. Of course, for the cases when you want to
access other data, you would use, e.g., registers or a belt.
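
To make the point concrete, here is a toy sketch (mine, not from any of
the papers below) that scans a made-up three-address register-code
sequence for the write-then-immediately-read pattern that forwarding
catches, i.e., the pattern an accumulator or stack encoding would make
explicit:

# Hypothetical instruction format: (dest, src1, src2), i.e. dest = src1 op src2.
code = [
    ("r1", "r5", "r6"),
    ("r2", "r1", "r7"),   # reads r1 right after it is written
    ("r3", "r2", "r2"),   # reads r2 right after it is written
    ("r4", "r8", "r9"),   # takes both inputs from elsewhere
]

for prev, curr in zip(code, code[1:]):
    if prev[0] in curr[1:]:            # result feeds the very next instruction
        print(prev[0], "is written and then read immediately:",
              "forwarding case / implicit accumulator")

Two of the three adjacent pairs show the pattern; only the last
instruction takes both of its inputs from earlier values.
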
There was a very interesting paper [kim&smith02] that plays with the
idea of organizing the code microarchitecturally into strands of
instructions connected by an accumulator. A later paper
[salverda&zilles07] found disadvantages of that model and provided
insights into what works and what doesn't. I wonder if making the
strands architectural would reduce the problems and provide benefits
over classical register architectures.
@InProceedings{kim&smith02,
  author =    {Ho-Seop Kim and James E. Smith},
  title =     {An Instruction Set and Microarchitecture for Instruction
               Level Distributed Processing},
  crossref =  {isca02},
  pages =     {71--81},
  url =       {http://www.ece.wisc.edu/~hskim/papers/kimh_ildp.pdf},
  annote =    {This paper addresses the problems of wide
               superscalars with communication across the chip and
               the number of write ports in the register file. The
               authors propose an architecture (ILDP) with
               general-purpose registers and with accumulators;
               each instruction accesses only one accumulator
               (read and/or write) and one register (read or
               write), and the death of an accumulator is
               specified explicitly in the instruction. The
               microarchitecture builds \emph{strands} from
               instructions working on an accumulator; a strand
               starts with an instruction writing to an accumulator
               without reading from it, continues with instructions
               reading from (and possibly writing to) the
               accumulator, and ends with an instruction that kills
               the accumulator. Strands are allocated to one out of
               eight processing elements (PEs) dynamically (i.e.,
               accumulators are renamed). A PE consists mainly of
               one ALU data path (but also a copy of the GPRs and
               an L1 cache). The authors evaluated this
               architecture by translating Alpha binaries into it
               and comparing it to a 4-wide or 8-wide Alpha
               implementation; their architecture has a lower L1
               cache latency, though. The performance of ILDP in
               clock cycles is competitive, and one can expect
               faster clocks for ILDP. The paper also presents
               data on other aspects, e.g., general-purpose
               register writes, which have to be promoted between
               strands and which are relatively few.}
}
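
Purely as an illustration of the strand-forming rule described in the
annotation above (the flag tuples are my own simplification, not the
ILDP encoding, and the fallback of cutting a strand at the next fresh
accumulator write is also mine), a sketch in Python:

def strands(instructions):
    """instructions: list of (reads_acc, writes_acc, kills_acc) flags."""
    current, out = [], []
    for insn in instructions:
        reads_acc, writes_acc, kills_acc = insn
        if writes_acc and not reads_acc:
            if current:              # defensive: previous strand never saw a kill
                out.append(current)
            current = [insn]         # a fresh write starts a new strand
        else:
            current.append(insn)     # reads (and possibly rewrites) the accumulator
        if kills_acc:
            out.append(current)      # the explicit kill ends the strand
            current = []
    if current:
        out.append(current)
    return out

example = [
    (False, True, False),   # write acc without reading it: starts strand 1
    (True,  True, False),   # read and rewrite acc
    (True,  False, True),   # read acc and kill it: ends strand 1
    (False, True, False),   # starts strand 2
    (True,  False, False),  # read acc
    (False, True, False),   # fresh write before any kill: starts strand 3
    (True,  False, True),   # read acc and kill it: ends strand 3
]
print([len(s) for s in strands(example)])   # -> [3, 2, 2]

Each strand's intermediate values then live in a single (renamed)
accumulator, which is what makes it natural to assign a whole strand to
one PE, as described above.
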
@Proceedings{isca02,
  title =     {$29^\textit{th}$ Annual International Symposium on Computer Architecture},
  booktitle = {$29^\textit{th}$ Annual International Symposium on Computer Architecture},
  year =      {2002},
  key =       {ISCA 29},
}
@InProceedings{salverda&zilles07,
  author =    {Pierre Salverda and Craig Zilles},
  title =     {Dependence-Based Scheduling Revisited: A Tale of Two Baselines},
  booktitle = {Sixth Annual Workshop on Duplicating, Deconstructing, and
               Debunking (WDDD 2007)},
  year =      {2007},
  url =       {http://www.ece.wisc.edu/~wddd/2007/papers/wddd_01.pdf},
  url2 =      {http://www-sal.cs.uiuc.edu/~zilles/papers/lanes.wddd-2007.pdf},
  annote =    {When the authors simulated the dependence-based
               scheduling work by Palacharla, Kim, and Smith, they
               found 30\% lower IPC than a conventional OoO
               machine, whereas the original simulations found
               only 5\% lower IPC. The paper analyses the reasons
               for this and provides a number of insights into how
               hardware schedulers, execution engines, and various
               features in them interact, and why and how
               dependence-based scheduling works. The authors'
               simulation had a number of significant differences
               from the simulation in the original work: it used a
               memory disambiguator, a 2-cycle load latency
               (instead of 1 cycle), and a better branch
               predictor. These changes increase the number of
               strands available at the same time, and the 8-lane
               dependence-based machine becomes lane-limited (and
               instruction fetch stalls waiting for a free lane),
               so it cannot profit from the improvements or work
               around the higher latency, whereas a conventional
               OoO machine can. 24 lanes would be required to
               bring the IPC disadvantage of the dependence-based
               machine down to 5\% on the authors' simulator.
               OTOH, with these parts of the simulation changed
               back to match the original work, dependence-based
               scheduling had only an 11\% IPC disadvantage on an
               8-lane machine (much closer to the original 5\%).}
}
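
As a toy model of the lane-limited behaviour described above (my own
simplification: one strand fetched per cycle, made-up strand lengths,
nothing taken from the paper's numbers), fetch stalls whenever all lanes
are busy, which hurts an 8-lane machine but not a 24-lane one:

def fetch_stall_cycles(strand_lengths, lanes):
    """Cycles fetch spends waiting for a free lane (toy model)."""
    busy_until = [0] * lanes           # cycle at which each lane becomes free
    clock = stalls = 0
    for length in strand_lengths:
        free_at = min(busy_until)
        lane = busy_until.index(free_at)
        if free_at > clock:
            stalls += free_at - clock  # all lanes occupied: fetch stalls
            clock = free_at
        busy_until[lane] = clock + length
        clock += 1                     # otherwise one strand is fetched per cycle
    return stalls

workload = [12] * 64                   # 64 strands, each occupying a lane for 12 cycles
print(fetch_stall_cycles(workload, 8), fetch_stall_cycles(workload, 24))

With these made-up numbers the 8-lane machine repeatedly waits for a
lane to free up while the 24-lane machine never does; the mechanism, not
the numbers, is what the paper is about.
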
- anton
--
M. Anton Ertl Some things have to be seen to be believed
an...@mips.complang.tuwien.ac.at Most things have to be believed to be seen
http://www.complang.tuwien.ac.at/anton/home.html