Sadly, the version of my mark-region prototype that I kept does not work and I haven't found the time to fix it so I'd like to archive it here for future reference and as a guide to anyone interested:
#include <iostream>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
using namespace std;
class Cons {
public:
char x;
char y;
Cons *t;
Cons() { }
Cons(char x, char y, Cons *t) : x(x), y(x), t(t) { }
};
const int size = 16000000;
typedef unsigned char byte;
// One 16kB region
const int regionSize = 14;
int firstFree = 0;
int bitVectorSize = 1 << (regionSize-6);
byte *marked = (byte *)_aligned_malloc(bitVectorSize, 64);
byte *allocated = (byte *)_aligned_malloc(bitVectorSize, 64);
Cons *region = (Cons *)_aligned_malloc(1 << regionSize, 1 << regionSize);
Cons **shadowStackStart = (Cons **)malloc(size * sizeof(Cons *));
Cons **shadowStackPtr = shadowStackStart;
byte *firstUnsetBit = new byte[256];
int findFirstUnsetBit(byte n) {
if ((n & 1) == 0) return 0;
return 1 + findFirstUnsetBit(n >> 1);
}
/// The empty list.
Cons *empty = NULL;
// Using a raw visit stack rather than a std::vector makes marking at least 2x
// faster.
Cons **visitStart = (Cons **)malloc(size * sizeof(Cons *));
Cons **visitPtr = visitStart;
// Double marking is 3% slower
void mark(Cons *r) {
int n = r - region;
if ((marked[n/8] & (1 << (n & 7))) == 0) {
marked[n/8] |= 1 << (n & 7);
if (r->t) *(visitPtr++) = r->t;
}
}
int gcCycles = 0;
void gc() {
++gcCycles;
// Mark the global roots.
for (Cons **r=shadowStackStart; r!=shadowStackPtr; ++r)
mark(*r);
// Mark the entire heap.
while (visitPtr != visitStart)
mark(*(--visitPtr));
// Filter out reachable blocks from the allocated list.
for (int i=0; i<bitVectorSize; ++i) {
allocated[i] &= marked[i];
marked[i] = 0;
}
/*
int size=0;
for (int i=0; i<bitVectorSize; ++i)
for (int j=0; j<8; ++j)
if (((allocated[i] >> j) & 1) == 1) {
++size;
}
else {
Cons *r = region + 8*i + j;
r->x = -1;
r->y = -1;
r->t = (Cons *)-1;
}
//cout << 100-100.0*size/4096 << "% reduction in heap size" << endl;
*/
firstFree = 0;
}
/// Returns a pointer into the current region or NULL if the region is full.
Cons *tryCons(char x, char y, Cons *t) {
for (; firstFree<bitVectorSize; ++firstFree) {
byte b = *(allocated + firstFree);
if (b != 255) {
// Find the first free block
int j = firstUnsetBit[b];
// Mark this block as allocated
*(allocated + firstFree) |= (1 << j);
Cons *r = region + 8*firstFree + j;
r->x = x;
r->y = y;
r->t = t;
return r;
}
}
return empty;
}
void oom() {
cout << "Out of memory " << shadowStackPtr - shadowStackStart << endl;
getc(stdin);
exit(1);
}
// Assume tail is already reachable.
Cons *cons(char x, char y, Cons *t) {
Cons *list = tryCons(x, y, t);
if (list) return list;
gc();
list = tryCons(x, y, t);
if (!list) oom();
return list;
}
Cons *cons(Cons *a, char x, char y, Cons *t) {
Cons *list = tryCons(x, y, t);
if (list) return list;
Cons **oldShadowStackPtr = shadowStackPtr;
if (a) *shadowStackPtr++ = a;
gc();
shadowStackPtr = oldShadowStackPtr;
list = tryCons(x, y, t);
if (!list) oom();
return list;
}
/// Is one position safe from a queen at another position.
bool safe(char x0, char y0, char x1, char y1) {
return x0!=x1 && y0!=y1 && x0-y0!=x1-y1 && x0+y0!=x1+y1;
}
/// Filter safe positions from a list of positions.
Cons *filter(char x0, char y0, Cons *list) {
if (!list) return empty;
if (safe(x0, y0, list->x, list->y))
{
// Copy value types out early so we have fewer reference types.
char x=list->x, y=list->y;
Cons *t = filter(x0, y0, list->t);
// t is not otherwise reachable so we must push it in the event of a GC.
// i.e. cons(t, x, y, t) rather than cons(x, y, t).
return cons(t, x, y, t);
}
else
return filter(x0, y0, list->t);
}
/// Search for solutions to the n-queens problem.
int search(int n, int nqs, Cons *qs, Cons *qps, int i) {
if (!qps) return (nqs==n ? i+1 : i);
Cons **oldShadowStackPtr = shadowStackPtr;
// We don't need to push qps as long as we copy out its components.
Cons ps = *qps;
// qs and ps.t are required by the final call so we must push them.
if (qs) *shadowStackPtr++ = qs;
if (ps.t) *shadowStackPtr++ = ps.t;
Cons *qs2 = cons(ps.x, ps.y, qs);
// qs2 is required after the call to filter so we must push it.
if (qs2) *shadowStackPtr++ = qs2;
Cons *ps2 = filter(ps.x, ps.y, ps.t);
// ps2 isn't used after this call so we don't need to push it.
i = search(n, nqs+1, qs2, ps2, i);
// Restore the shadow stack before the tail call.
shadowStackPtr = oldShadowStackPtr;
return search(n, nqs, qs, ps.t, i);
}
/// Construct a list of all board positions.
Cons *ps(int n) {
Cons *list = empty;
for (int i=1; i<=n; ++i)
for (int j=1; j<=n; ++j)
list = cons(list, i, j, list);
return list;
}
int main() {
memset(allocated, 0, bitVectorSize);
memset(marked, 0, bitVectorSize);
memset(region, 0, 1 << regionSize);
for (int i=0; i<256; ++i)
firstUnsetBit[i] = findFirstUnsetBit(i);
__int64 freq, tStart, tStop;
unsigned long TimeDiff;
QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
QueryPerformanceCounter((LARGE_INTEGER *)&tStart);
for (int n=1; n<12; ++n) {
cout << search(n, 0, empty, ps(n), 0) << " solutions to the " << n << "-queens problem" << endl;
}
QueryPerformanceCounter((LARGE_INTEGER *)&tStop);
TimeDiff = (unsigned long)(((tStop - tStart) * 1000000) / freq);
cout << "Took " << TimeDiff/1e6 << "s" << endl;
cout << gcCycles << " gc cycles => every " << TimeDiff/gcCycles << "us" << endl;
getc(stdin);
return 0;
}
--
Dr Jon Harrop, Flying Frog Consultancy Ltd.