Function Pointer from Lambda with Captures

Frederick Virchanza Gotham

unread,

Apr 14, 2023, 3:15:22 PM4/14/23

to

Since C++11, there has been an implicit conversion from a lambda to a
function pointer so long as the lambda has no captures. If the lambda
has captures, the implicit conversion is disabled. However it's easy to
get a function pointer from a lambda-with-captures if we use global
variables or the heap, something like:

std::function<void(void)> f; // global object

void Func(void)
{
f(); // invokes the global object
}

void Some_Library_Func(void (*const pf)(void))
{
pf();
}

int main(int argc, char **argv)
{
auto mylambda = [argc](void) -> void
{
cout << "Hello " << argc << "!" << endl;
};

f = mylambda;

Some_Library_Func(Func);
}

It is possible though to procure a normal function pointer from a
lambda-with-captures without making use of global variables or the heap
-- it can all be kept on the stack.

To invoke a capture lambda, we need two pieces of data:
Datum A: The address of the lambda object
Datum B: The address of the 'operator()' member function

Datum A is a pointer into data memory.
Datum B is a pointer into code memory.

The technique described in this post will only work on CPU's where the
program counter can be set to an address in data memory, and therefore
we will use 'void*' for Datum B rather than 'void(*)(void)'. I'm open
to correction here but I think this technique will work on every
implementation of C++ in existence today, even on microcontrollers such
as the Texas Instruments F28069 and the Arduino Atmel sam3x8e.

We will define a simple POD struct to hold these two pieces of data:

struct LambdaInfo {
void *data, *code;
};

Let's write a function that invokes a capture lambda, passing the 'this'
pointer as the first argument to the member function:

void InvokeLambda(LambdaInfo const *const p)
{
void (*pf)(void*) = (void (*)(void*))p->code;

return pf(p->data);
}

And now let's check what this got compiled to on an x86_64 computer:

mov rdx,QWORD PTR [rdi]
mov rax,QWORD PTR [rdi+0x8]
mov rdi,rdx
jmp rax

What we've got here is four instructions. To see a little more clearly
what's going on here, I'm going to replace the function arguments with
numerical constants:

void InvokeLambda(void)
{
void (*pf)(void*) = (void (*)(void*))0x1122334455667788;

return pf( (void*)0x99aabbccddeeffee );
}

gets compiled to:

movabs rdi,0x99aabbccddeeffee
movabs rax,0x1122334455667788
jmp rax

What we've got here now is three simple instructions. Here's the
assembler alongside the machine code:

movabs rdi,0x99aabbccddeeffee 48 bf ee ff ee dd cc bb aa 99
movabs rax,0x1122334455667788 48 b8 88 77 66 55 44 33 22 11
jmp rax ff e0

What we have here is 22 bytes worth of CPU instructions, which we can
put into a byte array as follows:

char unsigned instructions[22u] = {
0x48, 0xBF,
0xEE, 0xFF, 0xEE, 0xDD, 0xCC, 0xBB, 0xAA, 0x99,
0x48, 0xB8,
0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11,
0xFF, 0xE0,
};

This 22-byte array can be our thunk. I'll write a class to manage the
thunk:

class LambdaThunk {

char unsigned instructions[22u];

void SetData(void const volatile *const p) volatile
{
char unsigned const volatile *const q =
(char unsigned const volatile *)&p;

this->instructions[2] = q[0];
this->instructions[3] = q[1];
this->instructions[4] = q[2];
this->instructions[5] = q[3];
this->instructions[6] = q[4];
this->instructions[7] = q[5];
this->instructions[8] = q[6];
this->instructions[9] = q[7];
}

void SetCode(void const volatile *const p) volatile
{
char unsigned const volatile *const q =
(char unsigned const volatile *)&p;

this->instructions[12] = q[0];
this->instructions[13] = q[1];
this->instructions[14] = q[2];
this->instructions[15] = q[3];
this->instructions[16] = q[4];
this->instructions[17] = q[5];
this->instructions[18] = q[6];
this->instructions[19] = q[7];
}

public:

LambdaThunk(void) // set the opcodes
{
this->instructions[ 0u] = 0x48u; // movabs rdi
this->instructions[ 1u] = 0xBFu;
this->instructions[10u] = 0x48u; // movabs rax
this->instructions[11u] = 0xB8u;
this->instructions[20u] = 0xFFu; // jmp rax
this->instructions[21u] = 0xE0u;
}

template<typename LambdaType>
void AdaptFrom(LambdaType &arg) volatile
{
this->SetData(&arg);
this->SetCode( (void*)&LambdaType::operator() );
// The previous line works fine with GNU g++
}

template<typename LambdaType>
LambdaThunk(LambdaType &arg) : LambdaThunk() // set opcodes
{
this->AdaptFrom<LambdaType>(arg);
}

void (*getfuncptr(void) const volatile)(void)
{
return (void(*)(void))&this->instructions;
}
};

And now let's write some test code to try it out:

#include <iostream> // cout, endl
using std::cout;
using std::endl;

void Some_Library_Func( void (*const pf)(void) )
{
pf();
}

int main(int argc, char **argv)
{
auto mylambda = [argc](void) -> void
{
std::cout << "Hello " << argc << "!" << std::endl;
};

Some_Library_Func( LambdaThunk(mylambda).getfuncptr() );

cout << "Last line in Main" << endl;
}

This works fine, you can see it up on Godbolt here:

https://godbolt.org/z/r84hEsG1G

Things get a little more complicated if the lambda has a return value,
and several parameters. For example if the lambda returns a struct
containing 17 int's, 33 double's, and if the lambda takes 18 parameters,
then the assembler for 'InvokeLambda' is a little more complicated:

struct ReturnType {
int a[17];
double b[33];
void (*c)(int);
std::string d;
};

ReturnType InvokeLambda(int arg1, double arg2, float arg3,
int arg4, double arg5, float arg6,
int arg7, double arg8, float arg9,
int arg10, double arg11, float arg12,
int arg13, double arg14, float arg15,
int arg16, double arg17, float arg18)
{
ReturnType (*pf)(void volatile *,int,double,float,
int,double,float,
int,double,float,
int,double,float,
int,double,float,
int,double,float)
= (ReturnType (*volatile)(void volatile*,
int,double,float,
int,double,float,
int,double,float,
int,double,float,
int,double,float,
int,double,float))0x1122334455667788;

return pf( (void volatile *volatile)0x99aabbccddeeffee,
arg1,arg2,arg3,arg4,arg5,arg6,
arg7,arg8,arg9,arg10,arg11,arg12,
arg13,arg14,arg15,arg16,arg17,arg18);
}

gets compiled to:

push rbx
movss xmm8,DWORD PTR [rsp+0x30]
mov rbx,rdi
sub rsp,0x8
movss DWORD PTR [rsp],xmm8
push QWORD PTR [rsp+0x30]
mov eax,DWORD PTR [rsp+0x30]
push rax
movss xmm8,DWORD PTR [rsp+0x30]
movabs rax,0x1122334455667788
sub rsp,0x8
movss DWORD PTR [rsp],xmm8
push QWORD PTR [rsp+0x30]
push r9
mov r9d,r8d
mov r8d,ecx
mov ecx,edx
mov edx,esi
movabs rsi,0x99aabbccddeeffee
call rax
mov rax,rbx
add rsp,0x30
pop rbx
ret

which is 94 bytes worth of instructions instead of 22. Still manageable.

I'm not suggesting that a lambda-with-captures should convert implicitly
to a function pointer, because then we'd have the issue of the lifetime
of the thunk.

Anyone got any thoughts on this?

David Brown

unread,

Apr 14, 2023, 3:58:23 PM4/14/23

to

On 14/04/2023 21:15, Frederick Virchanza Gotham wrote:
> Since C++11, there has been an implicit conversion from a lambda to a
> function pointer so long as the lambda has no captures. If the lambda
> has captures, the implicit conversion is disabled. However it's easy to
> get a function pointer from a lambda-with-captures if we use global
> variables or the heap, something like:
>

<snip>

>
> Anyone got any thoughts on this?

My first thought is "why?". What are you actually trying to achieve
that is so crucial as to be worth this monstrosity? I know (from
previous threads) that you enjoy figuring out this kind of thing, and
fun and curiosity are perfectly good reasons. But I would like to know
if there are any other reasons or use-cases for this.

Alf P. Steinbach

unread,

Apr 14, 2023, 5:34:41 PM4/14/23

to

On 2023-04-14 9:15 PM, Frederick Virchanza Gotham wrote:
> std::function<void(void)> f; // global object
>
> void Func(void)
> {
> f(); // invokes the global object
> }
>
> void Some_Library_Func(void (*const pf)(void))
> {
> pf();
> }
>
> int main(int argc, char **argv)
> {
> auto mylambda = [argc](void) -> void
> {
> cout << "Hello " << argc << "!" << endl;
> };
>
> f = mylambda;
>
> Some_Library_Func(Func);
> }

That idea works for a more general portable solution.

An assembly language solution, as you presented (snipped), can use less
space, which can be important for embedded system programming, but we're
talking a handful of byte per thunk.

So I just put a generous capacity of 97 recursive calls in this code:

#include <array>
#include <functional>
#include <utility>

#include <type_traits>

#include <assert.h> // assert
#include <stddef.h> // size_t
#include <fmt/core.h> // <url: https://github.com/fmtlib/fmt>, for
output.

namespace my {
using std::array, // <array>
std::function, // <functional>
std::index_sequence, std::make_index_sequence; // <utility>

template< class T > using type_ = T;
template< class T > using ref_ = T&;
template< class T > using in_ = ref_<const T>;

template< class Result, class... Params >
class Thunked_
{
using Func = auto( Params... ) -> Result;
using Wrapped_func = function<Func>;

int m_index;

enum{ capacity = 97 };

static inline Wrapped_func the_wrapped_funcs[capacity];

template< size_t i >
static auto invoke_( Params... params )
-> Result
{ return the_wrapped_funcs[i]( params... ); }

template< size_t... indices >
static auto make_invokers( index_sequence<indices...> )
-> array<Func*, capacity>
{
static_assert( sizeof...( indices ) == capacity );
return { &invoke_<indices>... };
}

using Indices = make_index_sequence<capacity>;
static inline array<Func*, capacity> the_invokers =
make_invokers( Indices() );
static inline int n_in_use = 0;

public:
Thunked_( Wrapped_func f ):
m_index( n_in_use )
{
assert( n_in_use < capacity ); // TODO: exception, not
assertion.

the_wrapped_funcs[m_index] = move( f );
++n_in_use;
}

Thunked_( in_<Thunked_> ) = delete;
auto operator=( in_<Thunked_> ) -> ref_<Thunked_> = delete;

~Thunked_() { --n_in_use; }

auto func_ptr() const -> Func* { return the_invokers[m_index]; }
operator Func*() const { return func_ptr(); }
};
} // namespace my

namespace app {
using std::function; // <functional>

function<void()> global_f;

void call_f()
{
global_f(); // invokes the global object
}

void some_library_func( void pf() )
{
pf();
}

void run()
{
const int x = 42;
const auto my_lambda = [&x]() -> void
{
fmt::print( "Hello {:d}!\n", x );
};

#ifndef USE_GLOBAL
some_library_func( my::Thunked_<void>( my_lambda ) );
#else
// Original example code using global:
global_f = my_lambda;
some_library_func( &call_f );
#endif
}
} // namespace app

auto main() -> int { app::run(); }

People I know who have been interested in C++ thunking:

* The dog person (an English chap) in the C++ Lounge at SO.
* Andrei Alexandrescu, who called them trampoline functions.

Cheers, and happy hacking!, :)

- Alf

Ben Bacarisse

unread,

Apr 14, 2023, 7:27:56 PM4/14/23

to

I took the name Some_Library_Func as a clue. There is an external
interface (probably in C but maybe not) that requires are "plain"
function pointer argument.

--
Ben.

Bonita Montero

unread,

Apr 14, 2023, 9:49:06 PM4/14/23

to

Am 14.04.2023 um 21:15 schrieb Frederick Virchanza Gotham:

> I'm not suggesting that a lambda-with-captures should convert implicitly
> to a function pointer, because then we'd have the issue of the lifetime
> of the thunk.

You can accees thread-local and static variables from a lambda without
captures. That's useful f.e. if you have a callback for a C-function
where you need a usual function pointer. And imagine you've got sth.
like a function-object for std::sort() having no captures makes this
function object more lightweight since you won't have to copy the
function object across the recursive calls of std::sort.

vector<int> vi;
...
thread_local int x;
sort( vi.begin, vi.end(),
[]( int left, int right ) -> bool { left + x < right; } );

Havinv such a lambda-object without captures gives more performance.

Frederick Virchanza Gotham

unread,

Apr 15, 2023, 7:52:00 PM4/15/23

to

On Friday, April 14, 2023, Alf P. Steinbach wrote:
>
> Cheers, and happy hacking!, :)

The very first thunk accommodated a function that returned void
and took no arguments. The machine code of that thunk was 22 bytes.

Now let's write a thunk that will work for _any_ lambda,
irrespective of how many parameters it has or how big the return type
is. On an x86_64 computer, here's how a function call works:

The return address is pushed onto the stack
Int/Ptr arguments 1-6 go in registers RDI, RSI, RDX, RCX, R8, R9
Int/Ptr arguments 7 and above go onto the stack (right to left)
Floating point arguments 1-8 go in registers XMM0 - XMM7
Floating point arguments 9 and above go onto the stack
The callee stores its return value in RAX
If the return value is > 8 bytes, the next 8 bytes go in RDX
If the return value is > 16 bytes, extra bytes go onto the stack
The callee jumps back to the return address

So let's say we have a lambda function whose signature is something like
the following:

bool Func(int a, long b, float c, int *p);

If this lambda function has captures, then there will need to be a
hidden 'this' pointer:

bool Func(void *this, int a, long b, float c, int *p);

The first function signature without the 'this' pointer would be called
as follows:
Store a in RDI
Store b in RSI
Store c in XMM0
Store d in RDX

However the second function signature would be called as follows:
Store the 'this' pointer in RDI
Store a in RSI
Store b in RDX
Store c in XMM0
Store d in RCX

So in effect, when we introduce a hidden 'this' pointer as the first
parameter, all of the other registers get moved down one. So, inside
our thunk, we need to write assembler to do the following:

push R9 onto stack
move R8 -> R9
move RCX -> R8
move RDX -> RCX
move RSI -> RDX
move RDI -> RSI
store the 'this' pointer in RDI
invoke the lambda
pop r9 off the stack
jump back to the return address

So the assembler for the thunk should be something like:

push r9
mov r9,r8
mov r8,rcx
mov rcx,rdx
mov rdx,rsi
mov rsi,rdi
movabs rdi, 0x1122334455667788 // the 'this' pointer
movabs rax, 0x99aabbccddeeff22 // address of lambda code
call rax
add rsp,8 // pop r9 off stack
ret

This new and improved thunk is now 44 bytes. It will work fine for a
function that has any number of parameters, so long as the return
value isn't more than 16 bytes. See it working up on GodBolt:

https://godbolt.org/z/xTfEYo5jE

So next what we need to do is try to get it to work with a function that
returns a value greater than 16 bytes. The thunk we've written so far
is problematic in this situation because:
Step 1) We push r9 onto the stack
Step 2) We invoke the lambda function
Step 3) We pop r9 off the stack

Step 3 will cause a problem because instead of popping r9 off the stack,
it will pop the extra bytes of the return value off the stack. In order
to get around this, here's what we need to do:
Step 1) Before invoking the lambda, save the stack pointer
Step 2) After invoking the lambda, check if stack pointer has changed
Step 3) If the stack pointer is changed, do a 'memmove' to move
everything on the stack upwards by 8 bytes

Our 'memmove' operation would look like:

void move_stack(char *oldval, char *const newval)
{
char *p = oldval,
*q = oldval - 8u;

while ( oldval-- > newval )
{
*p-- = *q--;
}
}

If we compile that to assembler and use r15,rsp instead of rdi,rsi, here
is what it looks like:

start:
cmp rsp,r15
jae end
movzx r13d,BYTE PTR [r15-0x8]
sub r15,0x1
mov BYTE PTR [r15+0x1],r13b
jmp start
end:

So now let's try to append this onto the end of the thunk we've
already written:

push r9
mov r9,r8
mov r8,rcx
mov rcx,rdx
mov rdx,rsi
mov rsi,rdi
movabs rdi, 0x1122334455667788 (the 'this' pointer)
movabs rax, 0x99aabbccddeeff22 (address of lambda code)
mov r15, rsp
call rax
start_of_loop:
cmp rsp,r15
jae out_of_loop
movzx r13d,BYTE PTR [r15-0x8]
sub r15,0x1
mov BYTE PTR [r15+0x1],r13b
jmp start_of_loop
out_of_loop:
add rsp,8 ; move stack pointer back 8 bytes
ret

This latest thunk comes to 67 bytes. Here is my attempt to
implement it on GodBolt, but it's not working yet:

https://godbolt.org/z/nzWvE7q34

I haven't got it working yet but I'm very very close. I realise we'll
have to figure out how to restore the values of r15 and r13 but
sure we'll worry about that after we get it working.

Can anyone spot what's wrong with my code?

Frederick Virchanza Gotham

unread,

Apr 16, 2023, 9:23:02 AM4/16/23

to

On Sunday, April 16, 2023, I wrote:
>
> The callee stores its return value in RAX
> If the return value is > 8 bytes, the next 8 bytes go in RDX
> If the return value is > 16 bytes, extra bytes go onto the stack

It looks like it isn't actually this simple. First let's see a function
that returns a 128-Bit value:

struct ReturnTypeB {
long long unsigned a,b;
};

ReturnTypeB FuncB(void)
{
return { 0x1111111111111111, 0x2222222222222222 };
}

This gets compiled to:

mov rax,0x1111111111111111
mov rdx,0x2222222222222222
ret

This is exactly what I expected. But let's now make it return 24 bytes:

struct ReturnTypeA {
long long unsigned a,b,c;
};

ReturnTypeA FuncA(void)
{
return { 0x1111111111111111, 0x2222222222222222,
0x3333333333333333 };
}

This........ rather unexpectedly...... gets compiled to:

mov rax,rdi
mov rdx,0x1111111111111111
mov QWORD [rdi],rdx
mov rcx,0x2222222222222222
mov QWORD [rdi+0x8],rcx
mov rsi,0x3333333333333333
mov QWORD [rdi+0x10],rsi
ret

At first glance, it looks like this function is receiving a pointer in
RDI, and that it's storing the three 64-Bit numbers sequentially at the
address specified by RDI. To be totally sure about this, I took this
machine code and ran it through a decompiler, which gave me:

uint64_t *FuncA(uint64_t *const p)
{
p[0] = 0x1111111111111111u;
p[1] = 0x2222222222222222u;
p[2] = 0x3333333333333333u;
return p;
}

What I've learned here I think can be summed up as follows:
(a) If the return value <= 8 bytes, it goes in RAX
(b) If the return value <= 16 bytes, the first 8 go in RAX and the
second 8 go in RDX
(c) If the return value >= 16 bytes, RAX and RDX are used differently.
There is a hidden first parameter in RDI to the function which acts
as a pointer indicating where the return value is to be stored. This
hidden pointer is returned from the function in RAX. RDX isn't used.

So now let's see what happens if I turn this into a member function,
because I don't know if RDI will be used for the 'this' pointer or for
the address to store the return value. Let's see, I wrote the
following:

void *volatile global;

struct ReturnTypeA {
long long unsigned a,b,c;
};

struct MyClass {
ReturnTypeA FuncA(void)
{
global = this;
return { 0x1111111111111111, 0x2222222222222222,
0x3333333333333333 };
}
};

And compiled it to the following assembler:

// Next two lines set up the stack frame
push rbp
mov rbp,rsp
// The next line sets return value = first argument
mov rax,rdi
// The next line 3 lines appear to get the 'this'
// pointer from RSI and then store it in 'global'
mov QWORD PTR [rbp-0x8],rsi
mov rcx,QWORD PTR [rbp-0x8]
mov QWORD PTR [rip+0x0],rcx # R_X86_64_PC32 global-0x4
// The next 2 lines store a 64-Bit number in rdi+0
mov rcx,0x1111111111111111
mov QWORD PTR [rdi],rcx
// The next 2 lines store a 64-Bit number in rdi+8
mov rcx,0x2222222222222222
mov QWORD PTR [rdi+0x8],rcx
// The next 2 lines store a 64-Bit number in rdi+16
mov rcx,0x3333333333333333
mov QWORD PTR [rdi+0x10],rcx
// The next 2 lines restore the frame pointer and return
pop rbp
ret

So I think I've solved the mystery here. If the return type is more than
16 bytes in size, the hidden 'this' pointer is put inside RSI rather
than RDI, because RDI is used for the address of where the return value
should be stored.

So let's go back and take another look at our original thunk code for
moving all the registers down:

push r9
mov r9,r8
mov r8,rcx
mov rcx,rdx
mov rdx,rsi
mov rsi,rdi

mov rdi, 0x1122334455667788 (the 'this' pointer)
mov rax, 0x99aabbccddeeff22 (address of lambda code)

Let's try to edit this code to put the 'this' pointer in RSI instead of
RDI, maybe something like:

push r9
mov r9,r8
mov r8,rcx
mov rcx,rdx
mov rdx,rsi

// REMOVE THIS LINE : mov rsi,rdi
mov rsi, 0x1122334455667788 (the 'this' pointer)
mov rax, 0x99aabbccddeeff22 (address of lambda code)

Ok this might be the correct thunk code, and here it is up on GodBolt:

https://godbolt.org/z/65YEsaT8o

It works. Ok so now there's only one thing left to do. We need to
have two separate thunk templates, Thunk Type A and Thunk Type B.
Thunk Type A = for any function whose return type <= 16 bytes
Thunk Type B = for any function whose return type > 16 bytes

At compile-time, we can try to make this distinction by using 'sizeof'
on the lambda's return type.... which is what I'm going to try do to
now, see up on GodBolt:

https://godbolt.org/z/MzYGxfz9Y

Take a look yourself.

Frederick Virchanza Gotham

unread,

Apr 16, 2023, 11:16:08 AM4/16/23

to

On Sunday, April 16, 2023 at 2:23:02 PM UTC+1, Frederick Virchanza Gotham wrote:
>
> https://godbolt.org/z/MzYGxfz9Y
>
> Take a look yourself.

And finally, if the implementation I've described so far isn't available yet on your target architecture, here's a less-efficient implementation that will 'just work' on any system:

https://godbolt.org/z/sG1rTbWE6

And here it is copy-pasted:

#include <cassert> // assert
#include <cstddef> // size_t
#include <utility> // forward

template<typename LambdaType>
class LambdaThunk {
protected:

static thread_local LambdaType *p_lambda_object;

template <typename ReturnType, typename... Params>
static ReturnType Actual_Thunk(Params... args)
{
assert( nullptr != p_lambda_object );
return (*p_lambda_object)(std::forward<Params>(args)...);
}

template <typename ReturnType, typename... Params>
static ReturnType (*Get_Thunk_Address(ReturnType (LambdaType::*)(Params...) const))(Params...)
{
return Actual_Thunk<ReturnType,Params...>;
}

public:

LambdaThunk(LambdaType &obj)
{
p_lambda_object = &obj;
}

auto thunk(void) const volatile // yes this could be a static function
{
return Get_Thunk_Address(&LambdaType::operator());
}
};

template<typename LambdaType>
thread_local LambdaType *LambdaThunk<LambdaType>::p_lambda_object = nullptr;

int Some_Library_Func(int (*const pf)(char const*))
{
return pf("monkey");
}

#include <iostream>

using std::cout;
using std::endl;

int main(int argc, char **argv)
{

auto mylambda = [argc](char const *const p) -> int
{
cout << "Hello " << argc << " " << p << "!" << endl;
return 77;
};

int const z = Some_Library_Func( LambdaThunk(mylambda).thunk() );

cout << "z = " << z << endl;
}

I think the only place where this inefficient implementation could fall
down is if you have a lambda defined inside a recursive function... but
you'd just need to make sure that the thunk is invoked before the
function is re-entered (unless of course, upon re-entry, you account
for the thunk not having been invoked yet).

Frederick Virchanza Gotham

unread,

Apr 16, 2023, 6:02:36 PM4/16/23

to

On Friday, April 14, 2023, Alf P. Steinbach wrote:
>

> That idea works for a more general portable solution.
>
> An assembly language solution, as you presented (snipped), can use less
> space, which can be important for embedded system programming, but we're
> talking a handful of byte per thunk.
>
> So I just put a generous capacity of 97 recursive calls in this code:

I was having so much fun writing my own solution that I didn't pay enough attention to yours.

I've edited your code so that the following syntax is possible:

some_library_func( my::Thunked_(my_lambda) );

Note that you don't have to give it the function signature, it figures it out automatically.
I did a few other little tweaks like making 'n_in_use' atomic.

Here's what I've done with your code:

https://godbolt.org/z/G56xbPMWf

What do you reckon about sharing it to the standard proposals mailing list for consideration to be put in the Standard Library?

There could be an entire family of thunk classes, not just one to remove the hidden 'this' pointer. For example there could be another thunk to swap the first and second argument, turning:

void SomeFunc(float a, int b);

into:

void SomeFunc(int b, float a);

or another thunk to reverse the order of arguments.

Frederick Virchanza Gotham

unread,

Apr 16, 2023, 6:13:27 PM4/16/23

to

On Sunday, April 16, 2023 at 11:02:36 PM UTC+1, Frederick Virchanza Gotham wrote:
>
> I've edited your code so that the following syntax is possible:
>
> some_library_func( my::Thunked_(my_lambda) );
>
> Note that you don't have to give it the function signature, it figures it out automatically.
> I did a few other little tweaks like making 'n_in_use' atomic.
>
> Here's what I've done with your code:
>
> https://godbolt.org/z/G56xbPMWf

Actually I've made two more changes:
* Get rid of std::function and just uses simple pointers
* Inside the invoke_ function, use 'std::forward' on the arguments

Latest rendition:

https://godbolt.org/z/PT1oaoveo

Alf P. Steinbach

unread,

Apr 16, 2023, 8:32:10 PM4/16/23

to

On 2023-04-17 12:02 AM, Frederick Virchanza Gotham wrote:
> On Friday, April 14, 2023, Alf P. Steinbach wrote:
>>
>> That idea works for a more general portable solution.
>>
>> An assembly language solution, as you presented (snipped), can use less
>> space, which can be important for embedded system programming, but we're
>> talking a handful of byte per thunk.
>>
>> So I just put a generous capacity of 97 recursive calls in this code:
>
>
> I was having so much fun writing my own solution that I didn't pay enough attention to yours.
>
> I've edited your code so that the following syntax is possible:
>
> some_library_func( my::Thunked_(my_lambda) );

Just a thought, but does that play well with a lambda like

[&]( auto arg ) { cout << arg << endl; }

?

> Note that you don't have to give it the function signature, it figures it out automatically.
> I did a few other little tweaks like making 'n_in_use' atomic.
>
> Here's what I've done with your code:
>
> https://godbolt.org/z/G56xbPMWf
>
> What do you reckon about sharing it to the standard proposals mailing list for consideration to be put in the Standard Library?

A proposal needs

* A working portable implementation.
* Tested with multiple compilers/architectures (preferable).
* Proposal text expressed in standardese.
* Having been /discussed/ on the appropriate mailing list (in the old
days it was instead comp.std.c++), before being submitted as a proposal.
* A champion or two on the committee.

The (SO) Good Puppy's thunks proposal failed on the last point, I
believe: he was unable to get anyone to use their time on it, as I
remember it even though he appeared in person at a meeting.

Anyway, for a proposal it would need to support thread safety.

And for my standard C++ only approach I reckon that that involves a
least one non-obvious engineering decision of choosing between
thread-local variables OR mutual exclusion for thunk creation OR
requiring declaration of a thunk pool in each thread using thunks.
That's just off the top of my head. I haven't tried it.

For that reason the assembly scheme you posted, even though I didn't
verify it, but assuming that it worked, would perhaps be better.

For with stack allocation one gets automatic thread safety and no
discussion about the design and implementation options for that.

> There could be an entire family of thunk classes, not just one to remove the hidden 'this' pointer. For example there could be another thunk to swap the first and second argument, turning:
>
> void SomeFunc(float a, int b);
>
> into:
>
> void SomeFunc(int b, float a);
>
> or another thunk to reverse the order of arguments.

Hm. Off the cuff,

template< class Func >
auto argreversed( Func& f )
-> auto
{ return []( auto a, auto b ) { return f( b, a ); }; }

Maybe add perfect argument forwarding.

- Alf

Frederick Virchanza Gotham

unread,

Apr 17, 2023, 5:04:11 AM4/17/23

to

On Monday, April 17, 2023 at 1:32:10 AM UTC+1, Alf P. Steinbach wrote:
>
> And for my standard C++ only approach I reckon that that involves a
> least one non-obvious engineering decision of choosing between
> thread-local variables OR mutual exclusion for thunk creation OR
> requiring declaration of a thunk pool in each thread using thunks.
> That's just off the top of my head. I haven't tried it.

Over on the mailing list for the Standard, Edward Catmur posted similar code to yours except he used a mutex.

I took Edward's code and edited it similarly to how I edited your own. Here is how I replied:

I had been working with similar code posted by Alf P. Steinbach over on comp.lang.c++ last week, but I much prefer your code Edward.

I have tweaked your code a bit so that the following syntax is possible:

SomeLibraryFunc( thunk(f) );

Note that you don't have to explicitly tell it the lambda's function signature, it figures it out itself.

I've also done a handful of other things:
* Replaced the pair of pointers with just one pointer
* Got rid of the 'used' array and instead just check for nullptr's in the 'data' array
* Mark all members of 'thunk' as 'noexcept'
* Propagate the 'noexcept' from the lambda to the thunk
* Accommodate lambdas marked as 'mutable'

Here's what I've got:

https://godbolt.org/z/5xT7e84WY

Frederick Virchanza Gotham

unread,

Apr 18, 2023, 5:24:05 AM4/18/23

to

On Monday, April 17, 2023 at 10:04:11 AM UTC+1, Frederick Virchanza Gotham wrote:

> Over on the mailing list for the Standard, Edward Catmur posted similar code to yours except he used a mutex.

So far there have been two implementations of generating a thunk for a lambda:
(a) Write the thunk as machine code onto the stack, and execute the stack
(b) Have a global pool of instantiations of a template function, and manage a global array of pointers associated 1:1 with the instantiations of the template function

The drawback of A is that the stack must be executable.

The drawback of B is that if you want to ensure 36 levels of recursion, the executable file will contain 36 thunks, and that you need to lock and unlock a mutex.

Today I've come up with a third solution, which only has one thunk in the final executable file, and which can handle as many threads and as much recursion as your heap allows.

What I've done is keep a global map of frame pointers to lambda objects. I got it working:

https://godbolt.org/z/1WzGq7cj9

And here it is copy-pasted:

#include <cassert> // assert
#include <cstddef> // size_t

#include <cstdlib> // abort (if mutex fails to lock)
#include <mutex> // mutex, lock_guard
#include <utility> // index_sequence, make_index_sequence, forward, declval
#include <map> // map
#include <iterator> // prev, next

namespace std {
extern void *frame_pointer(void);
}

// The next 5 lines are an x86_64 assembler implementation of std::frame_pointer
__asm__(
"_ZSt13frame_pointerv:\n" // mangled name of std::frame_pointer
" mov %rbp, %rax\n"
" ret\n"
);

// The stack grows negatively on the x86_64, so we use the following
// function to compare two frame pointers:
bool IsFurtherFromTop(void const *const p, void const *const q)
{
// returns true if 'q' is closer to the top of the stack
return p > q;
}

// The next three templates: 'ToFuncPtr', 'ToReturnType', 'IsNoExcept'
// are just helpers to make this all possible. You can scroll past them
// to Line #78

namespace detail {

// The following template turns a member function pointer into a
// normal function pointer, but we only want it to use decltype on it
template<typename ReturnType, class ClassType, bool b_noexcept, typename... Params>
ReturnType (*ToFuncPtr(ReturnType (ClassType::*)(Params...) const noexcept(b_noexcept)))(Params...) noexcept(b_noexcept)
{
return nullptr; // suppress compiler warning
}
// and also the non-const version for non-const member functions (or mutable lambdas):
template<typename ReturnType, class ClassType, bool b_noexcept, typename... Params>
ReturnType (*ToFuncPtr(ReturnType (ClassType::*)(Params...) noexcept(b_noexcept)))(Params...) noexcept(b_noexcept)
{
return nullptr; // suppress compiler warning
}

// The following template isolates the return type of a member function pointer,
// but we only want it to use decltype on it. I tried using 'std::result_of_t'
// instead but I get a compiler error for the lambda being an incomplete type
template <typename ReturnType, class ClassType, typename... Params>
ReturnType ToReturnType(ReturnType (ClassType::*)(Params...) const)
{
return std::declval<ReturnType>(); // suppress compiler warning
}
// and also the non-const version for non-const member functions (or mutable lambdas):
template <typename ReturnType, class ClassType, typename... Params>
ReturnType ToReturnType(ReturnType (ClassType::*)(Params...))
{
return std::declval<ReturnType>(); // suppress compiler warning
}

// The following template determines whether a non-static member function is noexcept
template<typename ReturnType, class ClassType, bool b_noexcept, typename... Params>
consteval bool IsNoExcept(ReturnType (ClassType::*)(Params...) const noexcept(b_noexcept))
{
return b_noexcept;
}
// and also the non-const version for non-const member functions (or mutable lambdas):
template<typename ReturnType, class ClassType, bool b_noexcept, typename... Params>
consteval bool IsNoExcept(ReturnType (ClassType::*)(Params...) noexcept(b_noexcept))
{
return b_noexcept;
}

} // close namespace 'detail'

template<typename LambdaType>
class thunk {
protected:
using size_t = std::size_t;
using R = decltype(detail::ToReturnType(&LambdaType::operator()));
using FuncPtr = decltype(detail::ToFuncPtr (&LambdaType::operator())); // preserves the 'noexcept'

// In the map on the next line, we keep track of which frame pointers
// are associated with which lambda objects
// map.begin()->first == frame pointer
// map.begin()->second == address of lambda object
inline static thread_local std::map<void*,LambdaType*> frame_pointers_for_lambdas;

public:
explicit thunk(LambdaType &arg) noexcept {
try {
assert( nullptr == frame_pointers_for_lambdas[ std::frame_pointer() ] );
frame_pointers_for_lambdas[ std::frame_pointer() ] = &arg;
}
catch(...) {
assert(nullptr == "failed to add frame pointer to map");
std::abort(); // ifdef NDEBUG
}
}
~thunk() noexcept {
assert( nullptr != frame_pointers_for_lambdas[ std::frame_pointer() ] );
frame_pointers_for_lambdas.erase(std::frame_pointer());
}
FuncPtr get() const noexcept {
return &invoke;
}
operator FuncPtr() const noexcept { return this->get(); }

protected:

template<typename... A> static R invoke(A... a) noexcept(detail::IsNoExcept(&LambdaType::operator()))
{
assert( frame_pointers_for_lambdas.size() >= 1u );

void *const fp = std::frame_pointer();

assert( false == IsFurtherFromTop(fp, frame_pointers_for_lambdas.begin()->first) );

size_t i;
for ( i = 1u; i < frame_pointers_for_lambdas.size(); ++i )
{
if ( IsFurtherFromTop(fp, std::next(frame_pointers_for_lambdas.begin(),i)->first) )
{
break;
}
}

LambdaType &mylambda = *std::next(frame_pointers_for_lambdas.begin(),i - 1u)->second;
return mylambda(std::forward<A>(a)...);
}

thunk(void) = delete;
thunk(thunk const & ) = delete;
thunk(thunk &&) = delete;
thunk &operator=(thunk const & ) = delete;
thunk &operator=(thunk &&) = delete;
thunk const *operator&(void) const = delete; // just to avoid confusion
thunk *operator&(void) = delete; // just to avoid confusion
};

// ========================================================
// And now the test code.............
// ========================================================

#include <cstring> // strcpy, strcat
#include <cstdio> // sprintf
#include <thread> // jthread
#include <stop_token> // stop_token

#include <iostream> // cout, endl

using std::cout, std::endl;

bool SomeLibraryFunc( bool (*arg)(int), int val )
{
return arg(val);
}

void RecursiveFunc(int arg)
{
if ( arg < 0 ) return;

thread_local char str_buf[128u]; // so we don't need a mutex for cout

auto f = [arg](int const i) -> bool
{
std::sprintf(str_buf, "Hello %i thunk\n",i);
cout << str_buf;
return i < 2;
};

bool const retval = SomeLibraryFunc( thunk(f), arg );

std::strcpy(str_buf, "Returned: ");
std::strcat(str_buf, retval ? "true\n" : "false\n");
cout << str_buf;

RecursiveFunc(--arg);
}

void ThreadEntryPoint(std::stop_token,int const arg)
{
RecursiveFunc(arg);
}

void Spawn_All_Thread_And_Join(int arg)
{
std::jthread mythreads[4u];

for ( auto &t : mythreads )
{
t = std::jthread(ThreadEntryPoint,arg += 7);

}
}

int main(int argc, char **argv)
{

while ( argc++ < 3 ) Spawn_All_Thread_And_Join(argc);
}