I have a problem with GCC 4.0.3 -- if a class has neither an explicit
destructor nor an explicit copy constructor, NRVO does not work for it.
I suspect this problem applies to other GCC versions as well. This (of
course) adversely affects the performance of the compiled code.
I cannot explain why this happens and am looking for:
- an explanation of how the implicit cctor/dtor affect the NRVO-related
decisions made by the compiler
- some kind of fix/workaround
Thanks!
(Btw, I checked this in MSVC 2005 -- everything is fine there)
Here is the test code (uncomment either 1. or 2. or both and the first
test will become as fast as the second one):
-------- code start -------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
/*
 * Payload type for the NRVO benchmark.
 *
 * Holds a 1000-element unsigned array so that copying an instance is
 * expensive enough to show up in the timings. As written, the struct has no
 * user-declared constructor, copy constructor or destructor; the TEST section
 * below contains commented-out ones that can be enabled to compare timings.
 */
struct TestX
{
    /************* TEST **************/
    // if you add any of these we get fast:
    // 1.
    //TestX() {}
    //TestX(const TestX& rhs) { memcpy(data, rhs.data, sizeof(data)); }
    // 2.
    //~TestX() {}
    /************ END TEST **************/

    // Large buffer: makes any extra copy of the object measurable.
    unsigned data[1000];

    // Zero-fills the whole buffer.
    void run_test()
    {
        memset(data, 0, sizeof(data));
    }
};
// Provides the two ways of producing a filled TestX that the benchmark
// compares: returning it by value, and filling a caller-supplied object.
class RVO_Test
{
public:
// Returns a freshly filled TestX by value from a named local variable.
// The parameter i is not used by the body.
TestX fill(unsigned i)
{
TestX data;
data.run_test();
return data;
}
// Fills the caller's TestX in place through a reference; nothing is returned.
// The parameter i is not used by the body.
void fill(unsigned i, TestX &data)
{
data.run_test();
}
};
/*
 * Benchmark driver.
 *
 * Runs two timed sections, each consisting of 10 rounds of MAX iterations:
 *   Option 1 - obtains a TestX through the by-value RVO_Test::fill overload;
 *   Option 2 - fills a local TestX through the by-reference overload.
 * Each round prints an accumulated count (so the result is observably used)
 * and each section prints its elapsed CPU time in seconds as measured by
 * clock(). Always returns 0.
 */
int main()
{
    const unsigned MAX = 500000;

    // test #1
    {
        printf("Running Option 1\n");
        clock_t start = clock();
        for (int k = 0; k < 10; k++)
        {
            // slow or fast depending on the TEST section of TestX
            unsigned count = 0;
            RVO_Test a;
            for (unsigned i = 0; i < MAX; ++i) {
                TestX x = a.fill(MAX);
                count += x.data[5];
            }
            // count is unsigned, so it must be printed with %u, not %i.
            printf("Count: %u\n", count);
        }
        printf("Time: %f\n", double(clock() - start) / CLOCKS_PER_SEC);
    }

    // test #2
    {
        printf("Running Option 2\n");
        clock_t start = clock();
        for (int k = 0; k < 10; k++)
        {
            // fast
            unsigned count = 0;
            RVO_Test a;
            for (unsigned i = 0; i < MAX; ++i) {
                TestX x;
                a.fill(MAX, x);
                count += x.data[5];
            }
            // count is unsigned, so it must be printed with %u, not %i.
            printf("Count: %u\n", count);
        }
        printf("Time: %f\n", double(clock() - start) / CLOCKS_PER_SEC);
    }

    return 0;
}
-------- code end -------
Sincerely yours,
Michael.
I'll check the asm later, but from what I see all your functions are
inline, so it is possible that NRVO is not involved at all.
It could be a simple loop optimization (hoisting the variable out of the loop), i.e.
for(...) char a;
vs
char a;
for(...);
After reviewing the asm, there is good news and bad news.
Bad news: the memcpy is indeed present, so it is a real problem.
Good news: gcc 4.1.1 does not have this issue.
So either upgrade or use your workaround.
Thanks for the help, Vlad! (Sorry for the late post, I just returned from
vacation.)