Bonita Montero
unread,Nov 29, 2021, 3:26:35 PM11/29/21You do not have permission to delete messages in this group
Sign in to report message
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to
Why is sorting a pointer-array to strings slower than sorting the
strings themselfes ? I've checked that with that code:
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <algorithm>
#include <random>
#include <concepts>
#include <chrono>
#include <execution>
using namespace std;
using namespace chrono;
int main( int argc, char **argv )
{
if( argc < 2 )
return EXIT_FAILURE;
using vc_t = vector<char>;
using vc_it = vc_t::iterator;
vector<char> block;
{
ifstream ifs;
ifs.exceptions( ios_base::badbit );
ifs.open( argv[1], ios_base::binary );
ifs.seekg( 0, ios_base::end );
streampos size = ifs.tellg();
if( size > (size_t)-1 || !size )
return EXIT_FAILURE;
ifs.seekg( 0, ios_base::beg );
block.resize( (size_t)size );
ifs.read( &*block.begin(), (streamsize)size );
}
cout << "read" << endl;
auto parse = [&]<typename Callback>( Callback callback ) -> size_t
requires
requires( Callback callback, char const *str, size_t len )
{
{ callback( str, len ) };
}
{
size_t n = 0;
for( vc_it cit = block.begin(); cit != block.end(); )
{
vc_it lineBegin = cit;
for( ; cit != block.end() && (unsigned char)*cit > ' '; ++cit );
if( cit != lineBegin )
++n, callback( &*lineBegin, cit - lineBegin );
for( ; cit != block.end() && (unsigned char)*cit <= ' '; ++cit );
}
return n;
};
vector<string> strings;
size_t sumLen = 0;
strings.reserve( parse( [&]( char const *, size_t len ) { sumLen +=
len; } ) );
if( !strings.capacity() )
return EXIT_FAILURE;
cout << "counted, avg length: " << (double)(ptrdiff_t)sumLen /
(ptrdiff_t)strings.capacity() << endl;
parse( [&]( char const *str, size_t len ) { strings.emplace_back( str,
len ); } );
cout << "parsed" << endl;
sort( strings.begin(), strings.end() );
strings.erase( unique( strings.begin(), strings.end() ), strings.end() );
cout << "uniqued" << endl;
strings.shrink_to_fit();
block.resize( 0 ), block.reserve( 0 );
auto randomize = []<typename RandomIt>( RandomIt begin, size_t n )
requires random_access_iterator<RandomIt>
{
mt19937_64 mt;
uniform_int_distribution<size_t> uidIdx( 0, n - 1 );
for( size_t i = 0, j; i != n; ++i )
{
while( (j = uidIdx( mt )) == i );
swap( begin[i], begin[j] );
}
};
randomize( strings.begin(), strings.size() );
auto start = high_resolution_clock::now();
sort( strings.begin(), strings.end() );
double ns = (double)(int64_t)duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count() / (ptrdiff_t)strings.size();
cout << "string-sort: " << ns << endl;
vector<string *> pStrings;
pStrings.reserve( strings.size() );
for( string &str : strings )
pStrings.emplace_back( &str );
randomize( pStrings.begin(), pStrings.size() );
start = high_resolution_clock::now();
sort( pStrings.begin(), pStrings.end(),
[]( string *left, string *right ) { return *left < *right; } );
ns = (double)(int64_t)duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count() / (ptrdiff_t)strings.size();
cout << "pString-sort: " << ns << endl;
}
I've read 1.4 million strings with an average lengh of 9 characters,
so most strings should fall under the short string optimization and
swapping the string-objects would have a higher cost than swapping
two pointers or a pointer and a length. And sorting string-pointers
is actually nearly twice times slower than sorting the string-objects.
Why is that ?