I wrote a small benchmark. It reports that my comparison is
almost exactly 15 times faster than a plain strcmp() on my Zen 2
machine.
#if defined(_WIN32)
#include <Windows.h>
#endif
#include <atomic>
#include <chrono>
#include <concepts>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <string_view>
#include <type_traits>
/// Tests a null-terminated string and a string view for equality.
///
/// Windows: the fast path reads str[sv.length()] and memcmp()s sv.length()
/// characters of str before it is known that str is really that long. If
/// that touches an unmapped page, the resulting access violation is caught
/// with SEH and the answer is computed by a careful element-wise loop that
/// stops at str's terminator.
/// Other platforms: the comparison is delegated to basic_string_view's
/// operator==.
///
/// NOTE: the Windows paths compare raw character values, so a custom
/// TraitsType::eq is not consulted there.
/// NOTE(review): reading past str's terminator on the fast path is outside
/// what the C++ standard guarantees; it relies on the platform reporting
/// unmapped pages as structured exceptions.
///
/// @param str  pointer to a null-terminated string
/// @param sv   view to compare against
/// @return     true when str consists of exactly the characters of sv
///             followed by its terminator, false otherwise
template<typename CharType, typename TraitsType>
requires std::same_as<std::make_unsigned_t<CharType>, unsigned char>
    || std::same_as<CharType, wchar_t>
    || std::same_as<CharType, char8_t>
    || std::same_as<CharType, char16_t>
    || std::same_as<CharType, char32_t>
bool svCmp( CharType const *str, std::basic_string_view<CharType, TraitsType> const &sv )
{
#if defined(_WIN32)
    std::size_t const len = sv.length();
    __try
    {
        // terminator check first: a longer str is rejected without a memcmp()
        return !str[len] && std::memcmp( str, sv.data(), len * sizeof(CharType) ) == 0;
    }
    // handle only the fault the fast path can legitimately provoke; every
    // other structured exception keeps propagating to outer handlers
    __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION
        ? EXCEPTION_EXECUTE_HANDLER
        : EXCEPTION_CONTINUE_SEARCH )
    {
        CharType const *const svData = sv.data();
        for( std::size_t i = 0; i != len; ++i )
            // a terminator inside the first len characters means str is shorter than sv
            if( !str[i] || str[i] != svData[i] )
                return false;
        // all len characters matched and none was a terminator, so for a
        // properly terminated str index len is readable; str must end here
        return !str[len];
    }
#else
    return str == sv;
#endif
}
// File-scope conveniences for the benchmark driver: names from std and
// std::chrono may be used unqualified from here on.
using namespace std;
using namespace std::chrono;

// Global sink: each benchmark run stores its count of successful
// comparisons here (presumably so the timed loop has an observable result).
std::atomic<std::size_t> aSum{ 0 };
/// Benchmark driver.
///
/// Compares two equal 4096-character strings one million times with
/// strcmp() and one million times with svCmp(), then prints
/// (strcmp time) / (svCmp time); a value above 1 means svCmp() was faster.
int main()
{
    constexpr std::size_t N = 0x1000;
    constexpr std::size_t WARMUP_ROUNDS = 1'000;
    constexpr std::size_t TIMED_ROUNDS = 1'000'000;
    std::string const
        cStr( N, '*' ),
        svStr( cStr );
    using cmp_sig_t = bool (*)( char const *str, std::string_view const &sv );
    // strcmp() ignores sv.length() and scans for the terminator itself; that
    // is valid here because the view always refers to a std::string's buffer
    cmp_sig_t const cmpStrcmp = []( char const *str, std::string_view const &sv ) -> bool {
        return std::strcmp( str, sv.data() ) == 0; };
    cmp_sig_t const cmpSvCmp = []( char const *str, std::string_view const &sv ) -> bool {
        return svCmp( str, sv ); };
    // returns the time in nanoseconds taken by TIMED_ROUNDS calls of cmp
    auto bench = [&]( cmp_sig_t cmp ) -> double
    {
        // volatile: the pointer is re-read for every call, so the compiler
        // cannot inline the callee or hoist the call out of the loop
        cmp_sig_t volatile vCmp = cmp;
        // build both arguments once so the timed loop contains only the calls
        char const *const lhs = cStr.c_str();
        std::string_view const rhs = svStr;
        std::size_t sum = 0;
        // untimed warm-up so the first timed candidate is not penalised by cold caches
        for( std::size_t r = WARMUP_ROUNDS; r--; )
            sum += vCmp( lhs, rhs );
        // steady_clock is monotonic; high_resolution_clock need not be
        auto const start = std::chrono::steady_clock::now();
        for( std::size_t r = TIMED_ROUNDS; r--; )
            sum += vCmp( lhs, rhs );
        auto const stop = std::chrono::steady_clock::now();
        ::aSum = sum;
        return static_cast<double>(
            std::chrono::duration_cast<std::chrono::nanoseconds>( stop - start ).count() );
    };
    // run the two benchmarks as separate statements: as operands of a single
    // division their evaluation order would be unspecified
    double const strcmpNs = bench( cmpStrcmp );
    double const svCmpNs = bench( cmpSvCmp );
    std::cout << (strcmpNs / svCmpNs) << '\n';
}
Unfortunately, memcmp() in Visual C++'s runtime isn't SSE- or
AVX-optimized as I had expected. Nevertheless, memcmp() does its
comparisons in 8-byte chunks in an x64 build.
I call both comparison functions through a volatile function
pointer so that the compiler cannot inline or otherwise optimize
my own function at the call site; this keeps the comparison
against strcmp() fair.