Hi Everyone,
I want to check whether there is any opposition to adding SSE4.1 and SSE4.2 feature detection. Some modern algorithms, such as BLAKE2, can occasionally take advantage of it.
I don't plan to write hand-coded SSE4.2 assembly, so I'm limiting this to intrinsics for now. We can put it to immediate use with the upcoming BLAKE2.
If there are no objections, the change is below. It applies to Visual Studio 2008 and later, and to GCC 4.3 and later. We might need to add some specialized tests for Clang and ICC.
Jeff
$ cat sse4.diff
diff --git a/config.h b/config.h
index ae9dd79..5d35c52 100644
--- a/config.h
+++ b/config.h
@@ -416,6 +416,14 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
#endif
+// Intrinsics available in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and
+// MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx)
+#if !defined(CRYPTOPP_DISABLE_SSE4) && ((_MSC_VER >= 1500) || (CRYPTOPP_GCC_VERSION >= 40300) || defined(__SSE4_1__) || defined(__SSE4_2__))
+ #define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 1
+#else
+ #define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 0
+#endif
+
#if !defined(CRYPTOPP_DISABLE_SSSE3) && !defined(CRYPTOPP_DISABLE_AESNI) && CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && (CRYPTOPP_GCC_VERSION >= 40400 || _MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110 || defined(__AES__))
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1
#else
diff --git a/config.recommend b/config.recommend
index 343d41b..491b4cb 100644
--- a/config.recommend
+++ b/config.recommend
@@ -416,6 +416,14 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
#endif
+// Intrinsics available in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and
+// MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx)
+#if !defined(CRYPTOPP_DISABLE_SSE4) && (_MSC_VER >= 1500 || (CRYPTOPP_GCC_VERSION >= 40300) || defined(__SSE4_1__) || defined(__SSE4_2__))
+ #define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 1
+#else
+ #define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 0
+#endif
+
#if !defined(CRYPTOPP_DISABLE_SSSE3) && !defined(CRYPTOPP_DISABLE_AESNI) && CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && (CRYPTOPP_GCC_VERSION >= 40400 || _MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110 || defined(__AES__))
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1
#else
diff --git a/cpu.cpp b/cpu.cpp
index 2b04132..313d3e3 100644
--- a/cpu.cpp
+++ b/cpu.cpp
@@ -165,7 +165,7 @@ static bool TrySSE2()
}
bool g_x86DetectionDone = false;
-bool g_hasMMX = false, g_hasISSE = false, g_hasSSE2 = false, g_hasSSSE3 = false, g_hasAESNI = false, g_hasCLMUL = false, g_isP4 = false, g_hasRDRAND = false, g_hasRDSEED = false;
+bool g_hasMMX = false, g_hasISSE = false, g_hasSSE2 = false, g_hasSSSE3 = false, g_hasSSE4 = false, g_hasAESNI = false, g_hasCLMUL = false, g_isP4 = false, g_hasRDRAND = false, g_hasRDSEED = false;
word32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
// MacPorts/GCC does not provide constructor(priority). Apple/GCC and Fink/GCC do provide it.
@@ -206,6 +206,7 @@ void DetectX86Features()
if ((cpuid1[3] & (1 << 26)) != 0)
g_hasSSE2 = TrySSE2();
g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9));
+ g_hasSSE4 = g_hasSSE2 && ((cpuid1[2] & (1<<19)) || (cpuid1[2] & (1<<20)));
g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25));
g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1));
diff --git a/cpu.h b/cpu.h
index 36d26dd..ff3e39d 100644
--- a/cpu.h
+++ b/cpu.h
@@ -118,6 +118,7 @@ extern CRYPTOPP_DLL bool g_hasMMX;
extern CRYPTOPP_DLL bool g_hasISSE;
extern CRYPTOPP_DLL bool g_hasSSE2;
extern CRYPTOPP_DLL bool g_hasSSSE3;
+extern CRYPTOPP_DLL bool g_hasSSE4;
extern CRYPTOPP_DLL bool g_hasAESNI;
extern CRYPTOPP_DLL bool g_hasCLMUL;
extern CRYPTOPP_DLL bool g_isP4;
@@ -168,6 +169,13 @@ inline bool HasSSSE3()
return g_hasSSSE3;
}
+inline bool HasSSE4()
+{
+ if (!g_x86DetectionDone)
+ DetectX86Features();
+ return g_hasSSE4;
+}
+
inline bool HasAESNI()
{
if (!g_x86DetectionDone)