Adds runtime benchmark to guess phys. core count.

Adds a small runtime benchmark to prevent performance degradation by overprovisioning SMT CPUs with too many threads. This is a temporary workaround until a hardware and OS independent detection of the physical core count through Botan::CPUID is in place.
author: Matthias Gierlings <[email protected]> 2017-11-21 19:34:41 +0100
committer: Matthias Gierlings <[email protected]> 2017-11-23 00:19:31 +0100
commit: 0e28426ca870e1e560b0f89baad92071f6813c4e (patch)
tree: 24c1cef78e5f1235f52d591e03e2b615c736963c /src/lib
parent: f0af55db4e0f3b4424a56f36e2d1885445ce9535 (diff)
3 files changed, 122 insertions, 3 deletions
diff --git a/src/lib/pubkey/xmss/xmss_privatekey.cpp b/src/lib/pubkey/xmss/xmss_privatekey.cpp
index 9ac89a571..8cfab7f75 100644
--- a/src/lib/pubkey/xmss/xmss_privatekey.cpp
+++ b/src/lib/pubkey/xmss/xmss_privatekey.cpp
@@ -100,7 +100,7 @@ XMSS_PrivateKey::tree_hash(size_t start_idx,
       {
       target_node_height,
       static_cast<size_t>(
-         std::ceil(std::log2(std::thread::hardware_concurrency())))
+         std::ceil(std::log2(XMSS_Tools::max_threads())))
       });
 
    // skip parallelization overhead for leaf nodes.
@@ -171,7 +171,7 @@ XMSS_PrivateKey::tree_hash(size_t start_idx,
          node_addresses[i].set_tree_index(
             (node_addresses[2 * i + 1].get_tree_index() - 1) >> 1);
          using rnd_tree_hash_fn_t =
-            void (XMSS_Common_Ops::*)(secure_vector<uint8_t>&,
+            void (XMSS_PrivateKey::*)(secure_vector<uint8_t>&,
                                       const secure_vector<uint8_t>&,
                                       const secure_vector<uint8_t>&,
                                       XMSS_Address& adrs,
@@ -181,7 +181,7 @@ XMSS_PrivateKey::tree_hash(size_t start_idx,
          threads.emplace_back(
             std::thread(
                static_cast<rnd_tree_hash_fn_t>(
-                  &XMSS_Common_Ops::randomize_tree_hash),
+                  &XMSS_PrivateKey::randomize_tree_hash),
                this,
                std::ref(nodes[i]),
                std::ref(ro_nodes[2 * i]),
diff --git a/src/lib/pubkey/xmss/xmss_tools.cpp b/src/lib/pubkey/xmss/xmss_tools.cpp
new file mode 100644
index 000000000..24553b144
--- /dev/null
+++ b/src/lib/pubkey/xmss/xmss_tools.cpp
@@ -0,0 +1,79 @@
+/*
+ * XMSS Tools
+ * (C) 2017 Matthias Gierlings
+ *
+ * Botan is released under the Simplified BSD License (see license.txt)
+ **/
+
+#include <botan/xmss_tools.h>
+
+namespace Botan {
+
+#if defined(BOTAN_TARGET_OS_HAS_THREADS)
+
+size_t XMSS_Tools::max_threads()
+   {
+   static const size_t cached_count = bench_threads(); // benchmark runs once
+   return cached_count;
+   }
+
+// Benchmarks a comparable hashing workload using hardware_concurrency()
+// and hardware_concurrency() / 2 threads, then returns whichever thread
+// count finished in less wall-clock time.
+size_t XMSS_Tools::bench_threads()
+   {
+   const size_t hw_threads = std::thread::hardware_concurrency();
+   if(hw_threads <= 1)
+      {
+      return 1;
+      }
+   const size_t BENCH_ITERATIONS = 1000;
+   const std::vector<size_t> concurrency { hw_threads, hw_threads / 2 };
+   std::vector<std::chrono::nanoseconds> durations;
+   std::vector<std::thread> threads;
+   threads.reserve(hw_threads);
+
+   for(const size_t cc : concurrency)
+      {
+      AutoSeeded_RNG rng;
+      std::vector<XMSS_Hash> hash(hw_threads, XMSS_Hash("SHA-256"));
+      std::vector<secure_vector<uint8_t>> data(
+         hw_threads, rng.random_vec(hash[0].output_length()));
+      // Fewer threads get proportionally more iterations each, so both
+      // rounds perform (roughly) the same total amount of work.
+      const size_t iterations = BENCH_ITERATIONS * (hw_threads / cc);
+
+      const auto start = std::chrono::high_resolution_clock::now();
+      for(size_t i = 0; i < cc; ++i)
+         {
+         // Each worker gets its own hash[i] and data[i] element.
+         XMSS_Hash& hs = hash[i];
+         secure_vector<uint8_t>& d = data[i];
+         threads.emplace_back([iterations, &hs, &d]()
+            {
+            for(size_t n = 0; n < iterations; ++n)
+               {
+               hs.h(d, d, d);
+               }
+            });
+         }
+      // Join before stopping the clock: the measurement has to cover the
+      // hashing work itself, not merely the time needed to spawn threads.
+      for(auto& t : threads)
+         {
+         t.join();
+         }
+      durations.emplace_back(
+         std::chrono::duration_cast<std::chrono::nanoseconds>(
+            std::chrono::high_resolution_clock::now() - start));
+      threads.clear();
+      }
+
+   // On a tie the smaller thread count wins.
+   return (durations[0] < durations[1]) ? concurrency[0] : concurrency[1];
+   }
+
+#endif
+
+}
+
diff --git a/src/lib/pubkey/xmss/xmss_tools.h b/src/lib/pubkey/xmss/xmss_tools.h
index bbd31fd9f..6e45e882d 100644
--- a/src/lib/pubkey/xmss/xmss_tools.h
+++ b/src/lib/pubkey/xmss/xmss_tools.h
@@ -12,6 +12,12 @@
 #include <botan/secmem.h>
 #include <iterator>
 #include <type_traits>
+#if defined(BOTAN_TARGET_OS_HAS_THREADS)
+   #include <thread>
+   #include <chrono>
+   #include <botan/xmss_hash.h>
+   #include <botan/auto_rng.h>
+#endif
 
 namespace Botan {
 
@@ -53,8 +59,42 @@ class XMSS_Tools final
                 void>::type>
       static void concat(secure_vector<uint8_t>& target, const T& src, size_t len);
 
+      /**
+       * @deprecated Determines the maximum number of threads to be used
+       * efficiently, based on runtime timing measurements. Ideally the
+       * result will correspond to the physical number of cores. On systems
+       * supporting simultaneous multi threading (SMT)
+       * std::thread::hardware_concurrency() usually reports a supported
+       * number of threads which is bigger (typically by a factor of 2) than
+       * the number of physical cores available. Using more threads than
+       * physically available cores for computationally intensive tasks
+       * resulted in slowdowns compared to using a number of threads equal to
+       * the number of physical cores on test systems. This function is a
+       * temporary workaround to prevent performance degradation due to
+       * overstressing the CPU with too many threads.
+       *
+       * @return Presumed number of physical cores based on timing measurements.
+       **/
+      static size_t max_threads(); // TODO: Remove max_threads() and use
+                                   // Botan::CPUID once proper platform
+                                   // independent detection of physical cores is
+                                   // available.
+
    private:
       XMSS_Tools();
+      /**
+       * @deprecated Measures the time t1 it takes to calculate hashes using
+       * std::thread::hardware_concurrency() many threads and the time t2
+       * calculating the same number of hashes using
+       * std::thread::hardware_concurrency() / 2 threads.
+       *
+       * @return std::thread::hardware_concurrency() if t1 < t2
+       *         std::thread::hardware_concurrency() / 2 otherwise.
+       **/
+      static size_t bench_threads(); // TODO: Remove bench_threads() and use
+                                     // Botan::CPUID once proper platform
+                                     // independent detection of physical cores
+                                     // is available.
    };
 
 template <typename T, typename U>
author	Matthias Gierlings <[email protected]>	2017-11-21 19:34:41 +0100
committer	Matthias Gierlings <[email protected]>	2017-11-23 00:19:31 +0100
commit	0e28426ca870e1e560b0f89baad92071f6813c4e (patch)
tree	24c1cef78e5f1235f52d591e03e2b615c736963c /src/lib
parent	f0af55db4e0f3b4424a56f36e2d1885445ce9535 (diff)