From 185d85338562627aa4800436a3fe6efa11886351 Mon Sep 17 00:00:00 2001 From: lloyd Date: Wed, 28 Oct 2009 19:50:06 +0000 Subject: Add an AltiVec SIMD_32 implementation. Tested and works for Serpent and XTEA on a PowerPC 970 running Gentoo with GCC 4.3.4 Uses a GCC syntax for creating literal values instead of the Motorola syntax [{1,2,3,4} instead of (1,2,3,4)]. In tests so far, this is much, much slower than either the standard scalar code, or using the SIMD-in-scalar-registers code. It looks like for whatever reason GCC is refusing to inline the function: SIMD_Altivec(__vector unsigned int input) { reg = input; } and calls it with a branch hundreds of times in each function. I don't know if this is the entire reason it's slower, but it definitely can't be helping. The code handles unaligned loads OK but assumes stores are to an aligned address. This will fail drastically some day, and needs to be fixed to either use scalar stores, which (most?) PPCs will handle (if slowly), or batch the loads and stores so we can work across the loads. Considering the code so far loads 4 vectors of data in one go this would probably be a big win (and also for loads, since instead of doing 8 loads for 4 registers only 5 are needed). 
--- src/utils/simd_32/simd_altivec.h | 178 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 src/utils/simd_32/simd_altivec.h (limited to 'src/utils/simd_32/simd_altivec.h') diff --git a/src/utils/simd_32/simd_altivec.h b/src/utils/simd_32/simd_altivec.h new file mode 100644 index 000000000..d6aaa699d --- /dev/null +++ b/src/utils/simd_32/simd_altivec.h @@ -0,0 +1,178 @@ +/** +* Altivec SIMD +*/ + +#ifndef BOTAN_SIMD_ALTIVEC_H__ +#define BOTAN_SIMD_ALTIVEC_H__ + +#include <botan/loadstor.h> +#include <altivec.h> +#undef vector + +namespace Botan { + +class SIMD_Altivec + { + public: + + SIMD_Altivec(const u32bit B[4]) + { + reg = (__vector unsigned int){B[0], B[1], B[2], B[3]}; + } + + SIMD_Altivec(u32bit B0, u32bit B1, u32bit B2, u32bit B3) + { + reg = (__vector unsigned int){B0, B1, B2, B3}; + } + + SIMD_Altivec(u32bit B) + { + reg = (__vector unsigned int){B, B, B, B}; + } + + static SIMD_Altivec load_le(const void* in) + { + const u32bit* in_32 = static_cast<const u32bit*>(in); + + __vector unsigned int R0 = vec_ld(0, in_32); + __vector unsigned int R1 = vec_ld(12, in_32); + + __vector unsigned char perm = vec_lvsl(0, in_32); + + perm = vec_xor(perm, vec_splat_u8(3)); + + R0 = vec_perm(R0, R1, perm); + + return SIMD_Altivec(R0); + } + + static SIMD_Altivec load_be(const void* in) + { + const u32bit* in_32 = static_cast<const u32bit*>(in); + + __vector unsigned int R0 = vec_ld(0, in_32); + __vector unsigned int R1 = vec_ld(12, in_32); + + __vector unsigned char perm = vec_lvsl(0, in_32); + + R0 = vec_perm(R0, R1, perm); + + return SIMD_Altivec(R0); + } + + void store_le(byte out[]) const + { + u32bit* out_32 = reinterpret_cast<u32bit*>(out); + + __vector unsigned char perm = vec_lvsl(0, (int*)0); + + perm = vec_xor(perm, vec_splat_u8(3)); + + __vector unsigned int swapped = vec_perm(reg, reg, perm); + + vec_st(swapped, 0, out_32); + } + + void store_be(byte out[]) const + { + u32bit* out_32 = reinterpret_cast<u32bit*>(out); + vec_st(reg, 0, out_32); + } + + void rotate_left(u32bit rot) + { + 
__vector unsigned int rot_vec = + (__vector unsigned int){rot, rot, rot, rot}; + + reg = vec_rl(reg, rot_vec); + } + + void rotate_right(u32bit rot) + { + rotate_left(32 - rot); + } + + void operator+=(const SIMD_Altivec& other) + { + reg = vec_add(reg, other.reg); + } + + SIMD_Altivec operator+(const SIMD_Altivec& other) const + { + return vec_add(reg, other.reg); + } + + void operator-=(const SIMD_Altivec& other) + { + reg = vec_sub(reg, other.reg); + } + + SIMD_Altivec operator-(const SIMD_Altivec& other) const + { + return vec_sub(reg, other.reg); + } + + void operator^=(const SIMD_Altivec& other) + { + reg = vec_xor(reg, other.reg); + } + + SIMD_Altivec operator^(const SIMD_Altivec& other) const + { + return vec_xor(reg, other.reg); + } + + void operator|=(const SIMD_Altivec& other) + { + reg = vec_or(reg, other.reg); + } + + void operator&=(const SIMD_Altivec& other) + { + reg = vec_and(reg, other.reg); + } + + SIMD_Altivec operator<<(u32bit shift) const + { + __vector unsigned int shift_vec = + (__vector unsigned int){shift, shift, shift, shift}; + + return vec_sl(reg, shift_vec); + } + + SIMD_Altivec operator>>(u32bit shift) const + { + __vector unsigned int shift_vec = + (__vector unsigned int){shift, shift, shift, shift}; + + return vec_sr(reg, shift_vec); + } + + SIMD_Altivec operator~() const + { + return vec_nor(reg, reg); + } + + static void transpose(SIMD_Altivec& B0, SIMD_Altivec& B1, + SIMD_Altivec& B2, SIMD_Altivec& B3) + { + __vector unsigned int T0 = vec_mergeh(B0.reg, B2.reg); + __vector unsigned int T1 = vec_mergel(B0.reg, B2.reg); + __vector unsigned int T2 = vec_mergeh(B1.reg, B3.reg); + __vector unsigned int T3 = vec_mergel(B1.reg, B3.reg); + + B0.reg = vec_mergeh(T0, T2); + B1.reg = vec_mergel(T0, T2); + B2.reg = vec_mergeh(T1, T3); + B3.reg = vec_mergel(T1, T3); + } + + private: + SIMD_Altivec(__vector unsigned int input) { reg = input; } + + __vector unsigned int reg; + }; + +} + +#endif -- cgit v1.2.3 From 
16adc6a5424aad033a18bae872586a52e54d7d8e Mon Sep 17 00:00:00 2001 From: lloyd Date: Wed, 28 Oct 2009 19:55:44 +0000 Subject: Add copyright + license on the new SIMD files --- src/utils/simd_32/simd_32.h | 3 +++ src/utils/simd_32/simd_altivec.h | 5 ++++- src/utils/simd_32/simd_scalar.h | 5 ++++- src/utils/simd_32/simd_sse.h | 3 +++ 4 files changed, 14 insertions(+), 2 deletions(-) (limited to 'src/utils/simd_32/simd_altivec.h') diff --git a/src/utils/simd_32/simd_32.h b/src/utils/simd_32/simd_32.h index d9fac0d3d..be426efd6 100644 --- a/src/utils/simd_32/simd_32.h +++ b/src/utils/simd_32/simd_32.h @@ -1,5 +1,8 @@ /** * Lightweight wrappers for SIMD operations +* (C) 2009 Jack Lloyd +* +* Distributed under the terms of the Botan license */ #ifndef BOTAN_SIMD_32_H__ diff --git a/src/utils/simd_32/simd_altivec.h b/src/utils/simd_32/simd_altivec.h index d6aaa699d..a925f6dbc 100644 --- a/src/utils/simd_32/simd_altivec.h +++ b/src/utils/simd_32/simd_altivec.h @@ -1,5 +1,8 @@ /** -* Altivec SIMD +* Lightweight wrappers around AltiVec for 32-bit operations +* (C) 2009 Jack Lloyd +* +* Distributed under the terms of the Botan license */ #ifndef BOTAN_SIMD_ALTIVEC_H__ diff --git a/src/utils/simd_32/simd_scalar.h b/src/utils/simd_32/simd_scalar.h index 4b81c183b..38f69c294 100644 --- a/src/utils/simd_32/simd_scalar.h +++ b/src/utils/simd_32/simd_scalar.h @@ -1,5 +1,8 @@ /** -* Scalar emulation of SIMD operations +* Scalar emulation of SIMD 32-bit operations +* (C) 2009 Jack Lloyd +* +* Distributed under the terms of the Botan license */ #ifndef BOTAN_SIMD_SCALAR_H__ diff --git a/src/utils/simd_32/simd_sse.h b/src/utils/simd_32/simd_sse.h index d32ffdc2e..267852554 100644 --- a/src/utils/simd_32/simd_sse.h +++ b/src/utils/simd_32/simd_sse.h @@ -1,5 +1,8 @@ /** * Lightweight wrappers for SSE2 intrinsics for 32-bit operations +* (C) 2009 Jack Lloyd +* +* Distributed under the terms of the Botan license */ #ifndef BOTAN_SIMD_SSE_H__ -- cgit v1.2.3 From 
e9cb78ddb6ad81e562fd466481b8d93e5144e7a6 Mon Sep 17 00:00:00 2001 From: lloyd Date: Thu, 29 Oct 2009 02:04:05 +0000 Subject: Use register writes in the Altivec code for stores because Altivec's handling for unaligned writes is messy as hell. If writes are batched this is somewhat easier to deal with (somewhat). --- src/utils/simd_32/simd_altivec.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'src/utils/simd_32/simd_altivec.h') diff --git a/src/utils/simd_32/simd_altivec.h b/src/utils/simd_32/simd_altivec.h index a925f6dbc..e63b9bdcc 100644 --- a/src/utils/simd_32/simd_altivec.h +++ b/src/utils/simd_32/simd_altivec.h @@ -65,21 +65,30 @@ class SIMD_Altivec void store_le(byte out[]) const { - u32bit* out_32 = reinterpret_cast(out); - - __vector unsigned char perm = vec_lvsl(0, (int*)0); + __vector unsigned char perm = vec_lvsl(0, (u32bit*)0); perm = vec_xor(perm, vec_splat_u8(3)); - __vector unsigned int swapped = vec_perm(reg, reg, perm); + union { + __vector unsigned int V; + u32bit R[4]; + } vec; + + vec.V = vec_perm(reg, reg, perm); - vec_st(swapped, 0, out_32); + Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]); } void store_be(byte out[]) const { - u32bit* out_32 = reinterpret_cast(out); - vec_st(reg, 0, out_32); + union { + __vector unsigned int V; + u32bit R[4]; + } vec; + + vec.V = reg; + + Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]); } void rotate_left(u32bit rot) -- cgit v1.2.3 From 511f670f32f920ace6352c4216a4a124dc9b01ac Mon Sep 17 00:00:00 2001 From: lloyd Date: Thu, 29 Oct 2009 04:40:49 +0000 Subject: Add new function enabled() to each of the SIMD_32 instantiations which returns true if they might plausibly work. AltiVec and SSE2 versions call into CPUID, scalar version always works. 
--- src/utils/simd_32/simd_altivec.h | 3 +++ src/utils/simd_32/simd_scalar.h | 2 ++ src/utils/simd_32/simd_sse.h | 5 ++++- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'src/utils/simd_32/simd_altivec.h') diff --git a/src/utils/simd_32/simd_altivec.h b/src/utils/simd_32/simd_altivec.h index e63b9bdcc..c3d1a76dc 100644 --- a/src/utils/simd_32/simd_altivec.h +++ b/src/utils/simd_32/simd_altivec.h @@ -9,6 +9,8 @@ #define BOTAN_SIMD_ALTIVEC_H__ #include <botan/loadstor.h> +#include <botan/cpuid.h> + #include <altivec.h> #undef vector @@ -17,6 +19,7 @@ namespace Botan { class SIMD_Altivec { public: + bool enabled() const { return CPUID::has_altivec(); } SIMD_Altivec(const u32bit B[4]) { diff --git a/src/utils/simd_32/simd_scalar.h b/src/utils/simd_32/simd_scalar.h index 38f69c294..606923289 100644 --- a/src/utils/simd_32/simd_scalar.h +++ b/src/utils/simd_32/simd_scalar.h @@ -15,6 +15,8 @@ namespace Botan { class SIMD_Scalar { public: + bool enabled() const { return true; } + SIMD_Scalar(const u32bit B[4]) { R0 = B[0]; diff --git a/src/utils/simd_32/simd_sse.h b/src/utils/simd_32/simd_sse.h index 267852554..fcfe6f203 100644 --- a/src/utils/simd_32/simd_sse.h +++ b/src/utils/simd_32/simd_sse.h @@ -8,7 +8,8 @@ #ifndef BOTAN_SIMD_SSE_H__ #define BOTAN_SIMD_SSE_H__ -#include <botan/types.h> +#include <botan/cpuid.h> + #include <emmintrin.h> namespace Botan { @@ -16,6 +17,8 @@ namespace Botan { class SIMD_SSE2 { public: + bool enabled() const { return CPUID::has_sse2(); } + SIMD_SSE2(const u32bit B[4]) { reg = _mm_loadu_si128((const __m128i*)B); -- cgit v1.2.3 From 1cb38d792c7784133f0c022f09c33d02098c9291 Mon Sep 17 00:00:00 2001 From: lloyd Date: Thu, 29 Oct 2009 04:57:59 +0000 Subject: Give each version of SIMD_32 a public bswap() --- src/utils/simd_32/simd_altivec.h | 9 +++++++++ src/utils/simd_32/simd_scalar.h | 9 +++++++++ src/utils/simd_32/simd_sse.h | 22 +++++++++++----------- 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'src/utils/simd_32/simd_altivec.h') diff --git a/src/utils/simd_32/simd_altivec.h 
b/src/utils/simd_32/simd_altivec.h index c3d1a76dc..e1aa62002 100644 --- a/src/utils/simd_32/simd_altivec.h +++ b/src/utils/simd_32/simd_altivec.h @@ -168,6 +168,15 @@ class SIMD_Altivec return vec_nor(reg, reg); } + SIMD_Altivec bswap() const + { + __vector unsigned char perm = vec_lvsl(0, (u32bit*)0); + + perm = vec_xor(perm, vec_splat_u8(3)); + + return SIMD_Altivec(vec_perm(reg, reg, perm)); + } + static void transpose(SIMD_Altivec& B0, SIMD_Altivec& B1, SIMD_Altivec& B2, SIMD_Altivec& B3) { diff --git a/src/utils/simd_32/simd_scalar.h b/src/utils/simd_32/simd_scalar.h index 606923289..5fc20b462 100644 --- a/src/utils/simd_32/simd_scalar.h +++ b/src/utils/simd_32/simd_scalar.h @@ -9,6 +9,7 @@ #define BOTAN_SIMD_SCALAR_H__ #include +#include namespace Botan { @@ -170,6 +171,14 @@ class SIMD_Scalar return SIMD_Scalar(~R0, ~R1, ~R2, ~R3); } + SIMD_Scalar bswap() const + { + return SIMD_Scalar(reverse_bytes(R0), + reverse_bytes(R1), + reverse_bytes(R2), + reverse_bytes(R3)); + } + static void transpose(SIMD_Scalar& B0, SIMD_Scalar& B1, SIMD_Scalar& B2, SIMD_Scalar& B3) { diff --git a/src/utils/simd_32/simd_sse.h b/src/utils/simd_32/simd_sse.h index fcfe6f203..c45d8032f 100644 --- a/src/utils/simd_32/simd_sse.h +++ b/src/utils/simd_32/simd_sse.h @@ -121,6 +121,17 @@ class SIMD_SSE2 return _mm_xor_si128(reg, all_ones); } + SIMD_SSE2 bswap() const + { + __m128i T = reg; + + T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); + T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); + + return _mm_or_si128(_mm_srli_epi16(T, 8), + _mm_slli_epi16(T, 8)); + } + static void transpose(SIMD_SSE2& B0, SIMD_SSE2& B1, SIMD_SSE2& B2, SIMD_SSE2& B3) { @@ -137,17 +148,6 @@ class SIMD_SSE2 private: SIMD_SSE2(__m128i in) { reg = in; } - SIMD_SSE2 bswap() const - { - __m128i T = reg; - - T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); - T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); - - return _mm_or_si128(_mm_srli_epi16(T, 8), - _mm_slli_epi16(T, 8)); - } - __m128i reg; }; 
-- cgit v1.2.3