Write functions to handle loading and saving words a block at a time, taking into account endian differences.

The current code does not take advantage of the knowledge of which endianness we are running on; an optimization suggested by Yves Jerschow is to use (unsafe) casts to speed up the load/store operations. This turns out to provide large performance increases (30% or more) in some cases. Even without the unsafe casts, this version seems to average a few percent faster, probably because the longer loading loops have been partially or fully unrolled. This also makes the code implementing low-level algorithms like ciphers and hashes a bit more succinct.
author: lloyd <[email protected]> 2007-05-31 03:25:19 +0000
committer: lloyd <[email protected]> 2007-05-31 03:25:19 +0000
commit: 55608e7dd1aa593944f967f2549564e4f42b654e (patch)
tree: ec2ec03a762a6dac82eb608487d5394370135624
parent: 22ecdc45a0efa4c444d0b7010b7cd743aeb68c57 (diff)
41 files changed, 449 insertions, 431 deletions
diff --git a/include/adler32.h b/include/adler32.h
index 4a6975e90..4beb3bdb6 100644
--- a/include/adler32.h
+++ b/include/adler32.h
@@ -25,7 +25,7 @@ class Adler32 : public HashFunction
       void add_data(const byte[], u32bit);
       void final_result(byte[]);
       void hash(const byte[], u32bit);
-      u32bit S1, S2;
+      u16bit S1, S2;
    };
 
 }
diff --git a/include/bit_ops.h b/include/bit_ops.h
index 34f7365b2..0636cac41 100644
--- a/include/bit_ops.h
+++ b/include/bit_ops.h
@@ -59,6 +59,146 @@ u32bit low_bit(u64bit);
 u32bit significant_bytes(u64bit);
 u32bit hamming_weight(u64bit);
 
+/*************************************************
+* Endian-Specific Word Loading Operations        *
+*************************************************/
+template<typename T>
+inline T load_be(const byte in[], u32bit off)
+   {
+   in += off * sizeof(T);
+   T out = 0;
+   for(u32bit j = 0; j != sizeof(T); j++)
+      out = (out << 8) | in[j];
+   return out;
+   }
+
+template<typename T>
+inline T load_le(const byte in[], u32bit off)
+   {
+   in += off * sizeof(T);
+   T out = 0;
+   for(u32bit j = 0; j != sizeof(T); j++)
+      out = (out << 8) | in[sizeof(T)-1-j];
+   return out;
+   }
+
+template<>
+inline u32bit load_be<u32bit>(const byte in[], u32bit off)
+   {
+   in += off * sizeof(u32bit);
+   return make_u32bit(in[0], in[1], in[2], in[3]);
+   }
+
+template<>
+inline u32bit load_le<u32bit>(const byte in[], u32bit off)
+   {
+   in += off * sizeof(u32bit);
+   return make_u32bit(in[3], in[2], in[1], in[0]);
+   }
+
+template<>
+inline u64bit load_be<u64bit>(const byte in[], u32bit off)
+   {
+   in += off * sizeof(u64bit);
+   return make_u64bit(in[0], in[1], in[2], in[3],
+                      in[4], in[5], in[6], in[7]);
+   }
+
+template<>
+inline u64bit load_le<u64bit>(const byte in[], u32bit off)
+   {
+   in += off * sizeof(u64bit);
+   return make_u64bit(in[7], in[6], in[5], in[4],
+                      in[3], in[2], in[1], in[0]);
+   }
+
+/*************************************************
+* Endian-Specific Word Storing Operations        *
+*************************************************/
+inline void store_be(u16bit in, byte out[2])
+   {
+   out[0] = get_byte(0, in);
+   out[1] = get_byte(1, in);
+   }
+
+inline void store_le(u16bit in, byte out[2])
+   {
+   out[0] = get_byte(1, in);
+   out[1] = get_byte(0, in);
+   }
+
+inline void store_be(u32bit in, byte out[4])
+   {
+   out[0] = get_byte(0, in);
+   out[1] = get_byte(1, in);
+   out[2] = get_byte(2, in);
+   out[3] = get_byte(3, in);
+   }
+
+inline void store_le(u32bit in, byte out[4])
+   {
+   out[0] = get_byte(3, in);
+   out[1] = get_byte(2, in);
+   out[2] = get_byte(1, in);
+   out[3] = get_byte(0, in);
+   }
+
+inline void store_be(u64bit in, byte out[8])
+   {
+   out[0] = get_byte(0, in);
+   out[1] = get_byte(1, in);
+   out[2] = get_byte(2, in);
+   out[3] = get_byte(3, in);
+   out[4] = get_byte(4, in);
+   out[5] = get_byte(5, in);
+   out[6] = get_byte(6, in);
+   out[7] = get_byte(7, in);
+   }
+
+inline void store_le(u64bit in, byte out[8])
+   {
+   out[0] = get_byte(7, in);
+   out[1] = get_byte(6, in);
+   out[2] = get_byte(5, in);
+   out[3] = get_byte(4, in);
+   out[4] = get_byte(3, in);
+   out[5] = get_byte(2, in);
+   out[6] = get_byte(1, in);
+   out[7] = get_byte(0, in);
+   }
+
+template<typename T>
+inline void store_le(byte out[], T a, T b)
+   {
+   store_le(a, out + (0 * sizeof(T)));
+   store_le(b, out + (1 * sizeof(T)));
+   }
+
+template<typename T>
+inline void store_be(byte out[], T a, T b)
+   {
+   store_be(a, out + (0 * sizeof(T)));
+   store_be(b, out + (1 * sizeof(T)));
+   }
+
+template<typename T>
+inline void store_le(byte out[], T a, T b, T c, T d)
+   {
+   store_le(a, out + (0 * sizeof(T)));
+   store_le(b, out + (1 * sizeof(T)));
+   store_le(c, out + (2 * sizeof(T)));
+   store_le(d, out + (3 * sizeof(T)));
+   }
+
+template<typename T>
+inline void store_be(byte out[], T a, T b, T c, T d)
+   {
+   store_be(a, out + (0 * sizeof(T)));
+   store_be(b, out + (1 * sizeof(T)));
+   store_be(c, out + (2 * sizeof(T)));
+   store_be(d, out + (3 * sizeof(T)));
+   }
+
 }
 
 #endif
diff --git a/include/botan.h b/include/botan.h
index 8ae97ace3..007bf411e 100644
--- a/include/botan.h
+++ b/include/botan.h
@@ -9,5 +9,4 @@
 #include <botan/lookup.h>
 #include <botan/rng.h>
 #include <botan/version.h>
-#include <botan/bit_ops.h>
 #include <botan/parsing.h>
diff --git a/include/cast256.h b/include/cast256.h
index d55344e99..152580cc5 100644
--- a/include/cast256.h
+++ b/include/cast256.h
@@ -24,11 +24,10 @@ class CAST_256 : public BlockCipher
       void enc(const byte[], byte[]) const;
       void dec(const byte[], byte[]) const;
       void key(const byte[], u32bit);
-      void round1(u32bit&, u32bit, u32bit, u32bit) const;
-      void round2(u32bit&, u32bit, u32bit, u32bit) const;
-      void round3(u32bit&, u32bit, u32bit, u32bit) const;
+
       static const u32bit KEY_MASK[192];
       static const byte   KEY_ROT[32];
+
       SecureBuffer<u32bit, 48> MK;
       SecureBuffer<byte, 48> RK;
    };
diff --git a/include/square.h b/include/square.h
index 4070f9c47..702fabad5 100644
--- a/include/square.h
+++ b/include/square.h
@@ -24,11 +24,13 @@ class Square : public BlockCipher
       void enc(const byte[], byte[]) const;
       void dec(const byte[], byte[]) const;
       void key(const byte[], u32bit);
+
       static void transform(u32bit[4]);
-      static byte mul(byte, byte);
+
       static const byte SE[256], SD[256], Log[256], ALog[255];
       static const u32bit TE0[256], TE1[256], TE2[256], TE3[256],
                           TD0[256], TD1[256], TD2[256], TD3[256];
+
       SecureBuffer<u32bit, 28> EK, DK;
       SecureBuffer<byte, 32> ME, MD;
    };
diff --git a/src/adler32.cpp b/src/adler32.cpp
index b27fa247b..51f6cbea3 100644
--- a/src/adler32.cpp
+++ b/src/adler32.cpp
@@ -65,10 +65,7 @@ void Adler32::add_data(const byte input[], u32bit length)
 *************************************************/
 void Adler32::final_result(byte output[])
    {
-   output[0] = get_byte(2, S2);
-   output[1] = get_byte(3, S2);
-   output[2] = get_byte(2, S1);
-   output[3] = get_byte(3, S1);
+   store_be(output, S2, S1);
    clear();
    }
 
diff --git a/src/aes.cpp b/src/aes.cpp
index f89ebf529..b345763cb 100644
--- a/src/aes.cpp
+++ b/src/aes.cpp
@@ -134,7 +134,8 @@ void AES::key(const byte key[], u32bit length)
 
    const u32bit X = length / 4;
    for(u32bit j = 0; j != X; ++j)
-      XEK[j] = make_u32bit(key[4*j], key[4*j+1], key[4*j+2], key[4*j+3]);
+      XEK[j] = load_be<u32bit>(key, j);
+
    for(u32bit j = X; j < 4*(ROUNDS+1); j += X)
       {
       XEK[j] = XEK[j-X] ^ S(rotate_left(XEK[j-1], 8)) ^ RC[(j-X)/X];
diff --git a/src/blowfish.cpp b/src/blowfish.cpp
index 384a12461..8fd43ee7e 100644
--- a/src/blowfish.cpp
+++ b/src/blowfish.cpp
@@ -13,8 +13,8 @@ namespace Botan {
 *************************************************/
 void Blowfish::enc(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0);
+   u32bit R = load_be<u32bit>(in, 1);
 
    for(u32bit j = 0; j != 16; j += 2)
       {
@@ -29,10 +29,7 @@ void Blowfish::enc(const byte in[], byte out[]) const
 
    L ^= P[16]; R ^= P[17];
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
@@ -40,8 +37,8 @@ void Blowfish::enc(const byte in[], byte out[]) const
 *************************************************/
 void Blowfish::dec(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0);
+   u32bit R = load_be<u32bit>(in, 1);
 
    for(u32bit j = 17; j != 1; j -= 2)
       {
@@ -56,10 +53,7 @@ void Blowfish::dec(const byte in[], byte out[]) const
 
    L ^= P[1]; R ^= P[0];
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
@@ -68,9 +62,11 @@ void Blowfish::dec(const byte in[], byte out[]) const
 void Blowfish::key(const byte key[], u32bit length)
    {
    clear();
+
    for(u32bit j = 0, k = 0; j != 18; ++j, k += 4)
       P[j] ^= make_u32bit(key[(k  ) % length], key[(k+1) % length],
-                             key[(k+2) % length], key[(k+3) % length]);
+                          key[(k+2) % length], key[(k+3) % length]);
+
    u32bit L = 0, R = 0;
    generate_sbox(P,  18,  L, R);
    generate_sbox(S1, 256, L, R);
diff --git a/src/cast128.cpp b/src/cast128.cpp
index ec87afb08..481897810 100644
--- a/src/cast128.cpp
+++ b/src/cast128.cpp
@@ -47,8 +47,8 @@ inline void R3(u32bit& L, u32bit R, u32bit MK, u32bit RK)
 *************************************************/
 void CAST_128::enc(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0);
+   u32bit R = load_be<u32bit>(in, 1);
 
    R1(L, R, MK[ 0], RK[ 0]);
    R2(R, L, MK[ 1], RK[ 1]);
@@ -67,10 +67,7 @@ void CAST_128::enc(const byte in[], byte out[]) const
    R3(L, R, MK[14], RK[14]);
    R1(R, L, MK[15], RK[15]);
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
@@ -78,8 +75,8 @@ void CAST_128::enc(const byte in[], byte out[]) const
 *************************************************/
 void CAST_128::dec(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0);
+   u32bit R = load_be<u32bit>(in, 1);
 
    R1(L, R, MK[15], RK[15]);
    R3(R, L, MK[14], RK[14]);
@@ -98,10 +95,7 @@ void CAST_128::dec(const byte in[], byte out[]) const
    R2(L, R, MK[ 1], RK[ 1]);
    R1(R, L, MK[ 0], RK[ 0]);
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
diff --git a/src/cast256.cpp b/src/cast256.cpp
index 21ccbea63..1064ed0de 100644
--- a/src/cast256.cpp
+++ b/src/cast256.cpp
@@ -8,15 +8,50 @@
 
 namespace Botan {
 
+namespace {
+
+/*************************************************
+* CAST-256 Round Type 1                          *
+*************************************************/
+void round1(u32bit& out, u32bit in, u32bit mask, u32bit rot)
+   {
+   u32bit temp = rotate_left(mask + in, rot);
+   out  ^= (CAST_SBOX1[get_byte(0, temp)] ^ CAST_SBOX2[get_byte(1, temp)]) -
+            CAST_SBOX3[get_byte(2, temp)] + CAST_SBOX4[get_byte(3, temp)];
+   }
+
+/*************************************************
+* CAST-256 Round Type 2                          *
+*************************************************/
+void round2(u32bit& out, u32bit in, u32bit mask, u32bit rot)
+   {
+   u32bit temp = rotate_left(mask ^ in, rot);
+   out  ^= (CAST_SBOX1[get_byte(0, temp)]  - CAST_SBOX2[get_byte(1, temp)] +
+            CAST_SBOX3[get_byte(2, temp)]) ^ CAST_SBOX4[get_byte(3, temp)];
+   }
+
+/*************************************************
+* CAST-256 Round Type 3                          *
+*************************************************/
+void round3(u32bit& out, u32bit in, u32bit mask, u32bit rot)
+   {
+   u32bit temp = rotate_left(mask - in, rot);
+   out  ^= ((CAST_SBOX1[get_byte(0, temp)]  + CAST_SBOX2[get_byte(1, temp)]) ^
+             CAST_SBOX3[get_byte(2, temp)]) - CAST_SBOX4[get_byte(3, temp)];
+   }
+
+}
+
 /*************************************************
 * CAST-256 Encryption                            *
 *************************************************/
 void CAST_256::enc(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[ 0], in[ 1], in[ 2], in[ 3]),
-          B = make_u32bit(in[ 4], in[ 5], in[ 6], in[ 7]),
-          C = make_u32bit(in[ 8], in[ 9], in[10], in[11]),
-          D = make_u32bit(in[12], in[13], in[14], in[15]);
+   u32bit A = load_be<u32bit>(in, 0);
+   u32bit B = load_be<u32bit>(in, 1);
+   u32bit C = load_be<u32bit>(in, 2);
+   u32bit D = load_be<u32bit>(in, 3);
+
    round1(C, D, MK[ 0], RK[ 0]); round2(B, C, MK[ 1], RK[ 1]);
    round3(A, B, MK[ 2], RK[ 2]); round1(D, A, MK[ 3], RK[ 3]);
    round1(C, D, MK[ 4], RK[ 4]); round2(B, C, MK[ 5], RK[ 5]);
@@ -41,14 +76,8 @@ void CAST_256::enc(const byte in[], byte out[]) const
    round2(B, C, MK[41], RK[41]); round1(C, D, MK[40], RK[40]);
    round1(D, A, MK[47], RK[47]); round3(A, B, MK[46], RK[46]);
    round2(B, C, MK[45], RK[45]); round1(C, D, MK[44], RK[44]);
-   out[ 0] = get_byte(0, A); out[ 1] = get_byte(1, A);
-   out[ 2] = get_byte(2, A); out[ 3] = get_byte(3, A);
-   out[ 4] = get_byte(0, B); out[ 5] = get_byte(1, B);
-   out[ 6] = get_byte(2, B); out[ 7] = get_byte(3, B);
-   out[ 8] = get_byte(0, C); out[ 9] = get_byte(1, C);
-   out[10] = get_byte(2, C); out[11] = get_byte(3, C);
-   out[12] = get_byte(0, D); out[13] = get_byte(1, D);
-   out[14] = get_byte(2, D); out[15] = get_byte(3, D);
+
+   store_be(out, A, B, C, D);
    }
 
 /*************************************************
@@ -56,10 +85,11 @@ void CAST_256::enc(const byte in[], byte out[]) const
 *************************************************/
 void CAST_256::dec(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[ 0], in[ 1], in[ 2], in[ 3]),
-          B = make_u32bit(in[ 4], in[ 5], in[ 6], in[ 7]),
-          C = make_u32bit(in[ 8], in[ 9], in[10], in[11]),
-          D = make_u32bit(in[12], in[13], in[14], in[15]);
+   u32bit A = load_be<u32bit>(in, 0);
+   u32bit B = load_be<u32bit>(in, 1);
+   u32bit C = load_be<u32bit>(in, 2);
+   u32bit D = load_be<u32bit>(in, 3);
+
    round1(C, D, MK[44], RK[44]); round2(B, C, MK[45], RK[45]);
    round3(A, B, MK[46], RK[46]); round1(D, A, MK[47], RK[47]);
    round1(C, D, MK[40], RK[40]); round2(B, C, MK[41], RK[41]);
@@ -84,44 +114,8 @@ void CAST_256::dec(const byte in[], byte out[]) const
    round2(B, C, MK[ 5], RK[ 5]); round1(C, D, MK[ 4], RK[ 4]);
    round1(D, A, MK[ 3], RK[ 3]); round3(A, B, MK[ 2], RK[ 2]);
    round2(B, C, MK[ 1], RK[ 1]); round1(C, D, MK[ 0], RK[ 0]);
-   out[ 0] = get_byte(0, A); out[ 1] = get_byte(1, A);
-   out[ 2] = get_byte(2, A); out[ 3] = get_byte(3, A);
-   out[ 4] = get_byte(0, B); out[ 5] = get_byte(1, B);
-   out[ 6] = get_byte(2, B); out[ 7] = get_byte(3, B);
-   out[ 8] = get_byte(0, C); out[ 9] = get_byte(1, C);
-   out[10] = get_byte(2, C); out[11] = get_byte(3, C);
-   out[12] = get_byte(0, D); out[13] = get_byte(1, D);
-   out[14] = get_byte(2, D); out[15] = get_byte(3, D);
-   }
 
-/*************************************************
-* CAST-256 Round Type 1                          *
-*************************************************/
-void CAST_256::round1(u32bit& out, u32bit in, u32bit mask, u32bit rot) const
-   {
-   u32bit temp = rotate_left(mask + in, rot);
-   out  ^= (CAST_SBOX1[get_byte(0, temp)] ^ CAST_SBOX2[get_byte(1, temp)]) -
-            CAST_SBOX3[get_byte(2, temp)] + CAST_SBOX4[get_byte(3, temp)];
-   }
-
-/*************************************************
-* CAST-256 Round Type 2                          *
-*************************************************/
-void CAST_256::round2(u32bit& out, u32bit in, u32bit mask, u32bit rot) const
-   {
-   u32bit temp = rotate_left(mask ^ in, rot);
-   out  ^= (CAST_SBOX1[get_byte(0, temp)]  - CAST_SBOX2[get_byte(1, temp)] +
-            CAST_SBOX3[get_byte(2, temp)]) ^ CAST_SBOX4[get_byte(3, temp)];
-   }
-
-/*************************************************
-* CAST-256 Round Type 3                          *
-*************************************************/
-void CAST_256::round3(u32bit& out, u32bit in, u32bit mask, u32bit rot) const
-   {
-   u32bit temp = rotate_left(mask - in, rot);
-   out  ^= ((CAST_SBOX1[get_byte(0, temp)]  + CAST_SBOX2[get_byte(1, temp)]) ^
-             CAST_SBOX3[get_byte(2, temp)]) - CAST_SBOX4[get_byte(3, temp)];
+   store_be(out, A, B, C, D);
    }
 
 /*************************************************
diff --git a/src/crc32.cpp b/src/crc32.cpp
index b10a2eebe..e897cbc02 100644
--- a/src/crc32.cpp
+++ b/src/crc32.cpp
@@ -93,8 +93,7 @@ void CRC32::add_data(const byte input[], u32bit length)
 void CRC32::final_result(byte output[])
    {
    crc ^= 0xFFFFFFFF;
-   for(u32bit j = 0; j != 4; ++j)
-      output[j] = get_byte(j, crc);
+   store_be(crc, output);
    clear();
    }
 
diff --git a/src/des.cpp b/src/des.cpp
index 47bdb8f56..e8f173c5c 100644
--- a/src/des.cpp
+++ b/src/des.cpp
@@ -13,17 +13,13 @@ namespace Botan {
 *************************************************/
 void DES::enc(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
 
    IP(L, R);
    raw_encrypt(L, R);
    FP(L, R);
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
@@ -31,17 +27,13 @@ void DES::enc(const byte in[], byte out[]) const
 *************************************************/
 void DES::dec(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
 
    IP(L, R);
    raw_decrypt(L, R);
    FP(L, R);
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
@@ -193,8 +185,7 @@ void DES::key(const byte key[], u32bit)
 *************************************************/
 void TripleDES::enc(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
 
    DES::IP(L, R);
    des1.raw_encrypt(L, R);
@@ -202,10 +193,7 @@ void TripleDES::enc(const byte in[], byte out[]) const
    des3.raw_encrypt(L, R);
    DES::FP(L, R);
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
@@ -213,8 +201,7 @@ void TripleDES::enc(const byte in[], byte out[]) const
 *************************************************/
 void TripleDES::dec(const byte in[], byte out[]) const
    {
-   u32bit L = make_u32bit(in[0], in[1], in[2], in[3]),
-          R = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
 
    DES::IP(L, R);
    des3.raw_decrypt(L, R);
@@ -222,10 +209,7 @@ void TripleDES::dec(const byte in[], byte out[]) const
    des1.raw_decrypt(L, R);
    DES::FP(L, R);
 
-   out[0] = get_byte(0, R); out[1] = get_byte(1, R);
-   out[2] = get_byte(2, R); out[3] = get_byte(3, R);
-   out[4] = get_byte(0, L); out[5] = get_byte(1, L);
-   out[6] = get_byte(2, L); out[7] = get_byte(3, L);
+   store_be(out, R, L);
    }
 
 /*************************************************
diff --git a/src/fork256.cpp b/src/fork256.cpp
index abd8e98b3..af2d7a756 100644
--- a/src/fork256.cpp
+++ b/src/fork256.cpp
@@ -62,7 +62,7 @@ void FORK_256::hash(const byte input[])
    H1 = H2 = H3 = H4 = digest[7];
 
    for(u32bit j = 0; j != 16; ++j)
-      M[j] = make_u32bit(input[4*j], input[4*j+1], input[4*j+2], input[4*j+3]);
+      M[j] = load_be<u32bit>(input, j);
 
    step(A1, B1, C1, D1, E1, F1, G1, H1, M[ 0], M[ 1], DELTA[ 0], DELTA[ 1]);
    step(A2, B2, C2, D2, E2, F2, G2, H2, M[14], M[15], DELTA[15], DELTA[14]);
@@ -119,8 +119,8 @@ void FORK_256::hash(const byte input[])
 *************************************************/
 void FORK_256::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(j % 4, digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_be(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/gost.cpp b/src/gost.cpp
index a8a295c4b..d999d0d2d 100644
--- a/src/gost.cpp
+++ b/src/gost.cpp
@@ -13,8 +13,7 @@ namespace Botan {
 *************************************************/
 void GOST::enc(const byte in[], byte out[]) const
    {
-   u32bit N1 = make_u32bit(in[3], in[2], in[1], in[0]),
-          N2 = make_u32bit(in[7], in[6], in[5], in[4]);
+   u32bit N1 = load_le<u32bit>(in, 0), N2 = load_le<u32bit>(in, 1);
 
    for(u32bit j = 0; j != 32; j += 2)
       {
@@ -29,10 +28,7 @@ void GOST::enc(const byte in[], byte out[]) const
             SBOX3[get_byte(2, T0)] | SBOX4[get_byte(3, T0)];
       }
 
-   out[0] = get_byte(3, N2); out[1] = get_byte(2, N2);
-   out[2] = get_byte(1, N2); out[3] = get_byte(0, N2);
-   out[4] = get_byte(3, N1); out[5] = get_byte(2, N1);
-   out[6] = get_byte(1, N1); out[7] = get_byte(0, N1);
+   store_le(out, N2, N1);
    }
 
 /*************************************************
@@ -40,8 +36,7 @@ void GOST::enc(const byte in[], byte out[]) const
 *************************************************/
 void GOST::dec(const byte in[], byte out[]) const
    {
-   u32bit N1 = make_u32bit(in[3], in[2], in[1], in[0]),
-          N2 = make_u32bit(in[7], in[6], in[5], in[4]);
+   u32bit N1 = load_le<u32bit>(in, 0), N2 = load_le<u32bit>(in, 1);
 
    for(u32bit j = 0; j != 32; j += 2)
       {
@@ -56,10 +51,7 @@ void GOST::dec(const byte in[], byte out[]) const
             SBOX3[get_byte(2, T0)] | SBOX4[get_byte(3, T0)];
       }
 
-   out[0] = get_byte(3, N2); out[1] = get_byte(2, N2);
-   out[2] = get_byte(1, N2); out[3] = get_byte(0, N2);
-   out[4] = get_byte(3, N1); out[5] = get_byte(2, N1);
-   out[6] = get_byte(1, N1); out[7] = get_byte(0, N1);
+   store_le(out, N2, N1);
    }
 
 /*************************************************
@@ -69,7 +61,7 @@ void GOST::key(const byte key[], u32bit)
    {
    for(u32bit j = 0; j != 8; ++j)
       {
-      u32bit K = make_u32bit(key[4*j+3], key[4*j+2], key[4*j+1], key[4*j]);
+      u32bit K = load_le<u32bit>(key, j);
       EK[j] = EK[j+8] = EK[j+16] = K;
       }
 
diff --git a/src/has160.cpp b/src/has160.cpp
index ceb5a8b6b..7c6721903 100644
--- a/src/has160.cpp
+++ b/src/has160.cpp
@@ -58,7 +58,7 @@ inline void F4(u32bit A, u32bit& B, u32bit C, u32bit D, u32bit& E,
 void HAS_160::hash(const byte input[])
    {
    for(u32bit j = 0; j != 16; ++j)
-      X[j] = make_u32bit(input[4*j+3], input[4*j+2], input[4*j+1], input[4*j]);
+      X[j] = load_le<u32bit>(input, j);
 
    u32bit A = digest[0], B = digest[1], C = digest[2],
           D = digest[3], E = digest[4];
@@ -120,8 +120,8 @@ void HAS_160::hash(const byte input[])
 *************************************************/
 void HAS_160::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(3 - (j % 4), digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_le(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/idea.cpp b/src/idea.cpp
index 1f2facbb8..ed142ca9b 100644
--- a/src/idea.cpp
+++ b/src/idea.cpp
@@ -33,8 +33,10 @@ inline void mul(u16bit& a, u16bit b)
 *************************************************/
 void IDEA::enc(const byte in[], byte out[]) const
    {
-   u16bit X1 = make_u16bit(in[0], in[1]), X2 = make_u16bit(in[2], in[3]),
-          X3 = make_u16bit(in[4], in[5]), X4 = make_u16bit(in[6], in[7]);
+   u16bit X1 = load_be<u16bit>(in, 0);
+   u16bit X2 = load_be<u16bit>(in, 1);
+   u16bit X3 = load_be<u16bit>(in, 2);
+   u16bit X4 = load_be<u16bit>(in, 3);
 
    for(u32bit j = 0; j != 8; ++j)
       {
@@ -57,10 +59,7 @@ void IDEA::enc(const byte in[], byte out[]) const
 
    mul(X1, EK[48]); X2 += EK[50]; X3 += EK[49]; mul(X4, EK[51]);
 
-   out[0] = get_byte(0, X1); out[1] = get_byte(1, X1);
-   out[2] = get_byte(0, X3); out[3] = get_byte(1, X3);
-   out[4] = get_byte(0, X2); out[5] = get_byte(1, X2);
-   out[6] = get_byte(0, X4); out[7] = get_byte(1, X4);
+   store_be(out, X1, X3, X2, X4);
    }
 
 /*************************************************
@@ -68,8 +67,11 @@ void IDEA::enc(const byte in[], byte out[]) const
 *************************************************/
 void IDEA::dec(const byte in[], byte out[]) const
    {
-   u16bit X1 = make_u16bit(in[0], in[1]), X2 = make_u16bit(in[2], in[3]),
-          X3 = make_u16bit(in[4], in[5]), X4 = make_u16bit(in[6], in[7]);
+   u16bit X1 = load_be<u16bit>(in, 0);
+   u16bit X2 = load_be<u16bit>(in, 1);
+   u16bit X3 = load_be<u16bit>(in, 2);
+   u16bit X4 = load_be<u16bit>(in, 3);
+
    for(u32bit j = 0; j != 8; ++j)
       {
       mul(X1, DK[6*j+0]);
@@ -91,10 +93,7 @@ void IDEA::dec(const byte in[], byte out[]) const
 
    mul(X1, DK[48]); X2 += DK[50]; X3 += DK[49]; mul(X4, DK[51]);
 
-   out[0] = get_byte(0, X1); out[1] = get_byte(1, X1);
-   out[2] = get_byte(0, X3); out[3] = get_byte(1, X3);
-   out[4] = get_byte(0, X2); out[5] = get_byte(1, X2);
-   out[6] = get_byte(0, X4); out[7] = get_byte(1, X4);
+   store_be(out, X1, X3, X2, X4);
    }
 
 /*************************************************
@@ -125,17 +124,20 @@ u16bit IDEA::mul_inv(u16bit x)
 void IDEA::key(const byte key[], u32bit)
    {
    for(u32bit j = 0; j != 8; ++j)
-      EK[j] = make_u16bit(key[2*j], key[2*j+1]);
+      EK[j] = load_be<u16bit>(key, j);
+
    for(u32bit j = 1, k = 8, offset = 0; k != 52; j %= 8, ++j, ++k)
       {
       EK[j+7+offset] = (u16bit)((EK[(j     % 8) + offset] << 9) |
                                 (EK[((j+1) % 8) + offset] >> 7));
       offset += (j == 8) ? 8 : 0;
       }
+
    DK[51] = mul_inv(EK[3]);
    DK[50] = (u16bit)-EK[2];
    DK[49] = (u16bit)-EK[1];
    DK[48] = mul_inv(EK[0]);
+
    for(u32bit j = 1, k = 4, counter = 47; j != 8; ++j, k += 6)
       {
       DK[counter--] = EK[k+1];
@@ -145,6 +147,7 @@ void IDEA::key(const byte key[], u32bit)
       DK[counter--] = (u16bit)-EK[k+4];
       DK[counter--] = mul_inv(EK[k+2]);
       }
+
    DK[5] = EK[47];
    DK[4] = EK[46];
    DK[3] = mul_inv(EK[51]);
diff --git a/src/kasumi.cpp b/src/kasumi.cpp
index 8730c45fc..43eff7311 100644
--- a/src/kasumi.cpp
+++ b/src/kasumi.cpp
@@ -33,8 +33,10 @@ u16bit FI(u16bit I, u16bit K)
 *************************************************/
 void KASUMI::enc(const byte in[], byte out[]) const
    {
-   u16bit B0 = make_u16bit(in[0], in[1]), B1 = make_u16bit(in[2], in[3]),
-          B2 = make_u16bit(in[4], in[5]), B3 = make_u16bit(in[6], in[7]);
+   u16bit B0 = load_be<u16bit>(in, 0);
+   u16bit B1 = load_be<u16bit>(in, 1);
+   u16bit B2 = load_be<u16bit>(in, 2);
+   u16bit B3 = load_be<u16bit>(in, 3);
 
    for(u32bit j = 0; j != 8; j += 2)
       {
@@ -61,10 +63,7 @@ void KASUMI::enc(const byte in[], byte out[]) const
       B1 ^= R;
       }
 
-   out[0] = get_byte(0, B0); out[1] = get_byte(1, B0);
-   out[2] = get_byte(0, B1); out[3] = get_byte(1, B1);
-   out[4] = get_byte(0, B2); out[5] = get_byte(1, B2);
-   out[6] = get_byte(0, B3); out[7] = get_byte(1, B3);
+   store_be(out, B0, B1, B2, B3);
    }
 
 /*************************************************
@@ -72,8 +71,10 @@ void KASUMI::enc(const byte in[], byte out[]) const
 *************************************************/
 void KASUMI::dec(const byte in[], byte out[]) const
    {
-   u16bit B0 = make_u16bit(in[0], in[1]), B1 = make_u16bit(in[2], in[3]),
-          B2 = make_u16bit(in[4], in[5]), B3 = make_u16bit(in[6], in[7]);
+   u16bit B0 = load_be<u16bit>(in, 0);
+   u16bit B1 = load_be<u16bit>(in, 1);
+   u16bit B2 = load_be<u16bit>(in, 2);
+   u16bit B3 = load_be<u16bit>(in, 3);
 
    for(u32bit j = 0; j != 8; j += 2)
       {
@@ -102,10 +103,7 @@ void KASUMI::dec(const byte in[], byte out[]) const
       B3 ^= R;
       }
 
-   out[0] = get_byte(0, B0); out[1] = get_byte(1, B0);
-   out[2] = get_byte(0, B1); out[3] = get_byte(1, B1);
-   out[4] = get_byte(0, B2); out[5] = get_byte(1, B2);
-   out[6] = get_byte(0, B3); out[7] = get_byte(1, B3);
+   store_be(out, B0, B1, B2, B3);
    }
 
 /*************************************************
@@ -119,7 +117,7 @@ void KASUMI::key(const byte key[], u32bit)
    SecureBuffer<u16bit, 16> K;
    for(u32bit j = 0; j != 8; ++j)
       {
-      K[j] = make_u16bit(key[2*j], key[2*j+1]);
+      K[j] = load_be<u16bit>(key, j);
       K[j+8] = K[j] ^ RC[j];
       }
 
diff --git a/src/mars.cpp b/src/mars.cpp
index cdc992da5..34327d717 100644
--- a/src/mars.cpp
+++ b/src/mars.cpp
@@ -49,10 +49,10 @@ u32bit gen_mask(u32bit input)
 *************************************************/
 void MARS::enc(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]) + EK[0],
-          B = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]) + EK[1],
-          C = make_u32bit(in[11], in[10], in[ 9], in[ 8]) + EK[2],
-          D = make_u32bit(in[15], in[14], in[13], in[12]) + EK[3];
+   u32bit A = load_le<u32bit>(in, 0) + EK[0];
+   u32bit B = load_le<u32bit>(in, 1) + EK[1];
+   u32bit C = load_le<u32bit>(in, 2) + EK[2];
+   u32bit D = load_le<u32bit>(in, 3) + EK[3];
 
    forward_mix(A, B, C, D);
 
@@ -78,14 +78,7 @@ void MARS::enc(const byte in[], byte out[]) const
 
    A -= EK[36]; B -= EK[37]; C -= EK[38]; D -= EK[39];
 
-   out[ 0] = get_byte(3, A); out[ 1] = get_byte(2, A);
-   out[ 2] = get_byte(1, A); out[ 3] = get_byte(0, A);
-   out[ 4] = get_byte(3, B); out[ 5] = get_byte(2, B);
-   out[ 6] = get_byte(1, B); out[ 7] = get_byte(0, B);
-   out[ 8] = get_byte(3, C); out[ 9] = get_byte(2, C);
-   out[10] = get_byte(1, C); out[11] = get_byte(0, C);
-   out[12] = get_byte(3, D); out[13] = get_byte(2, D);
-   out[14] = get_byte(1, D); out[15] = get_byte(0, D);
+   store_le(out, A, B, C, D);
    }
 
 /*************************************************
@@ -93,10 +86,10 @@ void MARS::enc(const byte in[], byte out[]) const
 *************************************************/
 void MARS::dec(const byte in[], byte out[]) const
    {
-   u32bit D = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]) + EK[36],
-          C = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]) + EK[37],
-          B = make_u32bit(in[11], in[10], in[ 9], in[ 8]) + EK[38],
-          A = make_u32bit(in[15], in[14], in[13], in[12]) + EK[39];
+   u32bit A = load_le<u32bit>(in, 3) + EK[39];
+   u32bit B = load_le<u32bit>(in, 2) + EK[38];
+   u32bit C = load_le<u32bit>(in, 1) + EK[37];
+   u32bit D = load_le<u32bit>(in, 0) + EK[36];
 
    forward_mix(A, B, C, D);
 
@@ -122,14 +115,7 @@ void MARS::dec(const byte in[], byte out[]) const
 
    A -= EK[3]; B -= EK[2]; C -= EK[1]; D -= EK[0];
 
-   out[ 0] = get_byte(3, D); out[ 1] = get_byte(2, D);
-   out[ 2] = get_byte(1, D); out[ 3] = get_byte(0, D);
-   out[ 4] = get_byte(3, C); out[ 5] = get_byte(2, C);
-   out[ 6] = get_byte(1, C); out[ 7] = get_byte(0, C);
-   out[ 8] = get_byte(3, B); out[ 9] = get_byte(2, B);
-   out[10] = get_byte(1, B); out[11] = get_byte(0, B);
-   out[12] = get_byte(3, A); out[13] = get_byte(2, A);
-   out[14] = get_byte(1, A); out[15] = get_byte(0, A);
+   store_le(out, D, C, B, A);
    }
 
 /*************************************************
@@ -230,7 +216,7 @@ void MARS::key(const byte key[], u32bit length)
    {
    SecureBuffer<u32bit, 15> T;
    for(u32bit j = 0; j != length / 4; ++j)
-      T[j] = make_u32bit(key[4*j+3], key[4*j+2], key[4*j+1], key[4*j]);
+      T[j] = load_le<u32bit>(key, j);
    T[length / 4] = length / 4;
 
    for(u32bit j = 0; j != 4; ++j)
diff --git a/src/md4.cpp b/src/md4.cpp
index a0a32b179..b4cc4ce17 100644
--- a/src/md4.cpp
+++ b/src/md4.cpp
@@ -45,7 +45,7 @@ inline void HH(u32bit& A, u32bit B, u32bit C, u32bit D, u32bit M, byte S)
 void MD4::hash(const byte input[])
    {
    for(u32bit j = 0; j != 16; ++j)
-      M[j] = make_u32bit(input[4*j+3], input[4*j+2], input[4*j+1], input[4*j]);
+      M[j] = load_le<u32bit>(input, j);
 
    u32bit A = digest[0], B = digest[1], C = digest[2], D = digest[3];
 
@@ -78,8 +78,8 @@ void MD4::hash(const byte input[])
 *************************************************/
 void MD4::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(3 - (j % 4), digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_le(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/md5.cpp b/src/md5.cpp
index 51cd7fe82..ec3703de5 100644
--- a/src/md5.cpp
+++ b/src/md5.cpp
@@ -58,7 +58,7 @@ inline void II(u32bit& A, u32bit B, u32bit C, u32bit D, u32bit msg,
 void MD5::hash(const byte input[])
    {
    for(u32bit j = 0; j != 16; ++j)
-      M[j] = make_u32bit(input[4*j+3], input[4*j+2], input[4*j+1], input[4*j]);
+      M[j] = load_le<u32bit>(input, j);
 
    u32bit A = digest[0], B = digest[1], C = digest[2], D = digest[3];
 
@@ -106,8 +106,8 @@ void MD5::hash(const byte input[])
 *************************************************/
 void MD5::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(3 - (j % 4), digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_le(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/mdx_hash.cpp b/src/mdx_hash.cpp
index 8fbad2580..f8020e2dc 100644
--- a/src/mdx_hash.cpp
+++ b/src/mdx_hash.cpp
@@ -90,11 +90,13 @@ void MDx_HashFunction::write_count(byte out[])
    {
    if(COUNT_SIZE < 8)
       throw Invalid_State("MDx_HashFunction::write_count: COUNT_SIZE < 8");
-   for(u32bit j = 0; j != 8; ++j)
-      {
-      const u32bit choose = (BIG_BYTE_ENDIAN ? (j % 8) : (7 - (j % 8)));
-      out[j+COUNT_SIZE-8] = get_byte(choose, 8 * count);
-      }
+
+   const u64bit bit_count = count * 8;
+
+   if(BIG_BYTE_ENDIAN)
+      store_be(bit_count, out + COUNT_SIZE - 8);
+   else
+      store_le(bit_count, out + COUNT_SIZE - 8);
    }
 
 }
diff --git a/src/misty1.cpp b/src/misty1.cpp
index f0f54c476..4df3ecc76 100644
--- a/src/misty1.cpp
+++ b/src/misty1.cpp
@@ -30,8 +30,10 @@ u16bit FI(u16bit input, u16bit key7, u16bit key9)
 *************************************************/
 void MISTY1::enc(const byte in[], byte out[]) const
    {
-   u16bit B0 = make_u16bit(in[0], in[1]), B1 = make_u16bit(in[2], in[3]),
-          B2 = make_u16bit(in[4], in[5]), B3 = make_u16bit(in[6], in[7]);
+   u16bit B0 = load_be<u16bit>(in, 0);
+   u16bit B1 = load_be<u16bit>(in, 1);
+   u16bit B2 = load_be<u16bit>(in, 2);
+   u16bit B3 = load_be<u16bit>(in, 3);
 
    for(u32bit j = 0; j != 12; j += 3)
       {
@@ -64,10 +66,7 @@ void MISTY1::enc(const byte in[], byte out[]) const
    B3 ^= B2 & EK[98];
    B2 ^= B3 | EK[99];
 
-   out[0] = get_byte(0, B2); out[1] = get_byte(1, B2);
-   out[2] = get_byte(0, B3); out[3] = get_byte(1, B3);
-   out[4] = get_byte(0, B0); out[5] = get_byte(1, B0);
-   out[6] = get_byte(0, B1); out[7] = get_byte(1, B1);
+   store_be(out, B2, B3, B0, B1);
    }
 
 /*************************************************
@@ -75,8 +74,10 @@ void MISTY1::enc(const byte in[], byte out[]) const
 *************************************************/
 void MISTY1::dec(const byte in[], byte out[]) const
    {
-   u16bit B0 = make_u16bit(in[4], in[5]), B1 = make_u16bit(in[6], in[7]),
-          B2 = make_u16bit(in[0], in[1]), B3 = make_u16bit(in[2], in[3]);
+   u16bit B0 = load_be<u16bit>(in, 2);
+   u16bit B1 = load_be<u16bit>(in, 3);
+   u16bit B2 = load_be<u16bit>(in, 0);
+   u16bit B3 = load_be<u16bit>(in, 1);
 
    for(u32bit j = 0; j != 12; j += 3)
       {
@@ -109,10 +110,7 @@ void MISTY1::dec(const byte in[], byte out[]) const
    B0 ^= B1 | DK[98];
    B1 ^= B0 & DK[99];
 
-   out[0] = get_byte(0, B0); out[1] = get_byte(1, B0);
-   out[2] = get_byte(0, B1); out[3] = get_byte(1, B1);
-   out[4] = get_byte(0, B2); out[5] = get_byte(1, B2);
-   out[6] = get_byte(0, B3); out[7] = get_byte(1, B3);
+   store_be(out, B0, B1, B2, B3);
    }
 
 /*************************************************
@@ -122,7 +120,8 @@ void MISTY1::key(const byte key[], u32bit length)
    {
    SecureBuffer<u16bit, 32> KS;
    for(u32bit j = 0; j != length / 2; ++j)
-      KS[j] = make_u16bit(key[2*j], key[2*j+1]);
+      KS[j] = load_be<u16bit>(key, j);
+
    for(u32bit j = 0; j != 8; ++j)
       {
       KS[j+ 8] = FI(KS[j], KS[(j+1) % 8] >> 9, KS[(j+1) % 8] & 0x1FF);
diff --git a/src/prf_x942.cpp b/src/prf_x942.cpp
index 6e67aab8b..fd29aecdd 100644
--- a/src/prf_x942.cpp
+++ b/src/prf_x942.cpp
@@ -21,9 +21,7 @@ namespace {
 MemoryVector<byte> encode_x942_int(u32bit n)
    {
    byte n_buf[4] = { 0 };
-   for(u32bit j = 0; j != 4; ++j)
-      n_buf[j] = get_byte(j, n);
-
+   store_be(n, n_buf);
    return DER_Encoder().encode(n_buf, 4, OCTET_STRING).get_contents();
    }
 
diff --git a/src/randpool.cpp b/src/randpool.cpp
index e041cf873..ed60b385a 100644
--- a/src/randpool.cpp
+++ b/src/randpool.cpp
@@ -9,6 +9,8 @@
 #include <botan/util.h>
 #include <algorithm>
 
+#include <assert.h>
+
 namespace Botan {
 
 namespace {
@@ -63,8 +65,7 @@ void Randpool::update_buffer()
    for(u32bit j = 0; j != counter.size(); ++j)
       if(++counter[j])
          break;
-   for(u32bit j = 0; j != 8; ++j)
-      counter[j+4] = get_byte(j, timestamp);
+   store_be(timestamp, counter + 4);
 
    SecureVector<byte> mac_val = randpool_prf(mac, GEN_OUTPUT,
                                              counter, counter.size());
diff --git a/src/rc2.cpp b/src/rc2.cpp
index fd6b4ccc6..e59e7d669 100644
--- a/src/rc2.cpp
+++ b/src/rc2.cpp
@@ -13,8 +13,10 @@ namespace Botan {
 *************************************************/
 void RC2::enc(const byte in[], byte out[]) const
    {
-   u16bit R0 = make_u16bit(in[1], in[0]), R1 = make_u16bit(in[3], in[2]),
-          R2 = make_u16bit(in[5], in[4]), R3 = make_u16bit(in[7], in[6]);
+   u16bit R0 = load_le<u16bit>(in, 0);
+   u16bit R1 = load_le<u16bit>(in, 1);
+   u16bit R2 = load_le<u16bit>(in, 2);
+   u16bit R3 = load_le<u16bit>(in, 3);
 
    for(u32bit j = 0; j != 16; ++j)
       {
@@ -39,10 +41,7 @@ void RC2::enc(const byte in[], byte out[]) const
          }
       }
 
-   out[0] = get_byte(1, R0); out[1] = get_byte(0, R0);
-   out[2] = get_byte(1, R1); out[3] = get_byte(0, R1);
-   out[4] = get_byte(1, R2); out[5] = get_byte(0, R2);
-   out[6] = get_byte(1, R3); out[7] = get_byte(0, R3);
+   store_le(out, R0, R1, R2, R3);
    }
 
 /*************************************************
@@ -50,8 +49,10 @@ void RC2::enc(const byte in[], byte out[]) const
 *************************************************/
 void RC2::dec(const byte in[], byte out[]) const
    {
-   u16bit R0 = make_u16bit(in[1], in[0]), R1 = make_u16bit(in[3], in[2]),
-          R2 = make_u16bit(in[5], in[4]), R3 = make_u16bit(in[7], in[6]);
+   u16bit R0 = load_le<u16bit>(in, 0);
+   u16bit R1 = load_le<u16bit>(in, 1);
+   u16bit R2 = load_le<u16bit>(in, 2);
+   u16bit R3 = load_le<u16bit>(in, 3);
 
    for(u32bit j = 0; j != 16; ++j)
       {
@@ -76,10 +77,7 @@ void RC2::dec(const byte in[], byte out[]) const
          }
       }
 
-   out[0] = get_byte(1, R0); out[1] = get_byte(0, R0);
-   out[2] = get_byte(1, R1); out[3] = get_byte(0, R1);
-   out[4] = get_byte(1, R2); out[5] = get_byte(0, R2);
-   out[6] = get_byte(1, R3); out[7] = get_byte(0, R3);
+   store_le(out, R0, R1, R2, R3);
    }
 
 /*************************************************
@@ -121,7 +119,7 @@ void RC2::key(const byte key[], u32bit length)
       L[j] = TABLE[L[j+1] ^ L[j+length]];
 
    for(u32bit j = 0; j != 64; ++j)
-      K[j] = make_u16bit(L[2*j+1], L[2*j]);
+      K[j] = load_le<u16bit>(L, j);
    }
 
 /*************************************************
diff --git a/src/rc5.cpp b/src/rc5.cpp
index 261529ea8..3e87dc8ab 100644
--- a/src/rc5.cpp
+++ b/src/rc5.cpp
@@ -15,8 +15,8 @@ namespace Botan {
 *************************************************/
 void RC5::enc(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[3], in[2], in[1], in[0]),
-          B = make_u32bit(in[7], in[6], in[5], in[4]);
+   u32bit A = load_le<u32bit>(in, 0), B = load_le<u32bit>(in, 1);
+
    A += S[0]; B += S[1];
    for(u32bit j = 0; j != ROUNDS; j += 4)
       {
@@ -29,10 +29,8 @@ void RC5::enc(const byte in[], byte out[]) const
       A = rotate_left(A ^ B, B % 32) + S[2*j+8];
       B = rotate_left(B ^ A, A % 32) + S[2*j+9];
       }
-   out[0] = get_byte(3, A); out[1] = get_byte(2, A);
-   out[2] = get_byte(1, A); out[3] = get_byte(0, A);
-   out[4] = get_byte(3, B); out[5] = get_byte(2, B);
-   out[6] = get_byte(1, B); out[7] = get_byte(0, B);
+
+   store_le(out, A, B);
    }
 
 /*************************************************
@@ -40,8 +38,8 @@ void RC5::enc(const byte in[], byte out[]) const
 *************************************************/
 void RC5::dec(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[3], in[2], in[1], in[0]),
-          B = make_u32bit(in[7], in[6], in[5], in[4]);
+   u32bit A = load_le<u32bit>(in, 0), B = load_le<u32bit>(in, 1);
+
    for(u32bit j = ROUNDS; j != 0; j -= 4)
       {
       B = rotate_right(B - S[2*j+1], A % 32) ^ A;
@@ -54,10 +52,8 @@ void RC5::dec(const byte in[], byte out[]) const
       A = rotate_right(A - S[2*j-6], B % 32) ^ B;
       }
    B -= S[1]; A -= S[0];
-   out[0] = get_byte(3, A); out[1] = get_byte(2, A);
-   out[2] = get_byte(1, A); out[3] = get_byte(0, A);
-   out[4] = get_byte(3, B); out[5] = get_byte(2, B);
-   out[6] = get_byte(1, B); out[7] = get_byte(0, B);
+
+   store_le(out, A, B);
    }
 
 /*************************************************
diff --git a/src/rc6.cpp b/src/rc6.cpp
index 622cb3f16..e7c8a4725 100644
--- a/src/rc6.cpp
+++ b/src/rc6.cpp
@@ -14,10 +14,10 @@ namespace Botan {
 *************************************************/
 void RC6::enc(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]),
-          B = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]),
-          C = make_u32bit(in[11], in[10], in[ 9], in[ 8]),
-          D = make_u32bit(in[15], in[14], in[13], in[12]);
+   u32bit A = load_le<u32bit>(in, 0);
+   u32bit B = load_le<u32bit>(in, 1);
+   u32bit C = load_le<u32bit>(in, 2);
+   u32bit D = load_le<u32bit>(in, 3);
 
    B += S[0]; D += S[1];
 
@@ -48,14 +48,7 @@ void RC6::enc(const byte in[], byte out[]) const
 
    A += S[42]; C += S[43];
 
-   out[ 0] = get_byte(3, A); out[ 1] = get_byte(2, A);
-   out[ 2] = get_byte(1, A); out[ 3] = get_byte(0, A);
-   out[ 4] = get_byte(3, B); out[ 5] = get_byte(2, B);
-   out[ 6] = get_byte(1, B); out[ 7] = get_byte(0, B);
-   out[ 8] = get_byte(3, C); out[ 9] = get_byte(2, C);
-   out[10] = get_byte(1, C); out[11] = get_byte(0, C);
-   out[12] = get_byte(3, D); out[13] = get_byte(2, D);
-   out[14] = get_byte(1, D); out[15] = get_byte(0, D);
+   store_le(out, A, B, C, D);
    }
 
 /*************************************************
@@ -63,10 +56,10 @@ void RC6::enc(const byte in[], byte out[]) const
 *************************************************/
 void RC6::dec(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]),
-          B = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]),
-          C = make_u32bit(in[11], in[10], in[ 9], in[ 8]),
-          D = make_u32bit(in[15], in[14], in[13], in[12]);
+   u32bit A = load_le<u32bit>(in, 0);
+   u32bit B = load_le<u32bit>(in, 1);
+   u32bit C = load_le<u32bit>(in, 2);
+   u32bit D = load_le<u32bit>(in, 3);
 
    C -= S[43]; A -= S[42];
 
@@ -97,14 +90,7 @@ void RC6::dec(const byte in[], byte out[]) const
 
    D -= S[1]; B -= S[0];
 
-   out[ 0] = get_byte(3, A); out[ 1] = get_byte(2, A);
-   out[ 2] = get_byte(1, A); out[ 3] = get_byte(0, A);
-   out[ 4] = get_byte(3, B); out[ 5] = get_byte(2, B);
-   out[ 6] = get_byte(1, B); out[ 7] = get_byte(0, B);
-   out[ 8] = get_byte(3, C); out[ 9] = get_byte(2, C);
-   out[10] = get_byte(1, C); out[11] = get_byte(0, C);
-   out[12] = get_byte(3, D); out[13] = get_byte(2, D);
-   out[14] = get_byte(1, D); out[15] = get_byte(0, D);
+   store_le(out, A, B, C, D);
    }
 
 /*************************************************
diff --git a/src/rmd128.cpp b/src/rmd128.cpp
index 1614de3a2..ea7c11342 100644
--- a/src/rmd128.cpp
+++ b/src/rmd128.cpp
@@ -58,7 +58,7 @@ inline void F4(u32bit& A, u32bit B, u32bit C, u32bit D,
 void RIPEMD_128::hash(const byte input[])
    {
    for(u32bit j = 0; j != 16; ++j)
-      M[j] = make_u32bit(input[4*j+3], input[4*j+2], input[4*j+1], input[4*j]);
+      M[j] = load_le<u32bit>(input, j);
 
    u32bit A1 = digest[0], A2 = A1, B1 = digest[1], B2 = B1,
           C1 = digest[2], C2 = C1, D1 = digest[3], D2 = D1;
@@ -145,8 +145,8 @@ void RIPEMD_128::hash(const byte input[])
 *************************************************/
 void RIPEMD_128::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(3 - (j % 4), digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_le(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/rmd160.cpp b/src/rmd160.cpp
index 6cc80d999..e092b19a2 100644
--- a/src/rmd160.cpp
+++ b/src/rmd160.cpp
@@ -73,7 +73,7 @@ inline void F5(u32bit& A, u32bit B, u32bit& C, u32bit D, u32bit E,
 void RIPEMD_160::hash(const byte input[])
    {
    for(u32bit j = 0; j != 16; ++j)
-      M[j] = make_u32bit(input[4*j+3], input[4*j+2], input[4*j+1], input[4*j]);
+      M[j] = load_le<u32bit>(input, j);
 
    u32bit A1 = digest[0], A2 = A1, B1 = digest[1], B2 = B1,
           C1 = digest[2], C2 = C1, D1 = digest[3], D2 = D1,
@@ -179,8 +179,8 @@ void RIPEMD_160::hash(const byte input[])
 *************************************************/
 void RIPEMD_160::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(3 - (j % 4), digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_le(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/seed.cpp b/src/seed.cpp
index 84d033c06..9ed05b28f 100644
--- a/src/seed.cpp
+++ b/src/seed.cpp
@@ -22,10 +22,10 @@ u32bit SEED::G_FUNC::operator()(u32bit X) const
 *************************************************/
 void SEED::enc(const byte in[], byte out[]) const
    {
-   u32bit B0 = make_u32bit(in[ 0], in[ 1], in[ 2], in[ 3]),
-          B1 = make_u32bit(in[ 4], in[ 5], in[ 6], in[ 7]),
-          B2 = make_u32bit(in[ 8], in[ 9], in[10], in[11]),
-          B3 = make_u32bit(in[12], in[13], in[14], in[15]);
+   u32bit B0 = load_be<u32bit>(in, 0);
+   u32bit B1 = load_be<u32bit>(in, 1);
+   u32bit B2 = load_be<u32bit>(in, 2);
+   u32bit B3 = load_be<u32bit>(in, 3);
 
    G_FUNC G;
 
@@ -48,14 +48,7 @@ void SEED::enc(const byte in[], byte out[]) const
       B2 ^= T0 + T1;
       }
 
-   out[ 0] = get_byte(0, B2); out[ 1] = get_byte(1, B2);
-   out[ 2] = get_byte(2, B2); out[ 3] = get_byte(3, B2);
-   out[ 4] = get_byte(0, B3); out[ 5] = get_byte(1, B3);
-   out[ 6] = get_byte(2, B3); out[ 7] = get_byte(3, B3);
-   out[ 8] = get_byte(0, B0); out[ 9] = get_byte(1, B0);
-   out[10] = get_byte(2, B0); out[11] = get_byte(3, B0);
-   out[12] = get_byte(0, B1); out[13] = get_byte(1, B1);
-   out[14] = get_byte(2, B1); out[15] = get_byte(3, B1);
+   store_be(out, B2, B3, B0, B1);
    }
 
 /*************************************************
@@ -63,10 +56,10 @@ void SEED::enc(const byte in[], byte out[]) const
 *************************************************/
 void SEED::dec(const byte in[], byte out[]) const
    {
-   u32bit B0 = make_u32bit(in[ 0], in[ 1], in[ 2], in[ 3]),
-          B1 = make_u32bit(in[ 4], in[ 5], in[ 6], in[ 7]),
-          B2 = make_u32bit(in[ 8], in[ 9], in[10], in[11]),
-          B3 = make_u32bit(in[12], in[13], in[14], in[15]);
+   u32bit B0 = load_be<u32bit>(in, 0);
+   u32bit B1 = load_be<u32bit>(in, 1);
+   u32bit B2 = load_be<u32bit>(in, 2);
+   u32bit B3 = load_be<u32bit>(in, 3);
 
    G_FUNC G;
 
@@ -89,14 +82,7 @@ void SEED::dec(const byte in[], byte out[]) const
       B2 ^= T0 + T1;
       }
 
-   out[ 0] = get_byte(0, B2); out[ 1] = get_byte(1, B2);
-   out[ 2] = get_byte(2, B2); out[ 3] = get_byte(3, B2);
-   out[ 4] = get_byte(0, B3); out[ 5] = get_byte(1, B3);
-   out[ 6] = get_byte(2, B3); out[ 7] = get_byte(3, B3);
-   out[ 8] = get_byte(0, B0); out[ 9] = get_byte(1, B0);
-   out[10] = get_byte(2, B0); out[11] = get_byte(3, B0);
-   out[12] = get_byte(0, B1); out[13] = get_byte(1, B1);
-   out[14] = get_byte(2, B1); out[15] = get_byte(3, B1);
+   store_be(out, B2, B3, B0, B1);
    }
 
 /*************************************************
@@ -114,7 +100,7 @@ void SEED::key(const byte key[], u32bit)
    SecureBuffer<u32bit, 4> WK;
 
    for(u32bit j = 0; j != 4; ++j)
-      WK[j] = make_u32bit(key[4*j], key[4*j+1], key[4*j+2], key[4*j+3]);
+      WK[j] = load_be<u32bit>(key, j);
 
    G_FUNC G;
 
diff --git a/src/serpent.cpp b/src/serpent.cpp
index d17c8d0a7..6bd7132a8 100644
--- a/src/serpent.cpp
+++ b/src/serpent.cpp
@@ -242,10 +242,11 @@ inline void i_transform(u32bit& B0, u32bit& B1, u32bit& B2, u32bit& B3)
 *************************************************/
 void Serpent::enc(const byte in[], byte out[]) const
    {
-   u32bit B0 = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]),
-          B1 = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]),
-          B2 = make_u32bit(in[11], in[10], in[ 9], in[ 8]),
-          B3 = make_u32bit(in[15], in[14], in[13], in[12]);
+   u32bit B0 = load_le<u32bit>(in, 0);
+   u32bit B1 = load_le<u32bit>(in, 1);
+   u32bit B2 = load_le<u32bit>(in, 2);
+   u32bit B3 = load_le<u32bit>(in, 3);
+
    key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor( 2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
@@ -278,14 +279,8 @@ void Serpent::enc(const byte in[], byte out[]) const
    key_xor(29,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
-   out[ 0] = get_byte(3, B0); out[ 1] = get_byte(2, B0);
-   out[ 2] = get_byte(1, B0); out[ 3] = get_byte(0, B0);
-   out[ 4] = get_byte(3, B1); out[ 5] = get_byte(2, B1);
-   out[ 6] = get_byte(1, B1); out[ 7] = get_byte(0, B1);
-   out[ 8] = get_byte(3, B2); out[ 9] = get_byte(2, B2);
-   out[10] = get_byte(1, B2); out[11] = get_byte(0, B2);
-   out[12] = get_byte(3, B3); out[13] = get_byte(2, B3);
-   out[14] = get_byte(1, B3); out[15] = get_byte(0, B3);
+
+   store_le(out, B0, B1, B2, B3);
    }
 
 /*************************************************
@@ -293,10 +288,11 @@ void Serpent::enc(const byte in[], byte out[]) const
 *************************************************/
 void Serpent::dec(const byte in[], byte out[]) const
    {
-   u32bit B0 = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]),
-          B1 = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]),
-          B2 = make_u32bit(in[11], in[10], in[ 9], in[ 8]),
-          B3 = make_u32bit(in[15], in[14], in[13], in[12]);
+   u32bit B0 = load_le<u32bit>(in, 0);
+   u32bit B1 = load_le<u32bit>(in, 1);
+   u32bit B2 = load_le<u32bit>(in, 2);
+   u32bit B3 = load_le<u32bit>(in, 3);
+
    key_xor(32,B0,B1,B2,B3);  SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3);
    i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3);
    i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3);
@@ -329,14 +325,8 @@ void Serpent::dec(const byte in[], byte out[]) const
    i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3);
    i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3);
    i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3);
-   out[ 0] = get_byte(3, B0); out[ 1] = get_byte(2, B0);
-   out[ 2] = get_byte(1, B0); out[ 3] = get_byte(0, B0);
-   out[ 4] = get_byte(3, B1); out[ 5] = get_byte(2, B1);
-   out[ 6] = get_byte(1, B1); out[ 7] = get_byte(0, B1);
-   out[ 8] = get_byte(3, B2); out[ 9] = get_byte(2, B2);
-   out[10] = get_byte(1, B2); out[11] = get_byte(0, B2);
-   out[12] = get_byte(3, B3); out[13] = get_byte(2, B3);
-   out[14] = get_byte(1, B3); out[15] = get_byte(0, B3);
+
+   store_le(out, B0, B1, B2, B3);
    }
 
 /*************************************************
@@ -348,7 +338,8 @@ void Serpent::key(const byte key[], u32bit length)
 
    SecureBuffer<u32bit, 140> W;
    for(u32bit j = 0; j != length / 4; ++j)
-      W[j] = make_u32bit(key[4*j+3], key[4*j+2], key[4*j+1], key[4*j]);
+      W[j] = load_le<u32bit>(key, j);
+
    W[length / 4] |= u32bit(1) << ((length%4)*8);
    for(u32bit j = 8; j != 140; ++j)
       W[j] = rotate_left(W[j-8] ^ W[j-5] ^ W[j-3] ^ W[j-1] ^ PHI ^ (j-8), 11);
diff --git a/src/sha160.cpp b/src/sha160.cpp
index 9f6ba6960..7581f3ea0 100644
--- a/src/sha160.cpp
+++ b/src/sha160.cpp
@@ -54,7 +54,7 @@ inline void F4(u32bit A, u32bit& B, u32bit C, u32bit D, u32bit& E, u32bit msg)
 void SHA_160::hash(const byte input[])
    {
    for(u32bit j = 0; j != 16; ++j)
-      W[j] = make_u32bit(input[4*j], input[4*j+1], input[4*j+2], input[4*j+3]);
+      W[j] = load_be<u32bit>(input, j);
    for(u32bit j = 16; j != 80; ++j)
       W[j] = rotate_left((W[j-3] ^ W[j-8] ^ W[j-14] ^ W[j-16]), 1);
 
@@ -102,8 +102,8 @@ void SHA_160::hash(const byte input[])
 *************************************************/
 void SHA_160::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(j % 4, digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_be(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/sha256.cpp b/src/sha256.cpp
index 1a98d4560..ae9849a57 100644
--- a/src/sha256.cpp
+++ b/src/sha256.cpp
@@ -47,7 +47,7 @@ inline void F1(u32bit A, u32bit B, u32bit C, u32bit& D,
 void SHA_256::hash(const byte input[])
    {
    for(u32bit j = 0; j != 16; ++j)
-      W[j] = make_u32bit(input[4*j], input[4*j+1], input[4*j+2], input[4*j+3]);
+      W[j] = load_be<u32bit>(input, j);
    for(u32bit j = 16; j != 64; ++j)
       W[j] = sigma(W[j- 2], 17, 19, 10) + W[j- 7] +
              sigma(W[j-15],  7, 18,  3) + W[j-16];
@@ -99,8 +99,8 @@ void SHA_256::hash(const byte input[])
 *************************************************/
 void SHA_256::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(j % 4, digest[j/4]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 4)
+      store_be(digest[j/4], output + j);
    }
 
 /*************************************************
diff --git a/src/skipjack.cpp b/src/skipjack.cpp
index 35d0e6010..969841b53 100644
--- a/src/skipjack.cpp
+++ b/src/skipjack.cpp
@@ -13,8 +13,10 @@ namespace Botan {
 *************************************************/
 void Skipjack::enc(const byte in[], byte out[]) const
    {
-   u16bit W1 = make_u16bit(in[7], in[6]), W2 = make_u16bit(in[5], in[4]),
-          W3 = make_u16bit(in[3], in[2]), W4 = make_u16bit(in[1], in[0]);
+   u16bit W1 = load_le<u16bit>(in, 3);
+   u16bit W2 = load_le<u16bit>(in, 2);
+   u16bit W3 = load_le<u16bit>(in, 1);
+   u16bit W4 = load_le<u16bit>(in, 0);
 
    step_A(W1,W4, 1); step_A(W4,W3, 2); step_A(W3,W2, 3); step_A(W2,W1, 4);
    step_A(W1,W4, 5); step_A(W4,W3, 6); step_A(W3,W2, 7); step_A(W2,W1, 8);
@@ -28,10 +30,7 @@ void Skipjack::enc(const byte in[], byte out[]) const
    step_B(W1,W2,25); step_B(W4,W1,26); step_B(W3,W4,27); step_B(W2,W3,28);
    step_B(W1,W2,29); step_B(W4,W1,30); step_B(W3,W4,31); step_B(W2,W3,32);
 
-   out[0] = get_byte(1, W4); out[1] = get_byte(0, W4);
-   out[2] = get_byte(1, W3); out[3] = get_byte(0, W3);
-   out[4] = get_byte(1, W2); out[5] = get_byte(0, W2);
-   out[6] = get_byte(1, W1); out[7] = get_byte(0, W1);
+   store_le(out, W4, W3, W2, W1);
    }
 
 /*************************************************
@@ -39,8 +38,10 @@ void Skipjack::enc(const byte in[], byte out[]) const
 *************************************************/
 void Skipjack::dec(const byte in[], byte out[]) const
    {
-   u16bit W1 = make_u16bit(in[7], in[6]), W2 = make_u16bit(in[5], in[4]),
-          W3 = make_u16bit(in[3], in[2]), W4 = make_u16bit(in[1], in[0]);
+   u16bit W1 = load_le<u16bit>(in, 3);
+   u16bit W2 = load_le<u16bit>(in, 2);
+   u16bit W3 = load_le<u16bit>(in, 1);
+   u16bit W4 = load_le<u16bit>(in, 0);
 
    step_Bi(W2,W3,32); step_Bi(W3,W4,31); step_Bi(W4,W1,30); step_Bi(W1,W2,29);
    step_Bi(W2,W3,28); step_Bi(W3,W4,27); step_Bi(W4,W1,26); step_Bi(W1,W2,25);
@@ -54,10 +55,7 @@ void Skipjack::dec(const byte in[], byte out[]) const
    step_Ai(W1,W2, 8); step_Ai(W2,W3, 7); step_Ai(W3,W4, 6); step_Ai(W4,W1, 5);
    step_Ai(W1,W2, 4); step_Ai(W2,W3, 3); step_Ai(W3,W4, 2); step_Ai(W4,W1, 1);
 
-   out[0] = get_byte(1, W4); out[1] = get_byte(0, W4);
-   out[2] = get_byte(1, W3); out[3] = get_byte(0, W3);
-   out[4] = get_byte(1, W2); out[5] = get_byte(0, W2);
-   out[6] = get_byte(1, W1); out[7] = get_byte(0, W1);
+   store_le(out, W4, W3, W2, W1);
    }
 
 /*************************************************
diff --git a/src/square.cpp b/src/square.cpp
index 7d7cf1da5..988e56ef5 100644
--- a/src/square.cpp
+++ b/src/square.cpp
@@ -117,7 +117,7 @@ void Square::key(const byte key[], u32bit)
    {
    SecureBuffer<u32bit, 36> XEK, XDK;
    for(u32bit j = 0; j != 4; ++j)
-      XEK[j] = make_u32bit(key[4*j], key[4*j+1], key[4*j+2], key[4*j+3]);
+      XEK[j] = load_be<u32bit>(key, j);
    for(u32bit j = 0; j != 8; ++j)
       {
       XEK[4*j+4] = XEK[4*j  ] ^ rotate_left(XEK[4*j+3], 8) ^ (0x01000000 << j);
@@ -149,27 +149,25 @@ void Square::transform(u32bit round_key[4])
       { 0x03, 0x02, 0x01, 0x01 },
       { 0x01, 0x03, 0x02, 0x01 },
       { 0x01, 0x01, 0x03, 0x02 } };
-   SecureBuffer<byte, 4> A[4], B[4];
-   for(u32bit j = 0; j != 4; ++j)
-      for(u32bit k = 0; k != 4; ++k)
-         A[j][k] = get_byte(k, round_key[j]);
+
    for(u32bit j = 0; j != 4; ++j)
+      {
+      SecureBuffer<byte, 4> A, B;
+
+      store_be(round_key[j], A);
+
       for(u32bit k = 0; k != 4; ++k)
          for(u32bit l = 0; l != 4; ++l)
-            B[j][k] ^= mul(A[j][l], G[l][k]);
-   for(u32bit j = 0; j != 4; ++j)
-      round_key[j] = make_u32bit(B[j][0], B[j][1], B[j][2], B[j][3]);
-   }
+            {
+            const byte a = A[l];
+            const byte b = G[l][k];
 
-/*************************************************
-* Multiply in GF(2^8)                            *
-*************************************************/
-byte Square::mul(byte a, byte b)
-   {
-   if(a && b)
-      return ALog[(Log[a] + Log[b]) % 255];
-   else
-      return 0;
+            if(a && b)
+               B[k] ^= ALog[(Log[a] + Log[b]) % 255];
+            }
+
+      round_key[j] = load_be<u32bit>(B.begin(), 0);
+      }
    }
 
 /*************************************************
diff --git a/src/tea.cpp b/src/tea.cpp
index 9b04aba11..aa04b1df8 100644
--- a/src/tea.cpp
+++ b/src/tea.cpp
@@ -13,8 +13,8 @@ namespace Botan {
 *************************************************/
 void TEA::enc(const byte in[], byte out[]) const
    {
-   u32bit left  = make_u32bit(in[0], in[1], in[2], in[3]),
-          right = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit left = load_be<u32bit>(in, 0), right = load_be<u32bit>(in, 1);
+
    u32bit sum = 0;
    for(u32bit j = 0; j != 32; ++j)
       {
@@ -22,10 +22,8 @@ void TEA::enc(const byte in[], byte out[]) const
       left  += ((right << 4) + K[0]) ^ (right + sum) ^ ((right >> 5) + K[1]);
       right += ((left  << 4) + K[2]) ^ (left  + sum) ^ ((left  >> 5) + K[3]);
       }
-   out[0] = get_byte(0, left);  out[1] = get_byte(1, left);
-   out[2] = get_byte(2, left);  out[3] = get_byte(3, left);
-   out[4] = get_byte(0, right); out[5] = get_byte(1, right);
-   out[6] = get_byte(2, right); out[7] = get_byte(3, right);
+
+   store_be(out, left, right);
    }
 
 /*************************************************
@@ -33,8 +31,8 @@ void TEA::enc(const byte in[], byte out[]) const
 *************************************************/
 void TEA::dec(const byte in[], byte out[]) const
    {
-   u32bit left  = make_u32bit(in[0], in[1], in[2], in[3]),
-          right = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit left = load_be<u32bit>(in, 0), right = load_be<u32bit>(in, 1);
+
    u32bit sum = 0xC6EF3720;
    for(u32bit j = 0; j != 32; ++j)
       {
@@ -42,10 +40,8 @@ void TEA::dec(const byte in[], byte out[]) const
       left  -= ((right << 4) + K[0]) ^ (right + sum) ^ ((right >> 5) + K[1]);
       sum   -= 0x9E3779B9;
       }
-   out[0] = get_byte(0, left);  out[1] = get_byte(1, left);
-   out[2] = get_byte(2, left);  out[3] = get_byte(3, left);
-   out[4] = get_byte(0, right); out[5] = get_byte(1, right);
-   out[6] = get_byte(2, right); out[7] = get_byte(3, right);
+
+   store_be(out, left, right);
    }
 
 /*************************************************
@@ -54,7 +50,7 @@ void TEA::dec(const byte in[], byte out[]) const
 void TEA::key(const byte key[], u32bit)
    {
    for(u32bit j = 0; j != 4; ++j)
-      K[j] = make_u32bit(key[4*j], key[4*j+1], key[4*j+2], key[4*j+3]);
+      K[j] = load_be<u32bit>(key, j);
    }
 
 }
diff --git a/src/tiger.cpp b/src/tiger.cpp
index 3df507853..a4dd657b2 100644
--- a/src/tiger.cpp
+++ b/src/tiger.cpp
@@ -15,9 +15,8 @@ namespace Botan {
 void Tiger::hash(const byte input[])
    {
    for(u32bit j = 0; j != 8; ++j)
-      X[j] = make_u64bit(input[8*j+7], input[8*j+6], input[8*j+5],
-                         input[8*j+4], input[8*j+3], input[8*j+2],
-                         input[8*j+1], input[8*j]);
+      X[j] = load_le<u64bit>(input, j);
+
    u64bit A = digest[0], B = digest[1], C = digest[2];
 
    pass(A, B, C, X, 5); mix(X);
diff --git a/src/twofish.cpp b/src/twofish.cpp
index 08d2e93a6..25359f635 100644
--- a/src/twofish.cpp
+++ b/src/twofish.cpp
@@ -13,12 +13,10 @@ namespace Botan {
 *************************************************/
 void Twofish::enc(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]),
-          B = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]),
-          C = make_u32bit(in[11], in[10], in[ 9], in[ 8]),
-          D = make_u32bit(in[15], in[14], in[13], in[12]);
-
-   A ^= round_key[0]; B ^= round_key[1]; C ^= round_key[2]; D ^= round_key[3];
+   u32bit A = load_le<u32bit>(in, 0) ^ round_key[0];
+   u32bit B = load_le<u32bit>(in, 1) ^ round_key[1];
+   u32bit C = load_le<u32bit>(in, 2) ^ round_key[2];
+   u32bit D = load_le<u32bit>(in, 3) ^ round_key[3];
 
    for(u32bit j = 0; j != 16; j += 2)
       {
@@ -47,16 +45,12 @@ void Twofish::enc(const byte in[], byte out[]) const
       B = rotate_left(B, 1) ^ Y;
       }
 
-   C ^= round_key[4]; D ^= round_key[5]; A ^= round_key[6]; B ^= round_key[7];
+   C ^= round_key[4];
+   D ^= round_key[5];
+   A ^= round_key[6];
+   B ^= round_key[7];
 
-   out[ 0] = get_byte(3, C); out[ 1] = get_byte(2, C);
-   out[ 2] = get_byte(1, C); out[ 3] = get_byte(0, C);
-   out[ 4] = get_byte(3, D); out[ 5] = get_byte(2, D);
-   out[ 6] = get_byte(1, D); out[ 7] = get_byte(0, D);
-   out[ 8] = get_byte(3, A); out[ 9] = get_byte(2, A);
-   out[10] = get_byte(1, A); out[11] = get_byte(0, A);
-   out[12] = get_byte(3, B); out[13] = get_byte(2, B);
-   out[14] = get_byte(1, B); out[15] = get_byte(0, B);
+   store_le(out, C, D, A, B);
    }
 
 /*************************************************
@@ -64,12 +58,10 @@ void Twofish::enc(const byte in[], byte out[]) const
 *************************************************/
 void Twofish::dec(const byte in[], byte out[]) const
    {
-   u32bit A = make_u32bit(in[ 3], in[ 2], in[ 1], in[ 0]),
-          B = make_u32bit(in[ 7], in[ 6], in[ 5], in[ 4]),
-          C = make_u32bit(in[11], in[10], in[ 9], in[ 8]),
-          D = make_u32bit(in[15], in[14], in[13], in[12]);
-
-   A ^= round_key[4]; B ^= round_key[5]; C ^= round_key[6]; D ^= round_key[7];
+   u32bit A = load_le<u32bit>(in, 0) ^ round_key[4];
+   u32bit B = load_le<u32bit>(in, 1) ^ round_key[5];
+   u32bit C = load_le<u32bit>(in, 2) ^ round_key[6];
+   u32bit D = load_le<u32bit>(in, 3) ^ round_key[7];
 
    for(u32bit j = 0; j != 16; j += 2)
       {
@@ -100,14 +92,7 @@ void Twofish::dec(const byte in[], byte out[]) const
 
    C ^= round_key[0]; D ^= round_key[1]; A ^= round_key[2]; B ^= round_key[3];
 
-   out[ 0] = get_byte(3, C); out[ 1] = get_byte(2, C);
-   out[ 2] = get_byte(1, C); out[ 3] = get_byte(0, C);
-   out[ 4] = get_byte(3, D); out[ 5] = get_byte(2, D);
-   out[ 6] = get_byte(1, D); out[ 7] = get_byte(0, D);
-   out[ 8] = get_byte(3, A); out[ 9] = get_byte(2, A);
-   out[10] = get_byte(1, A); out[11] = get_byte(0, A);
-   out[12] = get_byte(3, B); out[13] = get_byte(2, B);
-   out[14] = get_byte(1, B); out[15] = get_byte(0, B);
+   store_le(out, C, D, A, B);
    }
 
 /*************************************************
diff --git a/src/whrlpool.cpp b/src/whrlpool.cpp
index 48cd79e5f..960095d9b 100644
--- a/src/whrlpool.cpp
+++ b/src/whrlpool.cpp
@@ -22,8 +22,7 @@ void Whirlpool::hash(const byte in[])
    };
 
    for(u32bit j = 0; j != 8; ++j)
-      M[j] = make_u64bit(in[8*j+0], in[8*j+1], in[8*j+2], in[8*j+3],
-                         in[8*j+4], in[8*j+5], in[8*j+6], in[8*j+7]);
+      M[j] = load_be<u64bit>(in, j);
 
    u64bit K0, K1, K2, K3, K4, K5, K6, K7;
    K0 = digest[0]; K1 = digest[1]; K2 = digest[2]; K3 = digest[3];
@@ -124,8 +123,8 @@ void Whirlpool::hash(const byte in[])
 *************************************************/
 void Whirlpool::copy_out(byte output[])
    {
-   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
-      output[j] = get_byte(j % 8, digest[j/8]);
+   for(u32bit j = 0; j != OUTPUT_LENGTH; j += 8)
+      store_be(digest[j/8], output + j);
    }
 
 /*************************************************
diff --git a/src/wid_wake.cpp b/src/wid_wake.cpp
index 6002138c7..fe3fd8dab 100644
--- a/src/wid_wake.cpp
+++ b/src/wid_wake.cpp
@@ -30,15 +30,15 @@ void WiderWake_41_BE::cipher(const byte in[], byte out[], u32bit length)
 *************************************************/
 void WiderWake_41_BE::generate(u32bit length)
    {
-   u32bit R0 = state[0], R1 = state[1], R2 = state[2],
-          R3 = state[3], R4 = state[4];
+   u32bit R0 = state[0], R1 = state[1],
+          R2 = state[2], R3 = state[3],
+          R4 = state[4];
 
    for(u32bit j = 0; j != length; j += 8)
       {
       u32bit R0a;
 
-      buffer[j+0] = get_byte(0, R3); buffer[j+1] = get_byte(1, R3);
-      buffer[j+2] = get_byte(2, R3); buffer[j+3] = get_byte(3, R3);
+      store_be(R3, buffer + j);
 
       R0a = R4 + R3; R3 += R2; R2 += R1; R1 += R0;
       R0a = (R0a >> 8) ^ T[(R0a & 0xFF)];
@@ -47,8 +47,7 @@ void WiderWake_41_BE::generate(u32bit length)
       R3  = (R3  >> 8) ^ T[(R3  & 0xFF)];
       R4 = R0; R0 = R0a;
 
-      buffer[j+4] = get_byte(0, R3); buffer[j+5] = get_byte(1, R3);
-      buffer[j+6] = get_byte(2, R3); buffer[j+7] = get_byte(3, R3);
+      store_be(R3, buffer + j + 4);
 
       R0a = R4 + R3; R3 += R2; R2 += R1; R1 += R0;
       R0a = (R0a >> 8) ^ T[(R0a & 0xFF)];
@@ -57,7 +56,13 @@ void WiderWake_41_BE::generate(u32bit length)
       R3  = (R3  >> 8) ^ T[(R3  & 0xFF)];
       R4 = R0; R0 = R0a;
       }
-   state[0] = R0; state[1] = R1; state[2] = R2; state[3] = R3; state[4] = R4;
+
+   state[0] = R0;
+   state[1] = R1;
+   state[2] = R2;
+   state[3] = R3;
+   state[4] = R4;
+
    position = 0;
    }
 
@@ -67,7 +72,7 @@ void WiderWake_41_BE::generate(u32bit length)
 void WiderWake_41_BE::key(const byte key[], u32bit)
    {
    for(u32bit j = 0; j != 4; ++j)
-      t_key[j] = make_u32bit(key[4*j], key[4*j+1], key[4*j+2], key[4*j+3]);
+      t_key[j] = load_be<u32bit>(key, j);
 
    static const u32bit MAGIC[8] = {
       0x726A8F3B, 0xE69A3B5C, 0xD3C71FE5, 0xAB3C73D2,
@@ -116,9 +121,9 @@ void WiderWake_41_BE::resync(const byte iv[], u32bit length)
 
    for(u32bit j = 0; j != 4; ++j)
       state[j] = t_key[j];
-   state[4] = make_u32bit(iv[0], iv[1], iv[2], iv[3]);
+   state[4] = load_be<u32bit>(iv, 0);
    state[0] ^= state[4];
-   state[2] ^= make_u32bit(iv[4], iv[5], iv[6], iv[7]);
+   state[2] ^= load_be<u32bit>(iv, 1);
 
    generate(8*4);
    generate(buffer.size());
diff --git a/src/xtea.cpp b/src/xtea.cpp
index 06626cfda..5ecf3054e 100644
--- a/src/xtea.cpp
+++ b/src/xtea.cpp
@@ -14,17 +14,15 @@ namespace Botan {
 *************************************************/
 void XTEA::enc(const byte in[], byte out[]) const
    {
-   u32bit left  = make_u32bit(in[0], in[1], in[2], in[3]),
-          right = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
+
    for(u32bit j = 0; j != 32; ++j)
       {
-      left  += (((right << 4) ^ (right >> 5)) + right) ^ EK[2*j];
-      right += (((left  << 4) ^ (left  >> 5)) +  left) ^ EK[2*j+1];
+      L += (((R << 4) ^ (R >> 5)) + R) ^ EK[2*j];
+      R += (((L << 4) ^ (L >> 5)) + L) ^ EK[2*j+1];
       }
-   out[0] = get_byte(0, left);  out[1] = get_byte(1, left);
-   out[2] = get_byte(2, left);  out[3] = get_byte(3, left);
-   out[4] = get_byte(0, right); out[5] = get_byte(1, right);
-   out[6] = get_byte(2, right); out[7] = get_byte(3, right);
+
+   store_be(out, L, R);
    }
 
 /*************************************************
@@ -32,17 +30,15 @@ void XTEA::enc(const byte in[], byte out[]) const
 *************************************************/
 void XTEA::dec(const byte in[], byte out[]) const
    {
-   u32bit left  = make_u32bit(in[0], in[1], in[2], in[3]),
-          right = make_u32bit(in[4], in[5], in[6], in[7]);
+   u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
+
    for(u32bit j = 32; j > 0; --j)
       {
-      right -= (((left  << 4) ^ (left  >> 5)) +  left) ^ EK[2*j - 1];
-      left  -= (((right << 4) ^ (right >> 5)) + right) ^ EK[2*j - 2];
+      R -= (((L << 4) ^ (L >> 5)) + L) ^ EK[2*j - 1];
+      L -= (((R << 4) ^ (R >> 5)) + R) ^ EK[2*j - 2];
       }
-   out[0] = get_byte(0, left);  out[1] = get_byte(1, left);
-   out[2] = get_byte(2, left);  out[3] = get_byte(3, left);
-   out[4] = get_byte(0, right); out[5] = get_byte(1, right);
-   out[6] = get_byte(2, right); out[7] = get_byte(3, right);
+
+   store_be(out, L, R);
    }
 
 /*************************************************
@@ -73,7 +69,8 @@ void XTEA::key(const byte key[], u32bit)
 
    SecureBuffer<u32bit, 4> UK;
    for(u32bit j = 0; j != 4; ++j)
-      UK[j] = make_u32bit(key[4*j], key[4*j+1], key[4*j+2], key[4*j+3]);
+      UK[j] = load_be<u32bit>(key, j);
+
    for(u32bit j = 0; j != 64; ++j)
       EK[j] = DELTAS[j] + UK[KEY_INDEX[j]];
    }
author	lloyd <[email protected]>	2007-05-31 03:25:19 +0000
committer	lloyd <[email protected]>	2007-05-31 03:25:19 +0000
commit	55608e7dd1aa593944f967f2549564e4f42b654e (patch)
tree	ec2ec03a762a6dac82eb608487d5394370135624
parent	22ecdc45a0efa4c444d0b7010b7cd743aeb68c57 (diff)