1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
/*
* Byte Swapping Operations
* (C) 1999-2011 Jack Lloyd
* (C) 2007 Yves Jerschow
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
#ifndef BOTAN_BYTE_SWAP_H_
#define BOTAN_BYTE_SWAP_H_
#include <botan/types.h>
#include <botan/rotate.h>
#if defined(BOTAN_TARGET_CPU_HAS_SSE2) && !defined(BOTAN_NO_SSE_INTRINSICS)
#include <emmintrin.h>
#endif
#if defined(_MSC_VER)
#include <stdlib.h>
#endif
namespace Botan {
/**
* Reverse the byte order of a 16 bit integer
* @param val the value to byte swap
* @return val with its two bytes exchanged
*/
inline uint16_t reverse_bytes(uint16_t val)
{
   // With only two bytes, a rotation by 8 bits trades their positions
   const uint16_t swapped = rotate_left(val, 8);
   return swapped;
}
/**
* Reverse the byte order of a 32 bit integer
*
* The implementation is selected at compile time: a compiler
* intrinsic or a single-instruction inline asm sequence where one is
* known to be available, otherwise a portable rotate-and-mask version.
*
* @param val the value to byte swap
* @return val with its four bytes in reverse order
*/
inline uint32_t reverse_bytes(uint32_t val)
{
#if BOTAN_GCC_VERSION >= 430 && !defined(BOTAN_TARGET_ARCH_IS_ARM32)
   /*
   GCC intrinsic added in 4.3, works for a number of CPUs.
   However avoid it under ARM, as there it branches to a function in
   libgcc instead of generating inline asm, so it is slower even than
   the generic rotate version below.
   */
   return __builtin_bswap32(val);
#elif defined(BOTAN_USE_GCC_INLINE_ASM) && defined(BOTAN_TARGET_CPU_IS_X86_FAMILY)
   // GCC-style inline assembly for x86 or x86-64; val is swapped in place
   asm("bswapl %0" : "=r" (val) : "0" (val));
   return val;
#elif defined(BOTAN_USE_GCC_INLINE_ASM) && defined(BOTAN_TARGET_ARCH_IS_ARM32)
   // GCC-style inline assembly for 32-bit ARM. r3 is used as a scratch
   // register and is therefore listed, along with the condition codes,
   // in the clobber list.
   asm ("eor r3, %1, %1, ror #16\n\t"
        "bic r3, r3, #0x00FF0000\n\t"
        "mov %0, %1, ror #8\n\t"
        "eor %0, %0, r3, lsr #8"
        : "=r" (val)
        : "0" (val)
        : "r3", "cc");
   return val;
#elif defined(_MSC_VER)
   /*
   Visual C++ byte swap intrinsic (declared in <stdlib.h>). This
   replaces an __asm block that had no return statement and relied on
   the result being left in EAX; the intrinsic returns the value
   explicitly and is not restricted to 32-bit x86 targets.
   */
   return _byteswap_ulong(val);
#else
   // Generic implementation: each rotation by 8 bits moves two of the
   // four bytes to their final position, the masks keep just those two,
   // and the OR merges both halves of the result.
   return (rotate_right(val, 8) & 0xFF00FF00) |
          (rotate_left (val, 8) & 0x00FF00FF);
#endif
}
/**
* Reverse the byte order of a 64 bit integer
* @param val the value to byte swap
* @return val with its eight bytes in reverse order
*/
inline uint64_t reverse_bytes(uint64_t val)
{
#if BOTAN_GCC_VERSION >= 430
   // Compiler intrinsic, present since GCC 4.3 for a number of CPUs
   return __builtin_bswap64(val);
#elif defined(BOTAN_USE_GCC_INLINE_ASM) && defined(BOTAN_TARGET_ARCH_IS_X86_64)
   // x86-64 only: GCC-style inline assembly, val is swapped in place
   asm("bswapq %0" : "=r" (val) : "0" (val));
   return val;
#else
   /* Portable fallback. Each 32-bit half is byte swapped with the
    * 32-bit overload, so whatever optimized path that overload picks
    * is reused here (particularly useful for 32-bit x86). The swapped
    * halves then exchange places: the old low half becomes the new
    * high half.
    */
   const uint32_t upper_swapped = reverse_bytes(static_cast<uint32_t>(val >> 32));
   const uint32_t lower_swapped = reverse_bytes(static_cast<uint32_t>(val));
   return (static_cast<uint64_t>(lower_swapped) << 32) | upper_swapped;
#endif
}
/**
* Byte swap 4 Ts in an array, in place
* @param x array of four values; each element is replaced by the
*        result of reverse_bytes() applied to it
*/
template<typename T>
inline void bswap_4(T x[4])
{
   // Walk the four elements in order, swapping each one where it sits
   for(T* elem = x; elem != x + 4; ++elem)
      *elem = reverse_bytes(*elem);
}
#if defined(BOTAN_TARGET_CPU_HAS_SSE2) && !defined(BOTAN_NO_SSE_INTRINSICS)
/**
* Byte swap 4 uint32_ts in an array, in place, using SSE2 shift and
* shuffle instructions
* @param x array of four 32-bit words; accessed with unaligned
*        load/store intrinsics
*/
template<>
inline void bswap_4(uint32_t x[4])
{
   const __m128i words = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));

   // Exchange the two bytes inside every 16-bit lane
   const __m128i high_byte_down = _mm_srli_epi16(words, 8);
   const __m128i low_byte_up = _mm_slli_epi16(words, 8);
   __m128i swapped = _mm_or_si128(high_byte_down, low_byte_up);

   // Exchange the two 16-bit lanes of every 32-bit word. One shuffle
   // only affects the low 64 bits and the other only the high 64 bits,
   // so both are needed to cover all four words.
   swapped = _mm_shufflelo_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));
   swapped = _mm_shufflehi_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));

   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), swapped);
}
#endif
}
#endif
|